Diffstat (limited to 'net/netfilter')
228 files changed, 26762 insertions, 11875 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..6cdc994fdc8a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only menu "Core Netfilter Configuration" - depends on NET && INET && NETFILTER + depends on INET && NETFILTER config NETFILTER_INGRESS bool "Netfilter ingress support" @@ -10,6 +10,17 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. +config NETFILTER_EGRESS + bool "Netfilter egress support" + default y + select NET_EGRESS + help + This allows you to classify packets before transmission using the + Netfilter infrastructure. + +config NETFILTER_SKIP_EGRESS + def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB) + config NETFILTER_NETLINK tristate @@ -19,6 +30,19 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_BPF_LINK + def_bool BPF_SYSCALL + +config NETFILTER_NETLINK_HOOK + tristate "Netfilter base hook dump support" + depends on NETFILTER_ADVANCED + depends on NF_TABLES + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + to list the base netfilter hooks via NFNETLINK. + This is helpful for debugging. + config NETFILTER_NETLINK_ACCT tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED @@ -71,12 +95,17 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. -config NF_LOG_COMMON - tristate - -config NF_LOG_NETDEV - tristate "Netdev packet logging" - select NF_LOG_COMMON +config NF_LOG_SYSLOG + tristate "Syslog packet logging" + default m if NETFILTER_ADVANCED=n + help + This option enable support for packet logging via syslog. + It supports IPv4, IPV6, ARP and common transport protocols such + as TCP and UDP. + This is a simpler but less flexible logging method compared to + CONFIG_NETFILTER_NETLINK_LOG. + If both are enabled the backend to use can be configured at run-time + by means of per-address-family sysctl tunables. if NF_CONNTRACK config NETFILTER_CONNCOUNT @@ -94,7 +123,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from @@ -118,7 +147,6 @@ config NF_CONNTRACK_ZONES config NF_CONNTRACK_PROCFS bool "Supply CT list in procfs (OBSOLETE)" - default y depends on PROC_FS help This option enables for the list of known conntrack entries @@ -164,15 +192,8 @@ config NF_CONNTRACK_LABELS to connection tracking entries. It can be used with xtables connlabel match and the nftables ct expression. -config NF_CT_PROTO_DCCP - bool 'DCCP protocol connection tracking support' - depends on NETFILTER_ADVANCED - default y - help - With this option enabled, the layer 3 independent connection - tracking code will be able to do state tracking on DCCP connections. - - If unsure, say Y. +config NF_CONNTRACK_OVS + bool config NF_CT_PROTO_GRE bool @@ -181,7 +202,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select LIBCRC32C + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. 
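The NETFILTER_EGRESS option added above exposes a per-device hook point that fires just before a packet reaches the qdisc layer. A minimal sketch of how a module might attach there, assuming the NF_NETDEV_EGRESS hook introduced by this series; the device name, priority and module boilerplate are illustrative only:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>

static unsigned int sample_egress_hook(void *priv, struct sk_buff *skb,
                                       const struct nf_hook_state *state)
{
        /* Runs for every packet leaving state->out, before the qdisc. */
        return NF_ACCEPT;
}

static struct nf_hook_ops sample_egress_ops = {
        .hook           = sample_egress_hook,
        .pf             = NFPROTO_NETDEV,
        .hooknum        = NF_NETDEV_EGRESS,
        .priority       = 0,
};

static int __init sample_egress_init(void)
{
        /* netdev-family hooks are per device, so .dev must be set; see the
         * __nf_register_net_hook() checks further down in this diff.
         */
        sample_egress_ops.dev = dev_get_by_name(&init_net, "eth0");
        if (!sample_egress_ops.dev)
                return -ENODEV;
        return nf_register_net_hook(&init_net, &sample_egress_ops);
}

static void __exit sample_egress_exit(void)
{
        nf_unregister_net_hook(&init_net, &sample_egress_ops);
        dev_put(sample_egress_ops.dev);
}

module_init(sample_egress_init);
module_exit(sample_egress_exit);
MODULE_LICENSE("GPL");
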
@@ -434,6 +455,9 @@ config NF_NAT_REDIRECT config NF_NAT_MASQUERADE bool +config NF_NAT_OVS + bool + config NETFILTER_SYNPROXY tristate @@ -441,13 +465,14 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to replace the existing {ip,ip6,arp,eb}_tables infrastructure. It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. @@ -481,6 +506,12 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_EXTHDR_DCCP + bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)" + default n + help + This option adds support for matching on DCCP extension headers. + config NFT_FLOW_OFFLOAD depends on NF_CONNTRACK && NF_FLOW_TABLE tristate "Netfilter nf_tables hardware flow offload module" @@ -488,12 +519,6 @@ config NFT_FLOW_OFFLOAD This option adds the "flow_offload" expression that you can use to choose what flows are placed into the hardware. -config NFT_COUNTER - tristate "Netfilter nf_tables counter module" - help - This option adds the "counter" expression that you can use to - include packet and byte counters in a rule. - config NFT_CONNLIMIT tristate "Netfilter nf_tables connlimit module" depends on NF_CONNTRACK @@ -548,12 +573,6 @@ config NFT_TUNNEL This option adds the "tunnel" expression that you can use to set tunneling policies. -config NFT_OBJREF - tristate "Netfilter nf_tables stateful object reference module" - help - This option adds the "objref" expression that allows you to refer to - stateful objects, such as counters and quotas. - config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" @@ -681,6 +700,16 @@ config NFT_FIB_NETDEV The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_REJECT_NETDEV + depends on NFT_REJECT_IPV4 + depends on NFT_REJECT_IPV6 + tristate "Netfilter nf_tables netdev REJECT support" + help + This option enables the REJECT support from the netdev table. + The return packet generation will be delegated to the IPv4 + or IPv6 ICMP or TCP RST implementation depending on the + protocol of the packet. + endif # NF_TABLES_NETDEV endif # NF_TABLES @@ -703,6 +732,14 @@ config NF_FLOW_TABLE To compile it as a module, choose M here. +config NF_FLOW_TABLE_PROCFS + bool "Supply flow table statistics in procfs" + depends on NF_FLOW_TABLE + depends on PROC_FS + help + This option enables for the flow table offload statistics + to be shown in procfs under net/netfilter/nf_flowtable. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -712,6 +749,25 @@ config NETFILTER_XTABLES if NETFILTER_XTABLES +config NETFILTER_XTABLES_COMPAT + bool "Netfilter Xtables 32bit support" + depends on COMPAT + help + This option provides a translation layer to run 32bit arp,ip(6),ebtables + binaries on 64bit kernels. + + If unsure, say N. 
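The NF_FLOW_TABLE_PROCFS entry above only exposes per-CPU offload counters. A small userspace sketch for reading them; the /proc path is an assumption derived from the help text ("under net/netfilter/nf_flowtable") rather than something stated in this diff:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/netfilter/nf_flowtable", "r");

        if (!f) {
                perror("nf_flowtable");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* one counter line per CPU */
        fclose(f);
        return 0;
}
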
+ +config NETFILTER_XTABLES_LEGACY + bool "Netfilter legacy tables support" + depends on !PREEMPT_RT + help + Say Y here if you still require support for legacy tables. This is + required by the legacy tools (iptables-legacy) and is not needed if + you use iptables over nftables (iptables-nft). + Legacy support is not limited to IP, it also includes EBTABLES and + ARPTABLES. + comment "Xtables combined modules" config NETFILTER_XT_MARK @@ -768,7 +824,7 @@ config NETFILTER_XT_TARGET_AUDIT config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `CHECKSUM' target, which can be used in the iptables mangle @@ -790,7 +846,7 @@ config NETFILTER_XT_TARGET_CLASSIFY the priority of a packet. Some qdiscs can use this value for classification, among these are: - atm, cbq, dsmark, pfifo_fast, htb, prio + atm, cbq, dsmark, pfifo_fast, htb, prio To compile it as a module, choose M here. If unsure, say N. @@ -819,7 +875,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_CT tristate '"CT" target support' depends on NF_CONNTRACK - depends on IP_NF_RAW || IP6_NF_RAW + depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT depends on NETFILTER_ADVANCED help This options adds a `CT' target, which allows to specify initial @@ -830,7 +886,7 @@ config NETFILTER_XT_TARGET_CT config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate @@ -846,7 +902,7 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_HL tristate '"HL" hoplimit target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds the "HL" (for IPv6) and "TTL" (for IPv4) @@ -911,8 +967,7 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" - select NF_LOG_COMMON - select NF_LOG_IPV4 + select NF_LOG_SYSLOG select NF_LOG_IPV6 if IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help @@ -1031,7 +1086,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n select NF_TPROXY_IPV4 @@ -1098,7 +1153,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a "TCPOPTSTRIP" target, which allows you to strip @@ -1131,7 +1186,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes @@ -1229,9 +1284,9 @@ config NETFILTER_XT_MATCH_CPU To compile it as a module, choose M here. If unsure, say N. 
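Since NETFILTER_XT_TARGET_LOG now selects NF_LOG_SYSLOG, the "per-address-family sysctl tunables" mentioned in the NF_LOG_SYSLOG help text earlier become the way to pick the active backend at run time. A userspace sketch; the /proc/sys path layout and the "nf_log_ipv4" logger name are assumptions based on the long-standing nf_log sysctl interface, not spelled out in this diff:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/netfilter/nf_log/2"; /* 2 == NFPROTO_IPV4 */
        char current[64] = "";
        FILE *f;

        f = fopen(path, "r");
        if (f) {
                if (fgets(current, sizeof(current), f))
                        printf("active IPv4 log backend: %s", current);
                fclose(f);
        }

        f = fopen(path, "w");           /* switching backends needs CAP_NET_ADMIN */
        if (f) {
                fputs("nf_log_ipv4\n", f);  /* assumed name of the syslog-based logger */
                fclose(f);
        }
        return 0;
}
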
config NETFILTER_XT_MATCH_DCCP - tristate '"dccp" protocol match support' + tristate '"dccp" protocol match support (DEPRECATED)' depends on NETFILTER_ADVANCED - default IP_DCCP + default n help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0e0ded87e27b..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -11,17 +11,24 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o -nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o +ifeq ($(CONFIG_NF_CONNTRACK),m) +nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o +else ifeq ($(CONFIG_NF_CONNTRACK),y) +nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o +endif obj-$(CONFIG_NETFILTER) = netfilter.o +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o +obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o @@ -48,15 +55,18 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o -# generic transport layer logging -obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o - -# packet logging for netdev family -obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o +obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o +nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o + +ifeq ($(CONFIG_NF_NAT),m) +nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o +else ifeq ($(CONFIG_NF_NAT),y) +nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o +endif # NAT helpers obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o @@ -77,7 +87,8 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ + nft_counter.o nft_objref.o nft_inner.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -88,6 +99,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o endif endif +ifdef CONFIG_NFT_CT +ifdef CONFIG_MITIGATION_RETPOLINE +nf_tables-objs += nft_ct_fast.o +endif +endif + obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o @@ -96,13 +113,12 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o -obj-$(CONFIG_NFT_OBJREF) += 
nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o -obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o @@ -125,7 +141,14 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ - nf_flow_table_offload.o + nf_flow_table_path.o \ + nf_flow_table_offload.o nf_flow_table_xdp.o +nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o +ifeq ($(CONFIG_NF_FLOW_TABLE),m) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o +else ifeq ($(CONFIG_NF_FLOW_TABLE),y) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o +endif obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o @@ -214,3 +237,6 @@ obj-$(CONFIG_IP_SET) += ipset/ # IPVS obj-$(CONFIG_IP_VS) += ipvs/ + +# lwtunnel +obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3ac7c8c1548d..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -58,7 +55,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num) if (num == 0) return NULL; - e = kvzalloc(alloc, GFP_KERNEL); + e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT); if (e) e->num_hook_entries = num; return e; @@ -119,6 +116,18 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, for (i = 0; i < old_entries; i++) { if (orig_ops[i] != &dummy_ops) alloc_entries++; + + /* Restrict BPF hook type to force a unique priority, not + * shared at attach time. + * + * This is mainly to avoid ordering issues between two + * different bpf programs, this doesn't prevent a normal + * hook at same priority as a bpf one (we don't want to + * prevent defrag, conntrack, iptables etc from attaching). 
+ */ + if (reg->priority == orig_ops[i]->priority && + reg->hook_ops_type == NF_HOOK_OP_BPF) + return ERR_PTR(-EBUSY); } } @@ -282,6 +291,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return NULL; return net->nf.hooks_bridge + hooknum; #endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_INET: + if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) + return NULL; + if (!dev || dev_net(dev) != net) { + WARN_ON_ONCE(1); + return NULL; + } + return &dev->nf_hooks_ingress; +#endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) return NULL; @@ -290,12 +309,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum)) - return NULL; - return net->nf.hooks_decnet + hooknum; -#endif default: WARN_ON_ONCE(1); return NULL; @@ -307,24 +320,106 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return &dev->nf_hooks_ingress; } #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hooknum == NF_NETDEV_EGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_egress; + } +#endif WARN_ON_ONCE(1); return NULL; } +static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, + int hooknum) +{ +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == hooknum) + return -EOPNOTSUPP; +#endif + if (reg->hooknum != hooknum || + !reg->dev || dev_net(reg->dev) != net) + return -EINVAL; + + return 0; +} + +static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, + int pf) +{ + if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || + (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) + return true; + + return false; +} + +static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, + int pf) +{ + return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; +} + +static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) +{ +#ifdef CONFIG_JUMP_LABEL + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); +#endif +} + +static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) +{ +#ifdef CONFIG_JUMP_LABEL + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); +#endif +} + static int __nf_register_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; + int err; - if (pf == NFPROTO_NETDEV) { + switch (pf) { + case NFPROTO_NETDEV: #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; #endif - if (reg->hooknum != NF_NETDEV_INGRESS || +#ifndef CONFIG_NETFILTER_EGRESS + if (reg->hooknum == NF_NETDEV_EGRESS) + return -EOPNOTSUPP; +#endif + if ((reg->hooknum != NF_NETDEV_INGRESS && + reg->hooknum != NF_NETDEV_EGRESS) || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; + break; + case NFPROTO_INET: + if (reg->hooknum != NF_INET_INGRESS) + break; + + err = nf_ingress_check(net, reg, NF_INET_INGRESS); + if (err < 0) + return err; + break; } pp = 
nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); @@ -336,21 +431,25 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS - if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif -#ifdef CONFIG_JUMP_LABEL - static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_inc_egress_queue(); #endif + nf_static_key_inc(reg, pf); + BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; @@ -403,12 +502,14 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (nf_remove_net_hook(p, reg)) { #ifdef CONFIG_NETFILTER_INGRESS - if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif -#ifdef CONFIG_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_dec_egress_queue(); #endif + nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); } @@ -425,8 +526,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf, void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + if (reg->hooknum == NF_INET_INGRESS) { + __nf_unregister_net_hook(net, NFPROTO_INET, reg); + } else { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + } } else { __nf_unregister_net_hook(net, reg->pf, reg); } @@ -451,14 +556,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) int err; if (reg->pf == NFPROTO_INET) { - err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); - if (err < 0) - return err; - - err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); - if (err < 0) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - return err; + if (reg->hooknum == NF_INET_INGRESS) { + err = __nf_register_net_hook(net, NFPROTO_INET, reg); + if (err < 0) + return err; + } else { + err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); + if (err < 0) + return err; + + err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); + if (err < 0) { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + return err; + } } } else { err = __nf_register_net_hook(net, reg->pf, reg); @@ -514,7 +625,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, case NF_ACCEPT: break; case NF_DROP: - kfree_skb(skb); + kfree_skb_reason(skb, + SKB_DROP_REASON_NETFILTER_DROP); ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; @@ -524,10 +636,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, if (ret == 1) continue; return ret; + case NF_STOLEN: + return NF_DROP_GETERR(verdict); default: - /* Implicit handling for NF_STOLEN, as well as any other - * non conventional verdicts. 
- */ + WARN_ON_ONCE(1); return 0; } } @@ -540,11 +652,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); int ret; - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); @@ -559,32 +669,38 @@ EXPORT_SYMBOL(nf_hook_slow_list); /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ -struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; +const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); -struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; +const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); +const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); + +const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) -/* This does not belong here, but locally generated errors need it if connection - tracking in use: without this, connection may not be in hash table, and hence - manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) - __rcu __read_mostly; -EXPORT_SYMBOL(ip_ct_attach); +u8 nf_ctnetlink_has_listener; +EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); -struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; +const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); +/* This does not belong here, but locally generated errors need it if connection + * tracking in use: without this, connection may not be in hash table, and hence + * manufactured ICMP or RST packets will not be associated with it. 
+ */ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, const struct sk_buff *); + const struct nf_ct_hook *ct_hook; if (skb->_nfct) { rcu_read_lock(); - attach = rcu_dereference(ip_ct_attach); - if (attach) - attach(new, skb); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->attach(new, skb); rcu_read_unlock(); } } @@ -592,20 +708,38 @@ EXPORT_SYMBOL(nf_ct_attach); void nf_conntrack_destroy(struct nf_conntrack *nfct) { - struct nf_ct_hook *ct_hook; + const struct nf_ct_hook *ct_hook; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); +void nf_ct_set_closing(struct nf_conntrack *nfct) +{ + const struct nf_ct_hook *ct_hook; + + if (!nfct) + return; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->set_closing(nfct); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_set_closing); + bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { - struct nf_ct_hook *ct_hook; + const struct nf_ct_hook *ct_hook; bool ret = false; rcu_read_lock(); @@ -644,10 +778,6 @@ static int __net_init netfilter_net_init(struct net *net) #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif -#if IS_ENABLED(CONFIG_DECNET) - __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet)); -#endif - #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -680,12 +810,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); +err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 3c273483df23..b1ea054bb82c 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP depends on IP_SET help This option adds the bitmap:ip set type support, by which one - can store IPv4 addresses (or network addresse) from a range. + can store IPv4 addresses (or network addresses) from a range. To compile it as a module, choose M here. If unsure, say N. 
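The core.c hunks above replace the old ip_ct_attach function pointer with const, RCU-protected ops structures (nf_ct_hook, nf_nat_hook, nf_defrag_v4_hook, nf_defrag_v6_hook). A sketch of the consumer side of that pattern, assuming the defrag hook carries owner/enable members as in the conntrack code; the helper name is made up:

static int sample_enable_ipv4_defrag(struct net *net)
{
        const struct nf_defrag_hook *hook;
        int err = -EOPNOTSUPP;

        rcu_read_lock();
        hook = rcu_dereference(nf_defrag_v4_hook);
        if (hook && try_module_get(hook->owner))
                err = 0;
        rcu_read_unlock();

        if (err)
                return err;             /* defrag module not loaded */

        /* Safe to call outside the RCU section: the module reference keeps
         * the ops structure alive.
         */
        err = hook->enable(net);
        module_put(hook->owner);
        return err;
}
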
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 26ab0e9612d8..798c7993635e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -4,6 +4,8 @@ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H +#include <linux/rcupdate_wait.h> + #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) @@ -28,6 +30,7 @@ #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) @@ -57,9 +60,6 @@ mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); @@ -264,7 +264,7 @@ out: static void mtype_gc(struct timer_list *t) { - struct mtype *map = from_timer(map, t, gc); + struct mtype *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; @@ -288,6 +288,15 @@ mtype_gc(struct timer_list *t) add_timer(&map->gc); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_delete_sync(&map->gc); +} + static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, @@ -301,6 +310,7 @@ static const struct ip_set_type_variant mtype = { .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 486959f70cf3..5988b9bb9029 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) { + if (ip > ip_to) swap(ip, ip_to); - if (ip < map->first_ip) - return -IPSET_ERR_BITMAP_RANGE; - } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ip_to = ip; } - if (ip_to > map->last_ip) + if (ip < map->first_ip || ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { @@ -308,8 +305,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); - hosts = 2 << (32 - netmask - 1); - elements = 2 << (netmask - mask_bits - 1); + hosts = 2U << (32 - netmask - 1); + elements = 2UL << (netmask - mask_bits - 1); } if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; @@ -326,7 +323,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 2310a316e0af..2c625e0f49ec 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ 
b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -363,7 +363,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index e56ced66f202..7138e080def4 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -274,7 +274,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 340cb955af25..cc20e6d56807 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,14 +53,17 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -101,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision) static bool load_settype(const char *name) { + if (!try_module_get(THIS_MODULE)) + return false; + nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return true; } @@ -250,22 +258,7 @@ EXPORT_SYMBOL_GPL(ip_set_type_unregister); void * ip_set_alloc(size_t size) { - void *members = NULL; - - if (size < KMALLOC_MAX_SIZE) - members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - - if (members) { - pr_debug("%p: allocated with kmalloc\n", members); - return members; - } - - members = vzalloc(size); - if (!members) - return NULL; - pr_debug("%p: allocated with vmalloc\n", members); - - return members; + return kvzalloc(size, GFP_KERNEL_ACCOUNT); } EXPORT_SYMBOL_GPL(ip_set_alloc); @@ -286,8 +279,7 @@ flag_nested(const struct nlattr *nla) static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, - [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int @@ -369,7 +361,7 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); if (unlikely(!c)) return; - strlcpy(c->str, ext->comment, len + 1); + strscpy(c->str, ext->comment, len + 1); set->ext_size += sizeof(*c) + strlen(c->str) + 1; rcu_assign_pointer(comment->c, c); } @@ -460,6 +452,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; + if (align < ip_set_extensions[id].align) + align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; @@ -650,13 +644,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, if (SET_WITH_COUNTER(set)) { struct ip_set_counter *counter = ext_counter(data, set); + ip_set_update_counter(counter, ext, flags); + if (flags & IPSET_FLAG_MATCH_COUNTERS && !(ip_set_match_counter(ip_set_get_packets(counter), mext->packets, mext->packets_op) && ip_set_match_counter(ip_set_get_bytes(counter), mext->bytes, mext->bytes_op))) return false; - ip_set_update_counter(counter, ext, flags); } if (SET_WITH_SKBINFO(set)) ip_set_get_skbinfo(ext_skbinfo(data, set), @@ -696,6 +691,14 @@ __ip_set_put(struct ip_set *set) * a separate reference counter */ static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -713,15 +716,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -752,9 +750,7 @@ ip_set_test(ip_set_id_t 
index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ @@ -887,7 +883,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); - strncpy(name, set->name, IPSET_MAXNAMELEN); + strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -976,20 +972,9 @@ static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - - nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), - sizeof(*nfmsg), flags); - if (!nlh) - return NULL; - - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_IPV4; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - return nlh; + return nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, + NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ @@ -1055,26 +1040,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, return 0; } -static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { return -EOPNOTSUPP; } -static int ip_set_create(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; - u32 flags = flag_exist(nlh); + u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || @@ -1100,7 +1081,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, if (!set) return -ENOMEM; spin_lock_init(&set->lock); - strlcpy(set->name, name, IPSET_MAXNAMELEN); + strscpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; @@ -1122,8 +1103,10 @@ static int ip_set_create(struct net *net, struct sock *ctnl, ret = -IPSET_ERR_PROTOCOL; goto put_out; } + /* Set create flags depending on the type revision */ + set->flags |= set->type->create_flags[revision]; - ret = set->type->create(net, set, tb, flags); + ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; @@ -1156,7 +1139,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -1177,6 +1160,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, return ret; cleanup: + set->variant->cancel_gc(set); set->variant->destroy(set); put_out: 
module_put(set->type->me); @@ -1194,23 +1178,56 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } -static int ip_set_destroy(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static void +_destroy_all_sets(struct ip_set_net *inst) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; + + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } +} + +static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) +{ + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -1218,21 +1235,18 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference * counter, so if it's already zero, we can proceed * without holding the lock. 
*/ - read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { + read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { @@ -1242,29 +1256,34 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { + u32 flags = flag_exist(info->nlh); + u16 features = 0; + + read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { - ret = -ENOENT; + if (!(flags & IPSET_FLAG_EXIST)) + ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } + features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - - ip_set_destroy_set(s); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); + if (features & IPSET_TYPE_NAME) { + /* Must wait for flush to be really finished */ + rcu_barrier(); + } + call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: @@ -1284,12 +1303,10 @@ ip_set_flush_set(struct ip_set *set) ip_set_unlock(set); } -static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; @@ -1324,12 +1341,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; -static int ip_set_rename(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; @@ -1358,7 +1373,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, goto out; } } - strncpy(set->name, name2, IPSET_MAXNAMELEN); + strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); @@ -1374,12 +1389,10 @@ out: * so the ip_set_list always contains valid pointers to the sets. 
*/ -static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; @@ -1414,9 +1427,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, return -EBUSY; } - strncpy(from_name, from->name, IPSET_MAXNAMELEN); - strncpy(from->name, to->name, IPSET_MAXNAMELEN); - strncpy(to->name, from_name, IPSET_MAXNAMELEN); + strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); + strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); + strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; @@ -1642,7 +1655,7 @@ dump_last: goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); - /* fall through */ + fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) @@ -1689,10 +1702,8 @@ out: return ret < 0 ? ret : skb->len; } -static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; @@ -1703,7 +1714,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } } @@ -1719,8 +1730,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1728,13 +1739,22 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { + if (retried) { + __ip_set_get_netlink(set); + nfnl_unlock(NFNL_SUBSYS_IPSET); + cond_resched(); + nfnl_lock(NFNL_SUBSYS_IPSET); + __ip_set_put_netlink(set); + } + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; - } while (ret == -EAGAIN && - set->variant->resize && - (ret = set->variant->resize(set, retried)) == 0); + } while (ret == -ERANGE || + (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; @@ -1753,11 +1773,13 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, skb2 = nlmsg_new(payload, GFP_KERNEL); if (!skb2) return -ENOMEM; - rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); + rep = nlmsg_put(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; - memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); + unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len, + /* Bounds checked by the skb layer. 
*/); + cmdattr = (void *)&errmsg->msg + min_len; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, @@ -1772,8 +1794,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1817,7 +1838,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1828,7 +1849,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1837,30 +1858,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_ADD, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_ADD, info->nlh, attr, info->extack); } -static int ip_set_udel(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_DEL, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_DEL, info->nlh, attr, info->extack); } -static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; @@ -1892,16 +1907,13 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, /* Get headed data of a set */ -static int ip_set_header(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1915,7 +1927,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; @@ -1927,11 +1939,7 @@ static int 
ip_set_header(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1949,10 +1957,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; -static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1975,7 +1981,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; @@ -1988,11 +1994,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2008,14 +2010,11 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; -static int ip_set_protocol(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -2024,7 +2023,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; @@ -2034,11 +2033,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2049,17 +2044,14 @@ nlmsg_failure: /* Get set by name or index, from userspace */ -static int ip_set_byname(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2073,7 +2065,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, if (!skb2) return 
-ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; @@ -2083,11 +2075,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2101,17 +2089,14 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; -static int ip_set_byindex(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2128,7 +2113,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; @@ -2137,11 +2122,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2153,80 +2134,96 @@ nlmsg_failure: static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = 
ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, @@ -2390,29 +2387,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), @@ -2451,8 +2444,11 @@ ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - unregister_pernet_subsys(&ip_set_net_ops); + + /* Wait for call_rcu() in destroy */ + rcu_barrier(); + pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1ee43752d6d3..5e4453e9ef8e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -5,6 +5,7 @@ #define _IP_SET_HASH_GEN_H #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/jhash.h> #include <linux/types.h> #include <linux/netfilter/nfnetlink.h> @@ -37,37 +38,12 @@ */ /* Number of elements to store in an initial array block */ -#define AHASH_INIT_SIZE 4 +#define AHASH_INIT_SIZE 2 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE) /* Max muber of elements in the array block when tuned */ #define AHASH_MAX_TUNED 64 - -/* Max number of elements can be tuned */ -#ifdef IP_SET_HASH_WITH_MULTI -#define AHASH_MAX(h) ((h)->ahash_max) - -static u8 -tune_ahash_max(u8 curr, u32 multi) -{ - u32 n; - - if (multi < curr) - return curr; - - n = curr + AHASH_INIT_SIZE; - /* Currently, at listing one hash bucket must fit into a message. - * Therefore we have a hard limit here. - */ - return n > curr && n <= AHASH_MAX_TUNED ? 
n : curr; -} - -#define TUNE_AHASH_MAX(h, multi) \ - ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) -#else -#define AHASH_MAX(h) AHASH_MAX_SIZE -#define TUNE_AHASH_MAX(h, multi) -#endif +#define AHASH_MAX(h) ((h)->bucketsize) /* A hash bucket */ struct hbucket { @@ -87,8 +63,8 @@ struct hbucket { : jhash_size((htable_bits) - HTABLE_REGION_BITS)) #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) -#define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) +#define ahash_region(n) \ + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) @@ -132,31 +108,17 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -198,6 +160,17 @@ htable_bits(u32 hashsize) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -249,6 +222,7 @@ htable_bits(u32 hashsize) #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init +#undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match @@ -293,6 +267,7 @@ htable_bits(u32 hashsize) #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -321,11 +296,10 @@ struct htype { #ifdef IP_SET_HASH_WITH_MARKMASK u32 markmask; /* markmask value for mark mask to store */ #endif -#ifdef IP_SET_HASH_WITH_MULTI - u8 ahash_max; /* max elements in an array block */ -#endif -#ifdef IP_SET_HASH_WITH_NETMASK + u8 bucketsize; /* max elements in an array block */ +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -458,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference(hbucket(t, i)); + n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -478,10 +452,7 @@ mtype_destroy(struct ip_set *set) 
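The reworked htable_size() in the hunk above now bounds the bucket-pointer array by INT_MAX instead of SIZE_MAX, matching the cap kvmalloc_node() enforces, and the open-coded htable_bits() helper is removed (its rounding reappears further below in the create path as fls(hashsize - 1)). A minimal standalone sketch of the new size check follows; the struct hbucket/struct htable stand-ins are illustrative only and not the kernel layout:

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-ins; only their sizes matter for the arithmetic below. */
struct hbucket { void *elem; };
struct htable  { unsigned char htable_bits; struct hbucket *bucket[]; };

/* Mirrors the reworked htable_size(): return 0 when 2^hbits bucket
 * pointers plus the table header would not fit below INT_MAX, the
 * allocation limit enforced by kvmalloc_node(). */
static size_t htable_size(unsigned char hbits)
{
	size_t hsize;

	if (hbits > 31)		/* keys must still fit the u32 used by jhash */
		return 0;
	hsize = (size_t)1 << hbits;	/* jhash_size(hbits) */
	if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize)
		return 0;
	return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}

int main(void)
{
	printf("bits=20 -> %zu bytes\n", htable_size(20));	/* ~8 MB on 64-bit */
	printf("bits=31 -> %zu bytes\n", htable_size(31));	/* 0: over the INT_MAX cap */
	return 0;
}

The old SIZE_MAX-based bound still admitted sizes larger than kvmalloc_node() will allocate, which is what the updated comment in the hunk refers to.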
struct htype *h = set->data; struct list_head *l, *lt; - if (SET_WITH_TIMEOUT(set)) - cancel_delayed_work_sync(&h->gc.dwork); - - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); @@ -500,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -627,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc) queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct htype *h = set->data; + + if (SET_WITH_TIMEOUT(set)) + cancel_delayed_work_sync(&h->gc.dwork); +} + static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); @@ -644,7 +624,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -668,21 +648,19 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; } t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); if (!t->hregion) { - kfree(t); + ip_set_free(t); ret = -ENOMEM; goto out; } @@ -724,7 +702,7 @@ retry: #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); - nr = ahash_region(key, htable_bits); + nr = ahash_region(key); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, @@ -817,6 +795,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ @@ -868,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; @@ -950,7 +934,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto set_full; /* Create a new slot */ if (n->pos >= n->size) { - TUNE_AHASH_MAX(h, multi); +#ifdef IP_SET_HASH_WITH_MULTI + if (h->bucketsize >= AHASH_MAX_TUNED) + goto set_full; + else if (h->bucketsize <= multi) + h->bucketsize += AHASH_INIT_SIZE; +#endif if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); @@ -1061,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = 
rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); rcu_read_unlock_bh(); @@ -1296,15 +1285,32 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. */ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) goto nla_put_failure; #endif + if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) { + if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) || + nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval))) + goto nla_put_failure; + } if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) @@ -1443,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, .region_lock = true, }; @@ -1456,8 +1463,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1495,7 +1504,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 
32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1503,6 +1511,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1520,7 +1555,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. + */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); @@ -1533,7 +1572,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, } t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); if (!t->hregion) { - kfree(t); + ip_set_free(t); kfree(h); return -ENOMEM; } @@ -1541,14 +1580,27 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK h->markmask = markmask; #endif - get_random_bytes(&h->initval, sizeof(h->initval)); - + if (tb[IPSET_ATTR_INITVAL]) + h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL])); + else + get_random_bytes(&h->initval, sizeof(h->initval)); + h->bucketsize = AHASH_MAX_SIZE; + if (tb[IPSET_ATTR_BUCKETSIZE]) { + h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]); + if (h->bucketsize < AHASH_INIT_SIZE) + h->bucketsize = AHASH_INIT_SIZE; + else if (h->bucketsize > AHASH_MAX_SIZE) + h->bucketsize = AHASH_MAX_SIZE; + else if (h->bucketsize % 2) + h->bucketsize += 1; + } t->htable_bits = hbits; t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 5d6d68eaf6a9..c9f4e3859663 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -23,7 +23,9 @@ /* 1 Counters support */ /* 2 Comments support */ /* 3 Forceadd support */ -#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ +/* 4 skbinfo support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -33,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ 
-85,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -97,11 +100,11 @@ static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ip4 *h = set->data; + struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, hosts; + u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) @@ -118,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -131,8 +134,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -143,18 +149,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - if (retried) { + if (retried) ip = ntohl(h->next.ip); + for (; ip <= ip_to; i++) { e.ip = htonl(ip); - } - for (; ip <= ip_to;) { + if (i > IPSET_MAX_RANGE) { + hash_ip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; - e.ip = htonl(ip); - if (e.ip == 0) + if (ip == 0) return 0; ret = 0; @@ -179,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -221,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -260,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; @@ -277,14 +279,17 @@ static struct ip_set_type hash_ip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 
eceb7bc4a93a..467c59a83c0a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -23,7 +23,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>"); @@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index aba1df617d6e..a22ec1a6f6ec 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -21,7 +21,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ +/* 2 skbinfo support */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); @@ -96,11 +97,11 @@ static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipmark4 *h = set->data; + struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0; + u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -120,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; + if (e.mark == 0 && e.ip == 0) + return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { @@ -132,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (e.mark == 0 && ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -144,8 +150,12 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ipmark4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -274,12 +284,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { 
.type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 1ff228717e29..e977b5a9c48d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -25,7 +25,9 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -34,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -91,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -104,11 +112,11 @@ static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipport4 *h = set->data; + struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -128,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -177,9 +189,13 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -249,12 +265,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -294,6 +315,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -341,15 +366,19 @@ static struct ip_set_type hash_ipport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fa88afd812fa..39a01934b153 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -25,7 +25,8 @@ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ +/* 5 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -107,11 +108,11 @@ static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportip4 *h = set->data; + struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -184,9 +185,13 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipportip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -356,11 +361,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index eef6ecfcb409..5c6de605a9fb 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -27,7 +27,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -159,12 +160,12 @@ static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportnet4 *h = set->data; + struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -278,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; + if (i > IPSET_MAX_RANGE) { + hash_ipportnet4_data_next(&h->next, + &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -513,11 +520,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 0b61593165ef..718814730acf 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -16,7 +16,7 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 0 +#define 
IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 136cf0781d3a..ce0a9ce5a91f 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -24,7 +24,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -135,11 +136,11 @@ static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_net4 *h = set->data; + struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -187,10 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } + if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_net4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -354,11 +361,13 @@ static struct ip_set_type hash_net_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index be5e95a0d876..30a655e5c4fd 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -26,7 +26,8 @@ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ +/* 7 interface wildcard support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -39,7 +40,7 @@ 
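A pattern worth calling out: the uadt hunks for hash:ip, hash:ip,mark, hash:ip,port, hash:ip,port,ip, hash:ip,port,net, hash:net and, below, hash:net,iface, hash:net,net, hash:net,port and hash:net,port,net all gain the same guard — a per-call counter that stops the add/delete loop once it passes IPSET_MAX_RANGE elements, records the next element via hash_*_data_next() and returns -ERANGE, reusing the pre-existing retried/h->next plumbing so a later pass resumes where this one stopped. A stripped-down sketch of that resumable loop; the range cap and the add_one() stub are illustrative, not the kernel's values:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_RANGE (1U << 20)	/* illustrative cap, not the kernel's IPSET_MAX_RANGE */

struct resume { uint32_t ip; };	/* models h->next */

static int add_one(uint32_t ip) { (void)ip; return 0; }	/* models adtfn() */

/* Add every address in [ip, ip_to], at most MAX_RANGE per call.
 * On hitting the cap, remember where to continue and ask for a retry. */
static int add_range(struct resume *next, uint32_t ip, uint32_t ip_to, int retried)
{
	uint32_t i = 0;
	int ret;

	if (retried)
		ip = next->ip;		/* resume from the recorded element */
	for (; ip <= ip_to; ip++, i++) {
		if (i > MAX_RANGE) {
			next->ip = ip;	/* mirrors hash_*_data_next() */
			return -ERANGE;
		}
		ret = add_one(ip);
		if (ret)
			return ret;
		if (ip == UINT32_MAX)	/* avoid wrap-around, like the ip == 0 checks */
			break;
	}
	return 0;
}

int main(void)
{
	struct resume next = { 0 };
	int retried = 0;

	while (add_range(&next, 0, 10 * MAX_RANGE, retried) == -ERANGE)
		retried = 1;		/* the core retries with retried = true */
	return 0;
}

Judging from the hunks alone, the intent is to bound how much work a single pass over a large range does and let the caller reschedule between passes, rather than rejecting big ranges outright.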
MODULE_ALIAS("ip_set_hash:net,iface"); #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) +#define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -137,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, #include "ip_set_hash_gen.h" #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static const char *get_physindev_name(const struct sk_buff *skb) +static const char *get_physindev_name(const struct sk_buff *skb, struct net *net) { - struct net_device *dev = nf_bridge_get_physindev(skb); + struct net_device *dev = nf_bridge_get_physindev(skb, net); return dev ? dev->name : NULL; } @@ -176,16 +177,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; - STRLCPY(e.iface, eiface); + STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { - STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) @@ -201,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -259,7 +260,12 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_netiface4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); @@ -389,16 +395,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; - STRLCPY(e.iface, eiface); + STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { - STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + STRSCPY(e.iface, SRCDIR ? 
IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) @@ -442,7 +448,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -470,11 +476,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index da4ef910b12d..8fbe649c9dd3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -22,7 +22,9 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +/* 3 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -32,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -152,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -162,12 +166,12 @@ static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netnet4 *h = set->data; + struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -211,8 +215,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; @@ -255,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { + i++; e.ip[1] = htonl(ip2); + if (i > IPSET_MAX_RANGE) { + hash_netnet4_data_next(&h->next, &e); + return -ERANGE; + } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -389,6 +398,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -399,6 +413,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -438,6 +453,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -459,14 +479,18 @@ static struct ip_set_type hash_netnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34448df80fb9..d1a0628df4ef 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -26,7 +26,8 @@ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ +/* 7 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -153,11 +154,11 @@ static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netport4 *h = set->data; + struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -245,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip = 
htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_netport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -460,11 +465,13 @@ static struct ip_set_type hash_netport_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 934c1712cba8..bf4f91b78e1d 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ /* 1 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ +/* 2 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -35,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net"); #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS #define IPSET_NET_COUNT 2 +#define IP_SET_HASH_WITH_NET0 /* IPv4 variant */ @@ -172,16 +174,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } +static u32 +hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ + if (from == 0 && to == UINT_MAX) { + *cidr = 0; + return to; + } + return ip_set_range_to_cidr(from, to, cidr); +} + static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netportnet4 *h = set->data; + struct hash_netportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; int ret; @@ -295,13 +307,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], do { e.ip[0] = htonl(ip); - ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); + ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip[1] = htonl(ip2); - ip2 = ip_set_range_to_cidr(ip2, ip2_to, - &e.cidr[1]); + if (i > IPSET_MAX_RANGE) { + hash_netportnet4_data_next(&h->next, + &e); + return -ERANGE; + } + ip2 = hash_netportnet4_range_to_cidr(ip2, + ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -558,11 +576,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, + .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = 
hash_netportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, - [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, + [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 5a67f7966574..13c7a08aa868 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -424,17 +428,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; @@ -545,6 +540,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } +static void +list_set_cancel_gc(struct ip_set *set) +{ + struct list_set *map = 
set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); +} + static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, @@ -558,12 +565,13 @@ static const struct ip_set_type_variant set_variant = { .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, + .cancel_gc = list_set_cancel_gc, }; static void list_set_gc(struct timer_list *t) { - struct list_set *map = from_timer(map, t, gc); + struct list_set *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); @@ -603,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -619,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 2c1593089ede..c203252e856d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -4,7 +4,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER + depends on INET && NETFILTER depends on (NF_CONNTRACK || NF_CONNTRACK=n) help IP Virtual Server support will let you build a high-performance @@ -29,7 +29,6 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" depends on IPV6 = y || IP_VS = IPV6 - select IP6_NF_IPTABLES select NF_DEFRAG_IPV6 help Add IPv6 support to IPVS. @@ -45,7 +44,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - range 8 20 + range 8 20 if !64BIT + range 8 27 if 64BIT default 12 help The IPVS connection hash table uses the chaining scheme to handle @@ -55,24 +55,24 @@ config IP_VS_TAB_BITS Note the table size must be power of 2. The table size will be the value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). + from 8 to 27 for 64BIT(20 otherwise), the default number is 12, + which means the table size is 4096. Don't input the number too + small, otherwise you will lose performance on it. You can adapt the + table size yourself, according to your virtual server application. + It is good to set the table size not far less than the number of + connections per second multiplying average lasting time of + connection in the table. 
For example, your virtual server gets 200 + connections per second, the connection lasts for 200 seconds in + average in the connection table, the table size should be not far + less than 200x200, it is good to set the table size 32768 (2**15). Another note that each connection occupies 128 bytes effectively and each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. You can overwrite this number setting conn_tab_bits module parameter - or by appending ip_vs.conn_tab_bits=? to the kernel command line - if IP VS was compiled built-in. + or by appending ip_vs.conn_tab_bits=? to the kernel command line if + IP VS was compiled built-in. comment "IPVS transport protocol load balancing support" @@ -105,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select LIBCRC32C + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. @@ -272,6 +272,17 @@ config IP_VS_NQ If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_TWOS + tristate "weighted random twos choice least-connection scheduling" + help + The weighted random twos choice least-connection scheduling + algorithm picks two random real servers and directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + comment 'IPVS SH scheduler' config IP_VS_SH_TAB_BITS @@ -308,7 +319,7 @@ config IP_VS_MH_TAB_INDEX comment 'IPVS application helper' config IP_VS_FTP - tristate "FTP protocol helper" + tristate "FTP protocol helper" depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ NF_CONNTRACK_FTP select IP_VS_NFCT diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index bfce2677fda2..bb5d8125c82a 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o +obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o # IPVS application helpers obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index f9b16f2b2219..d54d7da58334 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -13,8 +13,7 @@ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -599,13 +598,19 @@ static const struct seq_operations ip_vs_app_seq_ops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->app_list); - proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops, - sizeof(struct seq_net_private)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, + &ip_vs_app_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; +#endif return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { unregister_ip_vs_app(ipvs, NULL /* all */); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_app", ipvs->net->proc_net); +#endif } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 02f2f636798d..50cc492c7553 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ 
b/net/netfilter/ipvs/ip_vs_conn.c @@ -17,8 +17,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/interrupt.h> #include <linux/in.h> @@ -26,12 +25,12 @@ #include <linux/net.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/rcupdate_wait.h> #include <net/net_namespace.h> #include <net/ip_vs.h> @@ -402,6 +401,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; + const union nf_inet_addr *saddr; + __be16 sport; /* * Check for "full" addressed entries @@ -411,10 +412,20 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) rcu_read_lock(); hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->vport == cp->cport && p->cport == cp->dport && - cp->af == p->af && + if (p->vport != cp->cport) + continue; + + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } + + if (p->cport == sport && cp->af == p->af && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && p->protocol == cp->protocol && cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) @@ -807,9 +818,34 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } +/* Try to delete connection while not holding reference */ +static void ip_vs_conn_del(struct ip_vs_conn *cp) +{ + if (timer_delete(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + ip_vs_conn_expire(&cp->timer); + } +} + +/* Try to delete connection while holding reference */ +static void ip_vs_conn_del_put(struct ip_vs_conn *cp) +{ + if (timer_delete(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + __ip_vs_conn_put(cp); + ip_vs_conn_expire(&cp->timer); + } else { + __ip_vs_conn_put(cp); + } +} + static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = from_timer(cp, t, timer); + struct ip_vs_conn *cp = timer_container_of(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -823,18 +859,21 @@ static void ip_vs_conn_expire(struct timer_list *t) struct ip_vs_conn *ct = cp->control; /* delete the timer if it is activated by other users */ - del_timer(&cp->timer); + timer_delete(&cp->timer); /* does anybody control me? */ if (ct) { + bool has_ref = !cp->timeout && __ip_vs_conn_get(ct); + ip_vs_control_del(cp); /* Drop CTL or non-assured TPL if not used anymore */ - if (!cp->timeout && !atomic_read(&ct->n_control) && + if (has_ref && !atomic_read(&ct->n_control) && (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || !(ct->state & IP_VS_CTPL_S_ASSURED))) { IP_VS_DBG(4, "drop controlling connection\n"); - ct->timeout = 0; - ip_vs_conn_expire_now(ct); + ip_vs_conn_del_put(ct); + } else if (has_ref) { + __ip_vs_conn_put(ct); } } @@ -845,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t) * conntrack cleanup for the net. 
*/ smp_rmb(); - if (ipvs->enable) + if (READ_ONCE(ipvs->enable)) ip_vs_conn_drop_conntrack(cp); } @@ -886,7 +925,7 @@ static void ip_vs_conn_expire(struct timer_list *t) void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { /* Using mod_timer_pending will ensure the timer is not - * modified after the final del_timer in ip_vs_conn_expire. + * modified after the final timer_delete in ip_vs_conn_expire. */ if (timer_pending(&cp->timer) && time_after(cp->timer.expires, jiffies)) @@ -1006,28 +1045,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - struct hlist_head *l; + unsigned int bucket; + unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) { int idx; struct ip_vs_conn *cp; - struct ip_vs_iter_state *iter = seq->private; - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + unsigned int skip = 0; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (pos-- == 0) { - iter->l = &ip_vs_conn_tab[idx]; + if (skip >= iter->skip_elems) { + iter->bucket = idx; return cp; } + + ++skip; } + + iter->skip_elems = 0; cond_resched_rcu(); } + iter->bucket = idx; return NULL; } @@ -1036,9 +1082,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { struct ip_vs_iter_state *iter = seq->private; - iter->l = NULL; rcu_read_lock(); - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; + if (*pos == 0) { + iter->skip_elems = 0; + iter->bucket = 0; + return SEQ_START_TOKEN; + } + + return ip_vs_conn_array(iter); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -1046,28 +1097,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; struct hlist_node *e; - struct hlist_head *l = iter->l; - int idx; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); + return ip_vs_conn_array(iter); /* more on same hash chain? */ e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) + if (e) { + iter->skip_elems++; return hlist_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - while (++idx < ip_vs_conn_tab_size) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - iter->l = &ip_vs_conn_tab[idx]; - return cp; - } - cond_resched_rcu(); } - iter->l = NULL; - return NULL; + + iter->skip_elems = 0; + iter->bucket++; + + return ip_vs_conn_array(iter); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1225,8 +1270,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. * Called from timer bh only => no locking */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; + static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static signed char todrop_counter[9] = {0}; int i; /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 
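(Editorial aside, not part of the patch: the switch from plain char to signed char for todrop_rate/todrop_counter matters because the signedness of bare char is implementation-defined; on ABIs where it is unsigned (arm, s390, or anything built with -funsigned-char), decrementing past zero wraps to 255, so a "> 0" test keeps succeeding. The exact comparison in the drop-counter logic is not shown in this hunk, so treat the connection as an assumption; the C pitfall itself is illustrated by this minimal userspace snippet.)

	#include <stdio.h>

	int main(void)
	{
		char plain = 0;		/* signedness is implementation-defined */
		signed char s = 0;

		/* On unsigned-char ABIs --plain wraps to 255 and the test is true;
		 * with an explicit signed char it becomes -1 and the test is false.
		 */
		printf("--plain  > 0: %d\n", --plain > 0);
		printf("--signed > 0: %d\n", --s > 0);
		return 0;
	}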
@@ -1268,7 +1313,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; + unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->ipvs != ipvs) @@ -1317,8 +1362,7 @@ try_drop: drop: IP_VS_DBG(4, "drop connection\n"); - cp->timeout = 0; - ip_vs_conn_expire_now(cp); + ip_vs_conn_del(cp); } cond_resched_rcu(); } @@ -1341,19 +1385,15 @@ flush_again: hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - /* As timers are expired in LIFO order, restart - * the timer of controlling connection first, so - * that it is expired after us. - */ + if (atomic_read(&cp->n_control)) + continue; cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { IP_VS_DBG(4, "del controlling connection\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); + ip_vs_conn_del(cp_c); } - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } @@ -1366,6 +1406,45 @@ flush_again: goto flush_again; } } + +#ifdef CONFIG_SYSCTL +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct ip_vs_dest *dest; + + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + + dest = cp->dest; + if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) + continue; + + if (atomic_read(&cp->n_control)) + continue; + + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + + /* netns clean up started, abort delayed work */ + if (!READ_ONCE(ipvs->enable)) + break; + } + rcu_read_unlock(); +} +#endif + /* * per netns init and exit */ @@ -1373,51 +1452,78 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { atomic_set(&ipvs->conn_count, 0); - proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, - &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state)); - proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, - &ip_vs_conn_sync_seq_ops, - sizeof(struct ip_vs_iter_state)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, + &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn; + + if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn_sync; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +err_conn_sync: + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); +err_conn: + return -ENOMEM; +#endif } void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ ip_vs_conn_flush(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); +#endif } int __init ip_vs_conn_init(void) { + size_t tab_array_size; + int max_avail; +#if BITS_PER_LONG > 32 + int max = 27; +#else + int max = 20; +#endif + int min = 8; int idx; - /* Compute size and mask */ + max_avail = 
order_base_2(totalram_pages()) + PAGE_SHIFT; + max_avail -= 2; /* ~4 in hash row */ + max_avail -= 1; /* IPVS up to 1/2 of mem */ + max_avail -= order_base_2(sizeof(struct ip_vs_conn)); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab))); + tab_array_size = array_size(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab)); + ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab), GFP_KERNEL); if (!ip_vs_conn_tab) return -ENOMEM; /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); return -ENOMEM; } - pr_info("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - ip_vs_conn_tab_size, - (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); + pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", + ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); @@ -1440,5 +1546,5 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index aa6a603a2425..90d56f92c0f6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -19,8 +19,7 @@ * Harald Welte don't use nfcache */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -68,18 +67,6 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); -#ifdef CONFIG_IP_VS_PROTO_TCP -INDIRECT_CALLABLE_DECLARE(int - tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP -INDIRECT_CALLABLE_DECLARE(int - udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) #define SNAT_CALL(f, ...) 
\ INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) @@ -144,21 +131,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -180,21 +167,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -212,17 +199,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -694,16 +681,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) return ipvs->sysctl_nat_icmp_send; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) -{ - return ipvs->sysctl_expire_nodest_conn; -} - #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -748,12 +729,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(ipvs->net, skb) != 0) + ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; @@ -881,7 +862,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, unsigned int verdict = 
NF_DROP; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - goto ignore_cp; + goto after_nat; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { @@ -907,6 +888,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; +after_nat: /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -915,8 +897,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); - -ignore_cp: verdict = NF_ACCEPT; out: @@ -1159,7 +1139,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, __be16 vport; unsigned int flags; - EnterFunction(12); vaddr = &svc->addr; vport = svc->port; daddr = &iph->saddr; @@ -1227,7 +1206,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); - LeaveFunction(12); return cp; } @@ -1282,6 +1260,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, { struct ip_vs_protocol *pp = pd->pp; + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto after_nat; + IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (skb_ensure_writable(skb, iph->len)) @@ -1322,6 +1303,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); +after_nat: ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; @@ -1331,13 +1313,11 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); - LeaveFunction(11); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); - LeaveFunction(11); return NF_STOLEN; } @@ -1345,16 +1325,17 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + int af = state->pf; struct sock *sk; - EnterFunction(11); - /* Already marked as IPVS request or reply? 
*/ if (skb->ipvs_property) return NF_ACCEPT; @@ -1364,16 +1345,13 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - if (!ipvs->enable) - return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1418,11 +1396,8 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, af, skb, &iph); - if (likely(cp)) { - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - goto ignore_cp; + if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); - } /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { @@ -1481,66 +1456,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in } } -out: IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; - -ignore_cp: - __ip_vs_conn_put(cp); - goto out; } -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, @@ -1626,6 +1546,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ @@ -1636,7 +1557,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; - return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + + gre_flags_to_tnl_flags(flags, greh->flags); + + return gre_calc_hlen(flags); } unk: @@ -1980,15 +1904,17 @@ out: * and send it on its way... 
*/ static unsigned int -ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; - int conn_reuse_mode; struct sock *sk; + int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -2010,7 +1936,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); @@ -2020,7 +1946,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } @@ -2064,16 +1990,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, af, skb, &iph); - conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); - if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { - bool uses_ct = false, resched = false; + if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { + int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); + bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); - } else if (is_new_conn_expected(cp, conn_reuse_mode)) { - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); + } else if (conn_reuse_mode && + is_new_conn_expected(cp, conn_reuse_mode)) { + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { @@ -2081,50 +2008,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int * that uses conntrack while it is still * referenced by controlled connection(s). 
*/ - resched = !uses_ct; + resched = !old_ct; } } if (resched) { + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); - if (uses_ct) + if (old_ct) return NF_DROP; cp = NULL; } } - if (unlikely(!cp)) { - int v; - - if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) - return v; - } - - IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ + if (sysctl_expire_nodest_conn(ipvs)) { + bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); - __u32 flags = cp->flags; - - /* when timer already started, silently drop the packet.*/ - if (timer_pending(&cp->timer)) - __ip_vs_conn_put(cp); - else - ip_vs_conn_put(cp); + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; - if (sysctl_expire_nodest_conn(ipvs) && - !(flags & IP_VS_CONN_F_ONE_PACKET)) { - /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } else { + __ip_vs_conn_put(cp); + return NF_DROP; } + } - return NF_DROP; + if (unlikely(!cp)) { + int v; + + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) + return v; } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); + ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) @@ -2147,7 +2075,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_sync_threshold(ipvs); else - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); @@ -2160,55 +2088,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int } /* - * AF_INET handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * AF_INET handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * AF_INET6 handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * AF_INET6 handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - - -/* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. 
* When fwmark-based virtual service is used, such as transparent @@ -2221,45 +2100,36 @@ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; struct netns_ipvs *ipvs = net_ipvs(state->net); - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; + int r; /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; - return ip_vs_in_icmp(ipvs, skb, &r, state->hook); -} - + if (state->pf == NFPROTO_IPV4) { + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - int r; - struct netns_ipvs *ipvs = net_ipvs(state->net); - struct ip_vs_iphdr iphdr; + } else { + struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - if (iphdr.protocol != IPPROTO_ICMPV6) - return NF_ACCEPT; + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) - return NF_ACCEPT; + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; - return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); -} + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif + } + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); +} -static const struct nf_hook_ops ip_vs_ops[] = { +static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -2268,21 +2138,21 @@ static const struct nf_hook_ops ip_vs_ops[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -2297,15 +2167,18 @@ static const struct nf_hook_ops ip_vs_ops[] = { }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, +}; + #ifdef CONFIG_IP_VS_IPV6 +static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -2314,21 +2187,21 @@ static const struct nf_hook_ops ip_vs_ops[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { - .hook = ip_vs_remote_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, @@ -2336,20 +2209,76 @@ static const struct nf_hook_ops ip_vs_ops[] = { /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { - .hook = ip_vs_forward_icmp_v6, + .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, -#endif }; +#endif + +int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + int ret = 0; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return -EINVAL; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (!(ipvs->hooks_afmask & afmask)) { + ret = nf_register_net_hooks(ipvs->net, ops, count); + if (ret >= 0) + ipvs->hooks_afmask |= afmask; + } + return ret; +} + +void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (ipvs->hooks_afmask & afmask) { + nf_unregister_net_hooks(ipvs->net, ops, count); + ipvs->hooks_afmask &= ~afmask; + } +} + /* * Initialize IP Virtual Server netns mem. 
*/ @@ -2361,8 +2290,8 @@ static int __net_init __ip_vs_init(struct net *net) if (ipvs == NULL) return -ENOMEM; - /* Hold the beast until a service is registerd */ - ipvs->enable = 0; + /* Hold the beast until a service is registered */ + WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); @@ -2425,33 +2354,19 @@ static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) } } -static int __net_init __ip_vs_dev_init(struct net *net) -{ - int ret; - - ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) - goto hook_fail; - return 0; - -hook_fail: - return ret; -} - static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; - EnterFunction(2); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ + ip_vs_unregister_hooks(ipvs, AF_INET); + ip_vs_unregister_hooks(ipvs, AF_INET6); + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } - LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { @@ -2462,7 +2377,6 @@ static struct pernet_operations ipvs_core_ops = { }; static struct pernet_operations ipvs_core_dev_ops = { - .init = __ip_vs_dev_init, .exit_batch = __ip_vs_dev_cleanup_batch, }; @@ -2526,9 +2440,14 @@ static void __exit ip_vs_cleanup(void) ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + /* common rcu_barrier() used by: + * - ip_vs_control_cleanup() + */ + rcu_barrier(); pr_info("ipvs unloaded.\n"); } module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Virtual Server"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 412656c34f20..068702894377 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -13,8 +13,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/init.h> @@ -24,7 +23,6 @@ #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/workqueue.h> -#include <linux/swap.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -48,8 +46,9 @@ #include <net/ip_vs.h> -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
*/ -static DEFINE_MUTEX(__ip_vs_mutex); +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + +DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ @@ -94,6 +93,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; + int amemthresh; int nomem; int to_change = -1; @@ -105,7 +105,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < ipvs->sysctl_amemthresh); + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); + nomem = (availmem < amemthresh); local_bh_disable(); @@ -145,9 +146,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 1: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; @@ -155,9 +155,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 2: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; @@ -210,6 +209,17 @@ static void update_defense_level(struct netns_ipvs *ipvs) local_bh_enable(); } +/* Handler for delayed work for expiring no + * destination connections + */ +static void expire_nodest_conn_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs; + + ipvs = container_of(work, struct netns_ipvs, + expire_nodest_conn_work.work); + ip_vs_expire_nodest_conn_flush(ipvs); +} /* * Timer for checking the defense @@ -224,10 +234,52 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); } #endif +static void est_reload_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, est_reload_work.work); + int genid_done = atomic_read(&ipvs->est_genid_done); + unsigned long delay = HZ / 10; /* repeat startups after failure */ + bool repeat = false; + int genid; + int id; + + mutex_lock(&ipvs->est_mutex); + genid = atomic_read(&ipvs->est_genid); + for (id = 0; id < ipvs->est_kt_count; id++) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; + + /* netns clean up started, abort delayed work */ + if (!READ_ONCE(ipvs->enable)) + goto unlock; + if (!kd) + continue; + /* New config ? 
Stop kthread tasks */ + if (genid != genid_done) + ip_vs_est_kthread_stop(kd); + if (!kd->task && !ip_vs_est_stopped(ipvs)) { + /* Do not start kthreads above 0 in calc phase */ + if ((!id || !ipvs->est_calc_phase) && + ip_vs_est_kthread_start(ipvs, kd) < 0) + repeat = true; + } + } + + atomic_set(&ipvs->est_genid_done, genid); + + if (repeat) + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, + delay); + +unlock: + mutex_unlock(&ipvs->est_mutex); +} + int ip_vs_use_count_inc(void) { @@ -458,7 +510,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static void ip_vs_service_free(struct ip_vs_service *svc) { - free_percpu(svc->stats.cpustats); + ip_vs_stats_release(&svc->stats); kfree(svc); } @@ -470,17 +522,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head) ip_vs_service_free(svc); } -static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - if (do_delay) - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - else - ip_vs_service_free(svc); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } @@ -767,14 +816,22 @@ out: return dest; } +static void ip_vs_dest_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest *dest; + + dest = container_of(head, struct ip_vs_dest, rcu_head); + ip_vs_stats_release(&dest->stats); + ip_vs_dest_put_and_free(dest); +} + static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); - __ip_vs_svc_put(svc, false); - free_percpu(dest->stats.cpustats); - ip_vs_dest_put_and_free(dest); + __ip_vs_svc_put(svc); + call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* @@ -790,7 +847,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - del_timer_sync(&ipvs->dest_trash_timer); + timer_delete_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); @@ -798,12 +855,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) } } +static void ip_vs_stats_rcu_free(struct rcu_head *head) +{ + struct ip_vs_stats_rcu *rs = container_of(head, + struct ip_vs_stats_rcu, + rcu_head); + + ip_vs_stats_release(&rs->s); + kfree(rs); +} + static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c - spin_lock_bh(&src->lock); + spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); @@ -813,7 +880,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) ip_vs_read_estimator(dst, src); - spin_unlock_bh(&src->lock); + spin_unlock(&src->lock); } static void @@ -834,7 +901,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) static void ip_vs_zero_stats(struct ip_vs_stats *stats) { - spin_lock_bh(&stats->lock); + spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ @@ -848,7 +915,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) ip_vs_zero_estimator(stats); - spin_unlock_bh(&stats->lock); + spin_unlock(&stats->lock); +} + +/* Allocate fields after kzalloc */ +int ip_vs_stats_init_alloc(struct ip_vs_stats *s) +{ + int i; + + spin_lock_init(&s->lock); + s->cpustats = 
alloc_percpu(struct ip_vs_cpu_stats); + if (!s->cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); + + u64_stats_init(&cs->syncp); + } + return 0; +} + +struct ip_vs_stats *ip_vs_stats_alloc(void) +{ + struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s && ip_vs_stats_init_alloc(s) >= 0) + return s; + kfree(s); + return NULL; +} + +void ip_vs_stats_release(struct ip_vs_stats *stats) +{ + free_percpu(stats->cpustats); +} + +void ip_vs_stats_free(struct ip_vs_stats *stats) +{ + if (stats) { + ip_vs_stats_release(stats); + kfree(stats); + } } /* @@ -910,7 +1018,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); - __ip_vs_svc_put(old_svc, true); + __ip_vs_svc_put(old_svc); } } @@ -929,7 +1037,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -947,18 +1054,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * Create a destination for the given service */ static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest **dest_p) +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - unsigned int atype, i; - - EnterFunction(2); + unsigned int atype; + int ret; #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { - int ret; - atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -980,15 +1083,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, if (dest == NULL) return -ENOMEM; - dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) + ret = ip_vs_stats_init_alloc(&dest->stats); + if (ret < 0) goto err_alloc; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_dest_stats; - ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); - u64_stats_init(&ip_vs_dest_stats->syncp); - } + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; @@ -1005,17 +1106,16 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); - *dest_p = dest; - - LeaveFunction(2); return 0; +err_stats: + ip_vs_stats_release(&dest->stats); + err_alloc: kfree(dest); - return -ENOMEM; + return ret; } @@ -1030,8 +1130,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) __be16 dport = udest->port; int ret; - EnterFunction(2); - if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; @@ -1077,15 +1175,16 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + return ret; __ip_vs_update_dest(svc, dest, udest, 1); - ret = 0; } else { /* * Allocate and initialize the dest structure */ - ret = ip_vs_new_dest(svc, udest, &dest); + ret = ip_vs_new_dest(svc, udest); } - 
LeaveFunction(2); return ret; } @@ -1101,8 +1200,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) union nf_inet_addr daddr; __be16 dport = udest->port; - EnterFunction(2); - if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; @@ -1134,7 +1231,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) } __ip_vs_update_dest(svc, dest, udest, 0); - LeaveFunction(2); return 0; } @@ -1163,6 +1259,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); + + /* Queue up delayed work to expire all no destination connections. + * No-op when CONFIG_SYSCTL is disabled. + */ + if (!cleanup) + ip_vs_enqueue_expire_nodest_conns(ipvs); } @@ -1203,8 +1305,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) struct ip_vs_dest *dest; __be16 dport = udest->port; - EnterFunction(2); - /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); @@ -1225,14 +1325,13 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) */ __ip_vs_del_dest(svc->ipvs, dest, false); - LeaveFunction(2); - return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); + struct netns_ipvs *ipvs = timer_container_of(ipvs, t, + dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1268,10 +1367,11 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0, i; + int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1313,24 +1413,23 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + if ((u->af == AF_INET && !ipvs->num_services) || + (u->af == AF_INET6 && !ipvs->num_services6)) { + ret = ip_vs_register_hooks(ipvs, u->af); + if (ret < 0) + goto out_err; + ret_hooks = ret; + } + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } - svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) { - ret = -ENOMEM; + ret = ip_vs_stats_init_alloc(&svc->stats); + if (ret < 0) goto out_err; - } - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_stats; - ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); - u64_stats_init(&ip_vs_stats->syncp); - } - /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); @@ -1340,14 +1439,13 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; - svc->flags = u->flags; + svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ if (sched) { @@ -1357,34 +1455,47 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } - /* Bind the ct retriever */ - RCU_INIT_POINTER(svc->pe, pe); - pe = NULL; + 
ret = ip_vs_start_estimator(ipvs, &svc->stats); + if (ret < 0) + goto out_err; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - if (svc->pe && svc->pe->conn_out) + if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); - ip_vs_start_estimator(ipvs, &svc->stats); + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; + else if (svc->af == AF_INET6) + ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); *svc_p = svc; - /* Now there is a service - full throttle */ - ipvs->enable = 1; + + if (!READ_ONCE(ipvs->enable)) { + /* Now there is a service - full throttle */ + WRITE_ONCE(ipvs->enable, 1); + + /* Start estimation for first time */ + ip_vs_est_reload_start(ipvs); + } + return 0; out_err: + if (ret_hooks >= 0) + ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); @@ -1500,9 +1611,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) + if (svc->af == AF_INET) { ipvs->num_services--; + if (!ipvs->num_services) + ip_vs_unregister_hooks(ipvs, svc->af); + } else if (svc->af == AF_INET6) { + ipvs->num_services6--; + if (!ipvs->num_services6) + ip_vs_unregister_hooks(ipvs, svc->af); + } ip_vs_stop_estimator(svc->ipvs, &svc->stats); @@ -1536,7 +1653,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the service if nobody refers to it */ - __ip_vs_svc_put(svc, true); + __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1614,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) struct netns_ipvs *ipvs; struct net *net; - EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { @@ -1622,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) ip_vs_flush(ipvs, true); } mutex_unlock(&__ip_vs_mutex); - LeaveFunction(2); } /* Put all references for device (dst_cache) */ @@ -1660,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); - EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { @@ -1689,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); - LeaveFunction(2); return NOTIFY_DONE; } @@ -1726,16 +1839,14 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } } - ip_vs_zero_stats(&ipvs->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } #ifdef CONFIG_SYSCTL -static int three = 3; - static int -proc_do_defense_mode(struct ctl_table *table, int write, +proc_do_defense_mode(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -1762,9 +1873,10 @@ proc_do_defense_mode(struct ctl_table *table, int write, } static int -proc_do_sync_threshold(struct ctl_table 
*table, int write, +proc_do_sync_threshold(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; @@ -1774,6 +1886,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, .mode = table->mode, }; + mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { @@ -1783,11 +1896,12 @@ proc_do_sync_threshold(struct ctl_table *table, int write, else memcpy(valp, val, sizeof(val)); } + mutex_unlock(&ipvs->sync_mutex); return rc; } static int -proc_do_sync_ports(struct ctl_table *table, int write, +proc_do_sync_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1810,6 +1924,149 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } +static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, + void *buffer) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + cpumask_var_t newmask; + int ret; + + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buffer, newmask); + if (ret) + goto out; + + mutex_lock(&ipvs->est_mutex); + + if (!ipvs->est_cpulist_valid) { + if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { + ret = -ENOMEM; + goto unlock; + } + ipvs->est_cpulist_valid = 1; + } + cpumask_and(newmask, newmask, ¤t->cpus_mask); + cpumask_copy(*valp, newmask); + /* est_max_threads may depend on cpulist size */ + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + ipvs->est_calc_phase = 1; + ip_vs_est_reload_start(ipvs); + +unlock: + mutex_unlock(&ipvs->est_mutex); + +out: + free_cpumask_var(newmask); + return ret; +} + +static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, + void *buffer, size_t size) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + struct cpumask *mask; + int ret; + + mutex_lock(&ipvs->est_mutex); + + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + + mutex_unlock(&ipvs->est_mutex); + + return ret; +} + +static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + /* Ignore both read and write(append) if *ppos not 0 */ + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + if (write) { + /* proc_sys_call_handler() appends terminator */ + ret = ipvs_proc_est_cpumask_set(table, buffer); + if (ret >= 0) + *ppos += *lenp; + } else { + /* proc_sys_call_handler() allocates 1 byte for terminator */ + ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); + if (ret >= 0) { + *lenp = ret; + *ppos += *lenp; + ret = 0; + } + } + return ret; +} + +static int ipvs_proc_est_nice(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < MIN_NICE || val > MAX_NICE) { + ret = -EINVAL; + } else { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + } + return ret; +} 
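(Editorial aside, not part of the patch: the est_cpulist and est_nice handlers above, together with run_estimation just below, back sysctls registered under /proc/sys/net/ipv4/vs/ via the vs_vars entries added further down. A hypothetical userspace sketch of how they might be exercised -- run as root; the cpulist string uses the standard "N-M,K" format accepted by cpulist_parse, and the nice value must fall in MIN_NICE..MAX_NICE, i.e. -20..19.)

	#include <stdio.h>

	static int write_sysctl(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		/* Pin estimation kthreads to CPUs 0-1, lower their priority,
		 * and make sure estimation is running.
		 */
		write_sysctl("/proc/sys/net/ipv4/vs/est_cpulist", "0-1");
		write_sysctl("/proc/sys/net/ipv4/vs/est_nice", "10");
		write_sysctl("/proc/sys/net/ipv4/vs/run_estimation", "1");
		return 0;
	}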
+ +static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -1942,7 +2199,7 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &three, + .extra2 = SYSCTL_THREE, }, { .procname = "nat_icmp_send", @@ -1980,6 +2237,24 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "run_estimation", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_run_estimation, + }, + { + .procname = "est_cpulist", + .maxlen = NR_CPUS, /* unused */ + .mode = 0644, + .proc_handler = ipvs_proc_est_cpulist, + }, + { + .procname = "est_nice", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_est_nice, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1989,7 +2264,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -2216,7 +2490,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); - ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, @@ -2240,7 +2514,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; @@ -2257,13 +2531,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) u64 conns, inpkts, outpkts, inbytes, outbytes; do { - start = u64_stats_fetch_begin_irq(&u->syncp); - conns = u->cnt.conns; - inpkts = u->cnt.inpkts; - outpkts = u->cnt.outpkts; - inbytes = u->cnt.inbytes; - outbytes = u->cnt.outbytes; - } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + start = u64_stats_fetch_begin(&u->syncp); + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); + } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, @@ -2414,7 +2688,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, } static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; @@ -2438,7 +2712,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) return -EINVAL; } - if 
(copy_from_user(arg, user, len) != 0) + if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ @@ -2471,6 +2745,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; + } else if (!len) { + /* No more commands with len == 0 below */ + ret = -EINVAL; + goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; @@ -2549,7 +2827,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = ip_vs_del_dest(svc, &udest); break; default: + WARN_ON_ONCE(1); ret = -EINVAL; + break; } out_unlock: @@ -2571,7 +2851,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); + strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2765,13 +3045,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, + strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, + strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } @@ -2811,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; - int size; + size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -2852,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; - int size; + size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3382,10 +3662,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); - if (nla_addr_family) - udest->af = nla_get_u16(nla_addr_family); - else - udest->af = 0; + udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -3521,7 +3798,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; - strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); @@ -3855,7 +4132,7 @@ out: } -static const struct genl_ops ip_vs_genl_ops[] = { +static const struct genl_small_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, 
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -3963,8 +4240,9 @@ static struct genl_family ip_vs_genl_family __ro_after_init = { .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, - .ops = ip_vs_genl_ops, - .n_ops = ARRAY_SIZE(ip_vs_genl_ops), + .small_ops = ip_vs_genl_ops, + .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), + .resv_start_op = IPVS_CMD_FLUSH + 1, }; static int __init ip_vs_genl_register(void) @@ -3986,22 +4264,24 @@ static void ip_vs_genl_unregister(void) static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; - int idx; struct ctl_table *tbl; + int idx, ret; + size_t ctl_table_size = ARRAY_SIZE(vs_vars); + bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); + ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - tbl[0].procname = NULL; } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4027,10 +4307,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; @@ -4039,6 +4326,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; @@ -4053,29 +4341,66 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; - ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); - if (ipvs->sysctl_hdr == NULL) { - if (!net_eq(net, &init_net)) - kfree(tbl); - return -ENOMEM; - } - ip_vs_start_estimator(ipvs, &ipvs->tot_stats); + ipvs->sysctl_run_estimation = 1; + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_run_estimation; + + ipvs->est_cpulist_valid = 0; + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_cpulist; + + ipvs->sysctl_est_nice = IPVS_EST_NICE; + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_nice; + +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif + + ret = -ENOMEM; + ipvs->sysctl_hdr = 
register_net_sysctl_sz(net, "net/ipv4/vs", tbl, + ctl_table_size); + if (!ipvs->sysctl_hdr) + goto err; ipvs->sysctl_tbl = tbl; + + ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); + if (ret < 0) + goto err; + /* Schedule defense work */ - INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); return 0; + +err: + unregister_net_sysctl_table(ipvs->sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(tbl); + return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; + cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + + if (ipvs->est_cpulist_valid) + free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -4097,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - int i, idx; + int ret = -ENOMEM; + int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4110,44 +4436,66 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); - /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) - return -ENOMEM; + INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); - u64_stats_init(&ipvs_tot_stats->syncp); - } - - spin_lock_init(&ipvs->tot_stats.lock); + /* procfs stats */ + ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + if (!ipvs->tot_stats) + goto out; + if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) + goto err_tot_stats; - proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); - proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, - ip_vs_stats_show, NULL); - proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, - ip_vs_stats_percpu_show, NULL); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, + &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) + goto err_vs; + if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, + ip_vs_stats_show, NULL)) + goto err_stats; + if (!proc_create_net_single("ip_vs_stats_percpu", 0, + ipvs->net->proc_net, + ip_vs_stats_percpu_show, NULL)) + goto err_percpu; +#endif - if (ip_vs_control_net_init_sysctl(ipvs)) + ret = ip_vs_control_net_init_sysctl(ipvs); + if (ret < 0) goto err; return 0; err: - free_percpu(ipvs->tot_stats.cpustats); - return -ENOMEM; +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + +err_percpu: + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + +err_stats: + remove_proc_entry("ip_vs", ipvs->net->proc_net); + +err_vs: +#endif + ip_vs_stats_release(&ipvs->tot_stats->s); + +err_tot_stats: + kfree(ipvs->tot_stats); + +out: + return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); 
ip_vs_control_net_cleanup_sysctl(ipvs); + cancel_delayed_work_sync(&ipvs->est_reload_work); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); - free_percpu(ipvs->tot_stats.cpustats); +#endif + call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) @@ -4184,8 +4532,6 @@ int __init ip_vs_control_init(void) int idx; int ret; - EnterFunction(2); - /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); @@ -4198,14 +4544,12 @@ int __init ip_vs_control_init(void) if (ret < 0) return ret; - LeaveFunction(2); return 0; } void ip_vs_control_cleanup(void) { - EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); - LeaveFunction(2); + /* relying on common rcu_barrier() in ip_vs_cleanup() */ } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 5e6ec32aff2b..bb7aca4601ff 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -270,3 +269,4 @@ static void __exit ip_vs_dh_cleanup(void) module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs destination hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 05b8112ffb37..77f4f637ff67 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -12,8 +12,7 @@ * get_stats()) do the per cpu summing. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/jiffies.h> @@ -21,6 +20,7 @@ #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> +#include <linux/rcupdate_wait.h> #include <net/ip_vs.h> @@ -30,9 +30,6 @@ long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W @@ -47,65 +44,79 @@ to 32-bit values for conns, packets, bps, cps and pps. 
* A lot of code is taken from net/core/gen_estimator.c - */ - -/* - * Make a summary from each cpu + KEY POINTS: + - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled + - kthreads read the cpustats to update the estimators (svcs, dests, total) + - the states of estimators can be read (get stats) or modified (zero stats) + from processes + + KTHREADS: + - estimators are added initially to est_temp_list and later kthread 0 + distributes them to one or many kthreads for estimation + - kthread contexts are created and attached to array + - the kthread tasks are started when first service is added, before that + the total stats are not estimated + - when configuration (cpulist/nice) is changed, the tasks are restarted + by work (est_reload_work) + - kthread tasks are stopped while the cpulist is empty + - the kthread context holds lists with estimators (chains) which are + processed every 2 seconds + - as estimators can be added dynamically and in bursts, we try to spread + them to multiple chains which are estimated at different time + - on start, kthread 0 enters calculation phase to determine the chain limits + and the limit of estimators per kthread + - est_add_ktid: ktid where to add new ests, can point to empty slot where + we should add kt data */ -static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, - struct ip_vs_cpu_stats __percpu *stats) -{ - int i; - bool add = false; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); - unsigned int start; - u64 conns, inpkts, outpkts, inbytes, outbytes; - - if (add) { - do { - start = u64_stats_fetch_begin(&s->syncp); - conns = s->cnt.conns; - inpkts = s->cnt.inpkts; - outpkts = s->cnt.outpkts; - inbytes = s->cnt.inbytes; - outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - sum->conns += conns; - sum->inpkts += inpkts; - sum->outpkts += outpkts; - sum->inbytes += inbytes; - sum->outbytes += outbytes; - } else { - add = true; - do { - start = u64_stats_fetch_begin(&s->syncp); - sum->conns = s->cnt.conns; - sum->inpkts = s->cnt.inpkts; - sum->outpkts = s->cnt.outpkts; - sum->inbytes = s->cnt.inbytes; - sum->outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - } - } -} +static struct lock_class_key __ipvs_est_key; +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); -static void estimation_timer(struct timer_list *t) +static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; + struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); - spin_lock(&ipvs->est_lock); - list_for_each_entry(e, &ipvs->est_list, list) { + hlist_for_each_entry_rcu(e, chain, list) { + u64 conns, inpkts, outpkts, inbytes, outbytes; + u64 kconns = 0, kinpkts = 0, koutpkts = 0; + u64 kinbytes = 0, koutbytes = 0; + unsigned int start; + int i; + + if (kthread_should_stop()) + break; + s = container_of(e, struct ip_vs_stats, est); + for_each_possible_cpu(i) { + c = per_cpu_ptr(s->cpustats, i); + do { + start = u64_stats_fetch_begin(&c->syncp); + conns = u64_stats_read(&c->cnt.conns); + inpkts = u64_stats_read(&c->cnt.inpkts); + outpkts = u64_stats_read(&c->cnt.outpkts); + inbytes = u64_stats_read(&c->cnt.inbytes); + outbytes = u64_stats_read(&c->cnt.outbytes); + } while (u64_stats_fetch_retry(&c->syncp, start)); + kconns += conns; + kinpkts += inpkts; + koutpkts += outpkts; + kinbytes += inbytes; + 
koutbytes += outbytes; + } spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + s->kstats.conns = kconns; + s->kstats.inpkts = kinpkts; + s->kstats.outpkts = koutpkts; + s->kstats.inbytes = kinbytes; + s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; @@ -130,28 +141,759 @@ static void estimation_timer(struct timer_list *t) e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&ipvs->est_lock); - mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) { - struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + int cid; + + rcu_read_lock(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + goto out; + for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { + if (kthread_should_stop()) + break; + ip_vs_chain_estimation(&td->chains[cid]); + cond_resched_rcu(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + break; + } + +out: + rcu_read_unlock(); +} + +static int ip_vs_estimation_kthread(void *data) +{ + struct ip_vs_est_kt_data *kd = data; + struct netns_ipvs *ipvs = kd->ipvs; + int row = kd->est_row; + unsigned long now; + int id = kd->id; + long gap; + + if (id > 0) { + if (!ipvs->est_chain_max) + return 0; + } else { + if (!ipvs->est_chain_max) { + ipvs->est_calc_phase = 1; + /* commit est_calc_phase before reading est_genid */ + smp_mb(); + } + + /* kthread 0 will handle the calc phase */ + if (ipvs->est_calc_phase) + ip_vs_est_calc_phase(ipvs); + } - INIT_LIST_HEAD(&est->list); + while (1) { + if (!id && !hlist_empty(&ipvs->est_temp_list)) + ip_vs_est_drain_temp_list(ipvs); + set_current_state(TASK_IDLE); + if (kthread_should_stop()) + break; + + /* before estimation, check if we should sleep */ + now = jiffies; + gap = kd->est_timer - now; + if (gap > 0) { + if (gap > IPVS_EST_TICK) { + kd->est_timer = now - IPVS_EST_TICK; + gap = IPVS_EST_TICK; + } + schedule_timeout(gap); + } else { + __set_current_state(TASK_RUNNING); + if (gap < -8 * IPVS_EST_TICK) + kd->est_timer = now; + } - spin_lock_bh(&ipvs->est_lock); - list_add(&est->list, &ipvs->est_list); - spin_unlock_bh(&ipvs->est_lock); + if (kd->tick_len[row]) + ip_vs_tick_estimation(kd, row); + + row++; + if (row >= IPVS_EST_NTICKS) + row = 0; + WRITE_ONCE(kd->est_row, row); + kd->est_timer += IPVS_EST_TICK; + } + __set_current_state(TASK_RUNNING); + + return 0; } +/* Schedule stop/start for kthread tasks */ +void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +{ + /* Ignore reloads before first service is added */ + if (!READ_ONCE(ipvs->enable)) + return; + ip_vs_est_stopped_recalc(ipvs); + /* Bump the kthread configuration genid */ + atomic_inc(&ipvs->est_genid); + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); +} + +/* Start kthread task with current configuration */ +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + unsigned long now; + int ret = 0; + long gap; + + lockdep_assert_held(&ipvs->est_mutex); + + if (kd->task) + goto out; + now = jiffies; + gap = kd->est_timer - now; + /* Sync est_timer if task is starting later */ + if (abs(gap) > 4 * IPVS_EST_TICK) + kd->est_timer = now; + kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", + ipvs->gen, kd->id); + if (IS_ERR(kd->task)) { + ret = PTR_ERR(kd->task); + kd->task = NULL; + goto out; + } + + 
set_user_nice(kd->task, sysctl_est_nice(ipvs)); + if (sysctl_est_preferred_cpulist(ipvs)) + kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); + +out: + return ret; +} + +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) +{ + if (kd->task) { + pr_info("stopping estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + kd->task = NULL; + } +} + +/* Apply parameters to kthread */ +static void ip_vs_est_set_params(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + kd->chain_max = ipvs->est_chain_max; + /* We are using single chain on RCU preemption */ + if (IPVS_EST_TICK_CHAINS == 1) + kd->chain_max *= IPVS_EST_CHAIN_FACTOR; + kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; + kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; +} + +/* Create and start estimation kthread in a free or new array slot */ +static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) +{ + struct ip_vs_est_kt_data *kd = NULL; + int id = ipvs->est_kt_count; + int ret = -ENOMEM; + void *arr = NULL; + int i; + + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) + return -EINVAL; + + mutex_lock(&ipvs->est_mutex); + + for (i = 0; i < id; i++) { + if (!ipvs->est_kt_arr[i]) + break; + } + if (i >= id) { + arr = krealloc_array(ipvs->est_kt_arr, id + 1, + sizeof(struct ip_vs_est_kt_data *), + GFP_KERNEL); + if (!arr) + goto out; + ipvs->est_kt_arr = arr; + } else { + id = i; + } + + kd = kzalloc(sizeof(*kd), GFP_KERNEL); + if (!kd) + goto out; + kd->ipvs = ipvs; + bitmap_fill(kd->avail, IPVS_EST_NTICKS); + kd->est_timer = jiffies; + kd->id = id; + ip_vs_est_set_params(ipvs, kd); + + /* Pre-allocate stats used in calc phase */ + if (!id && !kd->calc_stats) { + kd->calc_stats = ip_vs_stats_alloc(); + if (!kd->calc_stats) + goto out; + } + + /* Start kthread tasks only when services are present */ + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { + ret = ip_vs_est_kthread_start(ipvs, kd); + if (ret < 0) + goto out; + } + + if (arr) + ipvs->est_kt_count++; + ipvs->est_kt_arr[id] = kd; + kd = NULL; + /* Use most recent kthread for new ests */ + ipvs->est_add_ktid = id; + ret = 0; + +out: + mutex_unlock(&ipvs->est_mutex); + if (kd) { + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } + + return ret; +} + +/* Select ktid where to add new ests: available, unused or new slot */ +static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) +{ + int ktid, best = ipvs->est_kt_count; + struct ip_vs_est_kt_data *kd; + + for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { + kd = ipvs->est_kt_arr[ktid]; + if (kd) { + if (kd->est_count < kd->est_max_count) { + best = ktid; + break; + } + } else if (ktid < best) { + best = ktid; + } + } + ipvs->est_add_ktid = best; +} + +/* Add estimator to current kthread (est_add_ktid) */ +static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, + struct ip_vs_estimator *est) +{ + struct ip_vs_est_kt_data *kd = NULL; + struct ip_vs_est_tick_data *td; + int ktid, row, crow, cid, ret; + int delay = est->ktrow; + + BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, + "Too many chains for ktcid"); + + if (ipvs->est_add_ktid < ipvs->est_kt_count) { + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + if (kd) + goto add_est; + } + + ret = ip_vs_est_add_kthread(ipvs); + if (ret < 0) + goto out; + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + +add_est: + ktid = kd->id; + /* For small number of estimators prefer to 
use few ticks, + * otherwise try to add into the last estimated row. + * est_row and add_row point after the row we should use + */ + if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) + crow = READ_ONCE(kd->est_row); + else + crow = kd->add_row; + crow += delay; + if (crow >= IPVS_EST_NTICKS) + crow -= IPVS_EST_NTICKS; + /* Assume initial delay ? */ + if (delay >= IPVS_EST_NTICKS - 1) { + /* Preserve initial delay or decrease it if no space in tick */ + row = crow; + if (crow < IPVS_EST_NTICKS - 1) { + crow++; + row = find_last_bit(kd->avail, crow); + } + if (row >= crow) + row = find_last_bit(kd->avail, IPVS_EST_NTICKS); + } else { + /* Preserve delay or increase it if no space in tick */ + row = IPVS_EST_NTICKS; + if (crow > 0) + row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); + if (row >= IPVS_EST_NTICKS) + row = find_first_bit(kd->avail, IPVS_EST_NTICKS); + } + + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto out; + } + rcu_assign_pointer(kd->ticks[row], td); + } + + cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); + + kd->est_count++; + kd->tick_len[row]++; + if (!td->chain_len[cid]) + __set_bit(cid, td->present); + td->chain_len[cid]++; + est->ktid = ktid; + est->ktrow = row; + est->ktcid = cid; + hlist_add_head_rcu(&est->list, &td->chains[cid]); + + if (td->chain_len[cid] >= kd->chain_max) { + __set_bit(cid, td->full); + if (kd->tick_len[row] >= kd->tick_max) + __clear_bit(row, kd->avail); + } + + /* Update est_add_ktid to point to first available/empty kt slot */ + if (kd->est_count == kd->est_max_count) + ip_vs_est_update_ktid(ipvs); + + ret = 0; + +out: + return ret; +} + +/* Start estimation for stats */ +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + int ret; + + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + + est->ktid = -1; + est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ + + /* We prefer this code to be short, kthread 0 will requeue the + * estimator to available chain. If tasks are disabled, we + * will not allocate much memory, just for kt 0. + */ + ret = 0; + if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + ret = ip_vs_est_add_kthread(ipvs); + if (ret >= 0) + hlist_add_head(&est->list, &ipvs->est_temp_list); + else + INIT_HLIST_NODE(&est->list); + return ret; +} + +static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) +{ + if (kd) { + if (kd->task) { + pr_info("stop unused estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + } + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } +} + +/* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + int ktid = est->ktid; + int row = est->ktrow; + int cid = est->ktcid; + + /* Failed to add to chain ? */ + if (hlist_unhashed(&est->list)) + return; + + /* On return, estimator can be freed, dequeue it now */ + + /* In est_temp_list ? 
*/ + if (ktid < 0) { + hlist_del(&est->list); + goto end_kt0; + } + + hlist_del_rcu(&est->list); + kd = ipvs->est_kt_arr[ktid]; + td = rcu_dereference_protected(kd->ticks[row], 1); + __clear_bit(cid, td->full); + td->chain_len[cid]--; + if (!td->chain_len[cid]) + __clear_bit(cid, td->present); + kd->tick_len[row]--; + __set_bit(row, kd->avail); + if (!kd->tick_len[row]) { + RCU_INIT_POINTER(kd->ticks[row], NULL); + kfree_rcu(td, rcu_head); + } + kd->est_count--; + if (kd->est_count) { + /* This kt slot can become available just now, prefer it */ + if (ktid < ipvs->est_add_ktid) + ipvs->est_add_ktid = ktid; + return; + } + + if (ktid > 0) { + mutex_lock(&ipvs->est_mutex); + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[ktid] = NULL; + if (ktid == ipvs->est_kt_count - 1) { + ipvs->est_kt_count--; + while (ipvs->est_kt_count > 1 && + !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) + ipvs->est_kt_count--; + } + mutex_unlock(&ipvs->est_mutex); + + /* This slot is now empty, prefer another available kt slot */ + if (ktid == ipvs->est_add_ktid) + ip_vs_est_update_ktid(ipvs); + } + +end_kt0: + /* kt 0 is freed after all other kthreads and chains are empty */ + if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { + kd = ipvs->est_kt_arr[0]; + if (!kd || !kd->est_count) { + mutex_lock(&ipvs->est_mutex); + if (kd) { + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[0] = NULL; + } + ipvs->est_kt_count--; + mutex_unlock(&ipvs->est_mutex); + ipvs->est_add_ktid = 0; + } + } +} + +/* Register all ests from est_temp_list to kthreads */ +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) +{ + struct ip_vs_estimator *est; + + while (1) { + int max = 16; + + mutex_lock(&__ip_vs_mutex); + + while (max-- > 0) { + est = hlist_entry_safe(ipvs->est_temp_list.first, + struct ip_vs_estimator, list); + if (est) { + if (kthread_should_stop()) + goto unlock; + hlist_del_init(&est->list); + if (ip_vs_enqueue_estimator(ipvs, est) >= 0) + continue; + est->ktid = -1; + hlist_add_head(&est->list, + &ipvs->est_temp_list); + /* Abort, some entries will not be estimated + * until next attempt + */ + } + goto unlock; + } + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + } + +unlock: + mutex_unlock(&__ip_vs_mutex); +} + +/* Calculate limits for all kthreads */ +static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct ip_vs_est_kt_data *kd; + struct hlist_head chain; + struct ip_vs_stats *s; + int cache_factor = 4; + int i, loops, ntest; + s32 min_est = 0; + ktime_t t1, t2; + int max = 8; + int ret = 1; + s64 diff; + u64 val; + + INIT_HLIST_HEAD(&chain); + mutex_lock(&__ip_vs_mutex); + kd = ipvs->est_kt_arr[0]; + mutex_unlock(&__ip_vs_mutex); + s = kd ? 
kd->calc_stats : NULL; + if (!s) + goto out; + hlist_add_head(&s->est.list, &chain); + + loops = 1; + /* Get best result from many tests */ + for (ntest = 0; ntest < 12; ntest++) { + if (!(ntest & 3)) { + /* Wait for cpufreq frequency transition */ + wait_event_idle_timeout(wq, kthread_should_stop(), + HZ / 50); + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) + goto stop; + } + + local_bh_disable(); + rcu_read_lock(); + + /* Put stats in cache */ + ip_vs_chain_estimation(&chain); + + t1 = ktime_get(); + for (i = loops * cache_factor; i > 0; i--) + ip_vs_chain_estimation(&chain); + t2 = ktime_get(); + + rcu_read_unlock(); + local_bh_enable(); + + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) + goto stop; + cond_resched(); + + diff = ktime_to_ns(ktime_sub(t2, t1)); + if (diff <= 1 * NSEC_PER_USEC) { + /* Do more loops on low time resolution */ + loops *= 2; + continue; + } + if (diff >= NSEC_PER_SEC) + continue; + val = diff; + do_div(val, loops); + if (!min_est || val < min_est) { + min_est = val; + /* goal: 95usec per chain */ + val = 95 * NSEC_PER_USEC; + if (val >= min_est) { + do_div(val, min_est); + max = (int)val; + } else { + max = 1; + } + } + } + +out: + if (s) + hlist_del_init(&s->est.list); + *chain_max = max; + return ret; + +stop: + ret = 0; + goto out; +} + +/* Calculate the parameters and apply them in context of kt #0 + * ECP: est_calc_phase + * ECM: est_chain_max + * ECP ECM Insert Chain enable Description + * --------------------------------------------------------------------------- + * 0 0 est_temp_list 0 create kt #0 context + * 0 0 est_temp_list 0->1 service added, start kthread #0 task + * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase + * 1 0 est_temp_list 1 kt #0: determine est_chain_max, + * stop tasks, move ests to est_temp_list + * and free kd for kthreads 1..last + * 1->0 0->N kt chains 1 ests can go to kthreads + * 0 N kt chains 1 drain est_temp_list, create new kthread + * contexts, start tasks, estimate + */ +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) +{ + int genid = atomic_read(&ipvs->est_genid); + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + struct ip_vs_estimator *est; + struct ip_vs_stats *stats; + int id, row, cid, delay; + bool last, last_td; + int chain_max; + int step; + + if (!ip_vs_est_calc_limits(ipvs, &chain_max)) + return; + + mutex_lock(&__ip_vs_mutex); + + /* Stop all other tasks, so that we can immediately move the + * estimators to est_temp_list without RCU grace period + */ + mutex_lock(&ipvs->est_mutex); + for (id = 1; id < ipvs->est_kt_count; id++) { + /* netns clean up started, abort */ + if (!READ_ONCE(ipvs->enable)) + goto unlock2; + kd = ipvs->est_kt_arr[id]; + if (!kd) + continue; + ip_vs_est_kthread_stop(kd); + } + mutex_unlock(&ipvs->est_mutex); + + /* Move all estimators to est_temp_list but carefully, + * all estimators and kthread data can be released while + * we reschedule. Even for kthread 0. 
+ */ + step = 0; + + /* Order entries in est_temp_list in ascending delay, so now + * walk delay(desc), id(desc), cid(asc) + */ + delay = IPVS_EST_NTICKS; + +next_delay: + delay--; + if (delay < 0) + goto end_dequeue; + +last_kt: + /* Destroy contexts backwards */ + id = ipvs->est_kt_count; + +next_kt: + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) + goto unlock; + id--; + if (id < 0) + goto next_delay; + kd = ipvs->est_kt_arr[id]; + if (!kd) + goto next_kt; + /* kt 0 can exist with empty chains */ + if (!id && kd->est_count <= 1) + goto next_delay; + + row = kd->est_row + delay; + if (row >= IPVS_EST_NTICKS) + row -= IPVS_EST_NTICKS; + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) + goto next_kt; + + cid = 0; + +walk_chain: + if (kthread_should_stop()) + goto unlock; + step++; + if (!(step & 63)) { + /* Give chance estimators to be added (to est_temp_list) + * and deleted (releasing kthread contexts) + */ + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + mutex_lock(&__ip_vs_mutex); + + /* Current kt released ? */ + if (id >= ipvs->est_kt_count) + goto last_kt; + if (kd != ipvs->est_kt_arr[id]) + goto next_kt; + /* Current td released ? */ + if (td != rcu_dereference_protected(kd->ticks[row], 1)) + goto next_kt; + /* No fatal changes on the current kd and td */ + } + est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, + list); + if (!est) { + cid++; + if (cid >= IPVS_EST_TICK_CHAINS) + goto next_kt; + goto walk_chain; + } + /* We can cheat and increase est_count to protect kt 0 context + * from release but we prefer to keep the last estimator + */ + last = kd->est_count <= 1; + /* Do not free kt #0 data */ + if (!id && last) + goto next_delay; + last_td = kd->tick_len[row] <= 1; + stats = container_of(est, struct ip_vs_stats, est); + ip_vs_stop_estimator(ipvs, stats); + /* Tasks are stopped, move without RCU grace period */ + est->ktid = -1; + est->ktrow = row - kd->est_row; + if (est->ktrow < 0) + est->ktrow += IPVS_EST_NTICKS; + hlist_add_head(&est->list, &ipvs->est_temp_list); + /* kd freed ? */ + if (last) + goto next_kt; + /* td freed ? */ + if (last_td) + goto next_kt; + goto walk_chain; + +end_dequeue: + /* All estimators removed while calculating ? 
*/ + if (!ipvs->est_kt_count) + goto unlock; + kd = ipvs->est_kt_arr[0]; + if (!kd) + goto unlock; + kd->add_row = kd->est_row; + ipvs->est_chain_max = chain_max; + ip_vs_est_set_params(ipvs, kd); + + pr_info("using max %d ests per chain, %d per kthread\n", + kd->chain_max, kd->est_max_count); + + /* Try to keep tot_stats in kt0, enqueue it early */ + if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && + ipvs->tot_stats->s.est.ktid == -1) { + hlist_del(&ipvs->tot_stats->s.est.list); + hlist_add_head(&ipvs->tot_stats->s.est.list, + &ipvs->est_temp_list); + } + + mutex_lock(&ipvs->est_mutex); - spin_lock_bh(&ipvs->est_lock); - list_del(&est->list); - spin_unlock_bh(&ipvs->est_lock); + /* We completed the calc phase, new calc phase not requested */ + if (genid == atomic_read(&ipvs->est_genid)) + ipvs->est_calc_phase = 0; + +unlock2: + mutex_unlock(&ipvs->est_mutex); + +unlock: + mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -186,14 +928,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - INIT_LIST_HEAD(&ipvs->est_list); - spin_lock_init(&ipvs->est_lock); - timer_setup(&ipvs->est_timer, estimation_timer, 0); - mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + INIT_HLIST_HEAD(&ipvs->est_temp_list); + ipvs->est_kt_arr = NULL; + ipvs->est_max_threads = 0; + ipvs->est_calc_phase = 0; + ipvs->est_chain_max = 0; + ipvs->est_kt_count = 0; + ipvs->est_add_ktid = 0; + atomic_set(&ipvs->est_genid, 0); + atomic_set(&ipvs->est_genid_done, 0); + __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&ipvs->est_timer); + int i; + + for (i = 0; i < ipvs->est_kt_count; i++) + ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); + kfree(ipvs->est_kt_arr); + mutex_destroy(&ipvs->est_mutex); } diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index b846cc385279..d657b47c6511 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -8,8 +8,7 @@ * Kenny Mathis : added initial functionality based on weight */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -72,3 +71,4 @@ static void __exit ip_vs_fo_cleanup(void) module_init(ip_vs_fo_init); module_exit(ip_vs_fo_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted failover scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index cf925906f59b..b315c608fda4 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -16,8 +16,7 @@ * Author: Wouter Gadeyne */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/moduleparam.h> @@ -35,7 +34,7 @@ #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/ip_vs.h> @@ -53,6 +52,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. 
@@ -591,8 +591,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net) ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; - pr_info("%s: loaded support on port[%d] = %u\n", - app->name, i, ports[i]); } return 0; @@ -607,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -629,6 +627,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } @@ -637,3 +636,4 @@ static void __exit ip_vs_ftp_exit(void) module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs ftp helper"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 7ac7473e3804..e6c8ed0c92f6 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -34,8 +34,7 @@ * me to write this module. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -123,7 +122,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -293,7 +291,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -384,7 +383,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) struct ip_vs_lblc_table *tbl = svc->sched_data; /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); + timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblc_flush(svc); @@ -550,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = { static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ -563,15 +563,16 @@ static int __net_init __ip_vs_lblc_init(struct net *net) /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) - ipvs->lblc_ctl_table[0].procname = NULL; + vars_table_size = 0; } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; - ipvs->lblc_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + ipvs->lblc_ctl_table, + vars_table_size); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); @@ -628,3 +629,4 @@ static void __exit ip_vs_lblc_cleanup(void) module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 77c323c36a88..a25cf7bb6185 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt 
#include <linux/ip.h> #include <linux/module.h> @@ -294,7 +293,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -457,7 +455,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -547,7 +546,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); + timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblcr_flush(svc); @@ -736,6 +735,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ -749,14 +749,15 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) - ipvs->lblcr_ctl_table[0].procname = NULL; + vars_table_size = 0; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; - ipvs->lblcr_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + ipvs->lblcr_ctl_table, + vars_table_size); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); @@ -813,3 +814,4 @@ static void __exit ip_vs_lblcr_cleanup(void) module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 9d34d81fc6f1..38cc38c5d8bb 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -9,8 +9,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -86,3 +85,4 @@ static void __exit ip_vs_lc_cleanup(void) module_init(ip_vs_lc_init); module_exit(ip_vs_lc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index da0280cec506..f61f54004c9e 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -174,8 +173,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s, return 0; } - table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), - sizeof(unsigned long), GFP_KERNEL); + table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL); if (!table) return -ENOMEM; @@ -227,7 +225,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s, } out: - kfree(table); + bitmap_free(table); return 0; } diff --git 
a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 08adcb222986..81974f69e5bb 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -30,8 +30,7 @@ * PASV response can not be NAT-ed) but Active FTP should work */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/types.h> diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index f56862a87518..ada158c610ce 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -26,8 +26,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -136,3 +135,4 @@ static void __exit ip_vs_nq_cleanup(void) module_init(ip_vs_nq_init); module_exit(ip_vs_nq_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs never queue scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c03066fdd5ca..c5c67df80a0b 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -12,8 +12,7 @@ * active connections */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -79,3 +78,4 @@ static void __exit ip_vs_ovf_cleanup(void) module_init(ip_vs_ovf_init); module_exit(ip_vs_ovf_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs overflow connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 166c669f0763..3035079ebd99 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0ac6705a61d3..85f31d71e29a 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -185,3 +184,4 @@ static void __exit ip_vs_sip_cleanup(void) module_init(ip_vs_sip_init); module_exit(ip_vs_sip_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs sip helper"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f100da4ba3bc..fd9dbca24c85 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -8,8 +8,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -340,7 +339,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_protocol_init(void) { - char protocols[64]; + char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ @@ -348,8 +347,6 @@ int __init ip_vs_protocol_init(void) strcat(protocols, (p)->name); \ } while (0) - protocols[0] = '\0'; - protocols[2] = '\0'; #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c 
b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 89602c16f6b6..44e14acc187e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -6,8 +6,7 @@ * Wensong Zhang <wensong@linuxvirtualserver.org> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a0921adc31a9..83e452916403 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 32b028853a7c..f68a1533ee45 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -13,8 +13,7 @@ * protocol ip_vs_proto_data and is handled by netns */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/ip.h> @@ -315,7 +314,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - /* fall through */ + fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -539,8 +538,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " + "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), @@ -548,10 +547,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->daf, &cp->daddr), - ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 153d89647c87..0f0107c80dd2 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,8 +9,7 @@ * Network name space (netns) aware. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> @@ -318,7 +317,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) case CHECKSUM_NONE: skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - /* fall through */ + fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 38495c6f6c7c..4125ee561cdc 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -122,4 +121,5 @@ static void __exit ip_vs_rr_cleanup(void) module_init(ip_vs_rr_init); module_exit(ip_vs_rr_cleanup); +MODULE_DESCRIPTION("ipvs round-robin scheduler"); MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d4903723be7e..c6e421c4e299 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -12,8 +12,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 7663288e5358..245a323c84cd 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -137,3 +136,4 @@ static void __exit ip_vs_sed_cleanup(void) module_init(ip_vs_sed_init); module_exit(ip_vs_sed_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs shortest expected delay scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index c2028e412092..0e85e07e23b9 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -376,3 +375,4 @@ static void __exit ip_vs_sh_cleanup(void) module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs source hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 605e0f68f8bd..54dd1514ac45 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -32,8 +32,7 @@ * Persistence support, fwmark and time-out. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/slab.h> @@ -51,7 +50,7 @@ #include <linux/kernel.h> #include <linux/sched/signal.h> -#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ +#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */ #include <net/ip.h> #include <net/sock.h> @@ -242,9 +241,6 @@ struct ip_vs_sync_thread_data { | IPVS Sync Connection (1) | */ -#define SYNC_MESG_HEADER_LEN 4 -#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ - /* Version 0 header */ struct ip_vs_sync_mesg_v0 { __u8 nr_conns; @@ -606,7 +602,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { struct ip_vs_sync_conn_options *opt = (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); + memcpy(opt, &cp->sync_conn_opt, sizeof(*opt)); } m->nr_conns++; @@ -618,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, cp = cp->control; if (cp) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); ip_vs_sync_conn(ipvs, cp, pkts); @@ -779,7 +775,7 @@ control: if (!cp) return; if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); goto sloop; @@ -1283,12 +1279,12 @@ static void set_sock_size(struct sock *sk, int mode, int val) lock_sock(sk); if (mode) { val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, - sysctl_wmem_max); + READ_ONCE(sysctl_wmem_max)); sk->sk_sndbuf = val * 2; sk->sk_userlocks |= SOCK_SNDBUF_LOCK; } else { val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, - sysctl_rmem_max); + READ_ONCE(sysctl_rmem_max)); sk->sk_rcvbuf = val * 2; sk->sk_userlocks |= SOCK_RCVBUF_LOCK; } @@ -1300,20 +1296,14 @@ static void set_sock_size(struct sock *sk, int mode, int val) */ static void set_mcast_loop(struct sock *sk, u_char loop) { - struct inet_sock *inet = inet_sk(sk); - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; + inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 - if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - + if (READ_ONCE(sk->sk_family) == AF_INET6) { /* IPV6_MULTICAST_LOOP */ - np->mc_loop = loop ? 
1 : 0; + inet6_assign_bit(MC6_LOOP, sk, loop); } #endif - release_sock(sk); } /* @@ -1325,13 +1315,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); - inet->mc_ttl = ttl; + WRITE_ONCE(inet->mc_ttl, ttl); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_HOPS */ - np->mcast_hops = ttl; + WRITE_ONCE(np->mcast_hops, ttl); } #endif release_sock(sk); @@ -1344,13 +1334,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ lock_sock(sk); - inet->pmtudisc = val; + WRITE_ONCE(inet->pmtudisc, val); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MTU_DISCOVER */ - np->pmtudisc = val; + WRITE_ONCE(np->pmtudisc, val); } #endif release_sock(sk); @@ -1374,7 +1364,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev) struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_IF */ - np->mcast_oif = dev->ifindex; + WRITE_ONCE(np->mcast_oif, dev->ifindex); } #endif release_sock(sk); @@ -1444,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1510,8 +1500,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1551,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; @@ -1585,13 +1575,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) struct kvec iov; int len; - EnterFunction(7); iov.iov_base = (void *)buffer; iov.iov_len = length; len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - LeaveFunction(7); return len; } @@ -1617,15 +1605,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) struct kvec iov = {buffer, buflen}; int len; - EnterFunction(7); - /* Receive a packet */ - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; - LeaveFunction(7); return len; } @@ -1717,6 +1702,8 @@ static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = tinfo->ipvs; + struct sock *sk = tinfo->sock->sk; + struct udp_sock *up = udp_sk(sk); int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " @@ -1724,12 +1711,14 @@ static int sync_thread_backup(void *data) ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { - wait_event_interruptible(*sk_sleep(tinfo->sock->sk), - 
!skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); + wait_event_interruptible(*sk_sleep(sk), + !skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue) || + kthread_should_stop()); /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue)) { len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->bcfg.sync_maxlen); if (len <= 0) { diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c new file mode 100644 index 000000000000..dbb7f5fd4688 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* IPVS: Power of Twos Choice Scheduling module + * + * Authors: Darby Payne <darby.payne@applovin.com> + */ + +#define pr_fmt(fmt) "IPVS: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/random.h> + +#include <net/ip_vs.h> + +/* Power of Twos Choice scheduling, algorithm originally described by + * Michael Mitzenmacher. + * + * Randomly picks two destinations and picks the one with the least + * amount of connections + * + * The algorithm calculates a few variables + * - total_weight = sum of all weights + * - rweight1 = random number between [0,total_weight] + * - rweight2 = random number between [0,total_weight] + * + * For each destination + * decrement rweight1 and rweight2 by the destination weight + * pick choice1 when rweight1 is <= 0 + * pick choice2 when rweight2 is <= 0 + * + * Return choice2 if choice2 has less connections than choice 1 normalized + * by weight + * + * References + * ---------- + * + * [Mitzenmacher 2016] + * The Power of Two Random Choices: A Survey of Techniques and Results + * Michael Mitzenmacher, Andrea W. 
Richa y, Ramesh Sitaraman + * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf + * + */ +static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, + const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL; + int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0; + int overhead2, total_weight = 0, weight; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* Generate a random weight between [0,sum of all weights) */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + total_weight += weight; + choice1 = dest; + } + } + } + + if (!choice1) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* Add 1 to total_weight so that the random weights are inclusive + * from 0 to total_weight + */ + total_weight += 1; + rweight1 = get_random_u32_below(total_weight); + rweight2 = get_random_u32_below(total_weight); + + /* Pick two weighted servers */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + weight = atomic_read(&dest->weight); + if (weight <= 0) + continue; + + rweight1 -= weight; + rweight2 -= weight; + + if (rweight1 <= 0 && weight1 == -1) { + choice1 = dest; + weight1 = weight; + overhead1 = ip_vs_dest_conn_overhead(dest); + } + + if (rweight2 <= 0 && weight2 == -1) { + choice2 = dest; + weight2 = weight; + overhead2 = ip_vs_dest_conn_overhead(dest); + } + + if (weight1 != -1 && weight2 != -1) + goto nextstage; + } + +nextstage: + if (choice2 && (weight2 * overhead1) > (weight1 * overhead2)) + choice1 = choice2; + + IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(choice1->af, &choice1->addr), + ntohs(choice1->port), atomic_read(&choice1->activeconns), + refcount_read(&choice1->refcnt), + atomic_read(&choice1->weight)); + + return choice1; +} + +static struct ip_vs_scheduler ip_vs_twos_scheduler = { + .name = "twos", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list), + .schedule = ip_vs_twos_schedule, +}; + +static int __init ip_vs_twos_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_twos_scheduler); +} + +static void __exit ip_vs_twos_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_twos_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_twos_init); +module_exit(ip_vs_twos_cleanup); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs power of twos choice scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 09f584b564a0..9da445ca09a1 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -109,3 +108,4 @@ static void __exit ip_vs_wlc_cleanup(void) module_init(ip_vs_wlc_init); module_exit(ip_vs_wlc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 1bc7a0789d85..99f09cbf2d9b 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -13,8 +13,7 @@ * with weight 0 when all weights are zero */ 
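The two-choices selection added by ip_vs_twos.c above is easy to exercise outside the kernel. The following user-space sketch walks the same pick-two-then-compare logic; the server pool, the fixed per-server overhead values and the use of rand() instead of get_random_u32_below() are illustrative assumptions, not kernel code, and the IP_VS_DEST_F_OVERLOAD filtering is omitted.

/* User-space sketch of the weighted two-choices pick performed by
 * ip_vs_twos_schedule(); pool, weights and "overhead" are invented.
 */
#include <stdio.h>
#include <stdlib.h>

struct dest { const char *name; int weight; int overhead; };

static const struct dest *twos_pick(const struct dest *d, int n)
{
	const struct dest *c1 = NULL, *c2 = NULL;
	int total = 0, w1 = -1, w2 = -1, r1, r2, i;

	for (i = 0; i < n; i++) {
		if (d[i].weight > 0) {
			total += d[i].weight;
			c1 = &d[i];
		}
	}
	if (!c1)
		return NULL;

	total += 1;			/* random range becomes [0, sum] */
	r1 = rand() % total;
	r2 = rand() % total;

	for (i = 0; i < n; i++) {
		if (d[i].weight <= 0)
			continue;
		r1 -= d[i].weight;
		r2 -= d[i].weight;
		if (r1 <= 0 && w1 == -1) { c1 = &d[i]; w1 = d[i].weight; }
		if (r2 <= 0 && w2 == -1) { c2 = &d[i]; w2 = d[i].weight; }
		if (w1 != -1 && w2 != -1)
			break;
	}

	/* keep the second choice when its weight-normalised load is lower */
	if (c2 && w2 * c1->overhead > w1 * c2->overhead)
		c1 = c2;
	return c1;
}

int main(void)
{
	const struct dest pool[] = {
		{ "rs1", 3, 40 }, { "rs2", 1, 5 }, { "rs3", 2, 25 },
	};
	int i;

	for (i = 0; i < 5; i++)
		printf("picked %s\n", twos_pick(pool, 3)->name);
	return 0;
}

In the kernel the overhead is recomputed at pick time from the live connection counters via ip_vs_dest_conn_overhead(), so the comparison tracks current load rather than a static figure.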
-#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -263,3 +262,4 @@ static void __exit ip_vs_wrr_cleanup(void) module_init(ip_vs_wrr_init); module_exit(ip_vs_wrr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted round-robin scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b00866d777fe..3162ce3c2640 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -21,8 +21,7 @@ * - the only place where we can see skb->sk != NULL */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/slab.h> @@ -97,7 +96,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest) if (!dest_dst) return NULL; dst = dest_dst->dst_cache; - if (dst->obsolete && + if (READ_ONCE(dst->obsolete) && dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; return dest_dst; @@ -119,13 +118,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +133,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? */ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); - loop = true; + flowi4_update_output(&fl4, 0, daddr, fl4.saddr); + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -180,7 +172,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (addr_type & IPV6_ADDR_LOOPBACK); old_rt_is_local = __ip_vs_is_local_route6( - (struct rt6_info *)skb_dst(skb)); + dst_rt6_info(skb_dst(skb))); } else #endif { @@ -271,7 +263,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } @@ -286,7 +278,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, { if (ip_hdr(skb)->ttl <= 1) { /* Tell the sender its packet died... 
*/ - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); return false; } @@ -318,7 +310,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); @@ -339,24 +331,20 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__rcuref)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; @@ -390,10 +378,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { @@ -481,7 +469,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rt6_info *) dest_dst->dst_cache; + rt = dst_rt6_info(dest_dst->dst_cache); else { u32 cookie; @@ -501,13 +489,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__rcuref)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.in6; @@ -517,7 +505,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, rt_mode); if (!dst) goto err_unreach; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); } local = __ip_vs_is_local_route6(rt); @@ -553,10 +541,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { @@ -609,6 +597,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset_ct(skb); skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); } return ret; } @@ -649,6 +639,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, 
struct sk_buff *skb, if (!local) { skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -669,6 +661,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -700,8 +694,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct iphdr *iph = ip_hdr(skb); - EnterFunction(10); - if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -713,12 +705,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ -729,8 +719,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { struct ipv6hdr *iph = ipv6_hdr(skb); - EnterFunction(10); - if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) @@ -741,12 +729,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } #endif @@ -762,8 +748,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int local, rc, was_input; - EnterFunction(10); - /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; @@ -833,12 +817,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - LeaveFunction(10); return rc; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ -850,8 +832,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ int local, rc; - EnterFunction(10); - /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; @@ -870,7 +850,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -921,11 +901,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - LeaveFunction(10); return rc; tx_error: - LeaveFunction(10); kfree_skb(skb); return NF_STOLEN; } @@ -988,7 +966,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ @@ -1092,11 +1070,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb, { __be16 proto = *next_protocol == IPPROTO_IPIP ? 
htons(ETH_P_IP) : htons(ETH_P_IPV6); - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t hdrlen; if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); hdrlen = gre_calc_hlen(tflags); gre_build_header(skb, hdrlen, tflags, proto, 0, 0); @@ -1143,8 +1121,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, int tun_type, gso_type; int tun_flags; - EnterFunction(10); - local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1177,11 +1153,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1193,7 +1169,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, &next_protocol, NULL, &dsfield, &ttl, dfp); if (IS_ERR(skb)) - goto tx_error; + return NF_STOLEN; gso_type = __tun_gso_type_mask(AF_INET, cp->af); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { @@ -1219,6 +1195,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1261,14 +1238,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, else if (ret == NF_DROP) kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; tx_error: - if (!IS_ERR(skb)) - kfree_skb(skb); - LeaveFunction(10); + kfree_skb(skb); return NF_STOLEN; } @@ -1292,8 +1265,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, int tun_type, gso_type; int tun_flags; - EnterFunction(10); - local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, @@ -1305,7 +1276,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); tdev = rt->dst.dev; /* @@ -1327,11 +1298,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1341,7 +1312,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &next_protocol, &payload_len, &dsfield, &ttl, NULL); if (IS_ERR(skb)) - goto tx_error; + return NF_STOLEN; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { @@ -1367,6 +1338,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1408,14 +1380,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, else if (ret == NF_DROP) 
kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; tx_error: - if (!IS_ERR(skb)) - kfree_skb(skb); - LeaveFunction(10); + kfree_skb(skb); return NF_STOLEN; } #endif @@ -1431,8 +1399,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { int local; - EnterFunction(10); - local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1449,12 +1415,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ -1465,8 +1429,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { int local; - EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, @@ -1483,12 +1445,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } #endif @@ -1508,8 +1468,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, int local; int rt_mode, was_input; - EnterFunction(10); - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ @@ -1520,7 +1478,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - goto out; + return rc; } /* @@ -1576,14 +1534,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; - rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - goto out; + return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); tx_error: kfree_skb(skb); rc = NF_STOLEN; - out: - LeaveFunction(10); return rc; } @@ -1598,8 +1553,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, int local; int rt_mode; - EnterFunction(10); - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ @@ -1610,7 +1563,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - goto out; + return rc; } /* @@ -1625,7 +1578,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1665,14 +1618,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; - rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - goto out; + return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); tx_error: kfree_skb(skb); rc = NF_STOLEN; -out: - LeaveFunction(10); return rc; } #endif diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c new file mode 100644 index 000000000000..46e667a50d98 --- /dev/null +++ b/net/netfilter/nf_bpf_link.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_bpf_link.h> +#include 
<uapi/linux/netfilter_ipv4.h> + +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, + const struct nf_hook_state *s) +{ + const struct bpf_prog *prog = bpf_prog; + struct bpf_nf_ctx ctx = { + .state = s, + .skb = skb, + }; + + return bpf_prog_run_pin_on_cpu(prog, &ctx); +} + +struct bpf_nf_link { + struct bpf_link link; + struct nf_hook_ops hook_ops; + netns_tracker ns_tracker; + struct net *net; + u32 dead; + const struct nf_defrag_hook *defrag_hook; +}; + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static const struct nf_defrag_hook * +get_proto_defrag_hook(struct bpf_nf_link *link, + const struct nf_defrag_hook __rcu **ptr_global_hook, + const char *mod) +{ + const struct nf_defrag_hook *hook; + int err; + + /* RCU protects us from races against module unloading */ + rcu_read_lock(); + hook = rcu_dereference(*ptr_global_hook); + if (!hook) { + rcu_read_unlock(); + err = request_module("%s", mod); + if (err) + return ERR_PTR(err < 0 ? err : -EINVAL); + + rcu_read_lock(); + hook = rcu_dereference(*ptr_global_hook); + } + + if (hook && try_module_get(hook->owner)) { + /* Once we have a refcnt on the module, we no longer need RCU */ + hook = rcu_pointer_handoff(hook); + } else { + WARN_ONCE(!hook, "%s has bad registration", mod); + hook = ERR_PTR(-ENOENT); + } + rcu_read_unlock(); + + if (!IS_ERR(hook)) { + err = hook->enable(link->net); + if (err) { + module_put(hook->owner); + hook = ERR_PTR(err); + } + } + + return hook; +} +#endif + +static int bpf_nf_enable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook __maybe_unused *hook; + + switch (link->hook_ops.pf) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + case NFPROTO_IPV4: + hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + case NFPROTO_IPV6: + hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif + default: + return -EAFNOSUPPORT; + } +} + +static void bpf_nf_disable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook *hook = link->defrag_hook; + + if (!hook) + return; + hook->disable(link->net); + module_put(hook->owner); +} + +static void bpf_nf_link_release(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + if (nf_link->dead) + return; + + /* do not double release in case .detach was already called */ + if (!cmpxchg(&nf_link->dead, 0, 1)) { + nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); + bpf_nf_disable_defrag(nf_link); + put_net_track(nf_link->net, &nf_link->ns_tracker); + } +} + +static void bpf_nf_link_dealloc(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + kfree(nf_link); +} + +static int bpf_nf_link_detach(struct bpf_link *link) +{ + bpf_nf_link_release(link); + return 0; +} + +static void bpf_nf_link_show_info(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", + nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, + nf_link->hook_ops.priority); +} + +static int bpf_nf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_nf_link *nf_link = container_of(link, struct 
bpf_nf_link, link); + const struct nf_defrag_hook *hook = nf_link->defrag_hook; + + info->netfilter.pf = nf_link->hook_ops.pf; + info->netfilter.hooknum = nf_link->hook_ops.hooknum; + info->netfilter.priority = nf_link->hook_ops.priority; + info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0; + + return 0; +} + +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + return -EOPNOTSUPP; +} + +static const struct bpf_link_ops bpf_nf_link_lops = { + .release = bpf_nf_link_release, + .dealloc = bpf_nf_link_dealloc, + .detach = bpf_nf_link_detach, + .show_fdinfo = bpf_nf_link_show_info, + .fill_link_info = bpf_nf_link_fill_link_info, + .update_prog = bpf_nf_link_update, +}; + +static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) +{ + int prio; + + switch (attr->link_create.netfilter.pf) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS) + return -EPROTO; + break; + default: + return -EAFNOSUPPORT; + } + + if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) + return -EOPNOTSUPP; + + /* make sure conntrack confirm is always last */ + prio = attr->link_create.netfilter.priority; + if (prio == NF_IP_PRI_FIRST) + return -ERANGE; /* sabotage_in and other warts */ + else if (prio == NF_IP_PRI_LAST) + return -ERANGE; /* e.g. conntrack confirm */ + else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && + prio <= NF_IP_PRI_CONNTRACK_DEFRAG) + return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ + + return 0; +} + +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_nf_link *link; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + err = bpf_nf_check_pf_and_hooks(attr); + if (err) + return err; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog, + attr->link_create.attach_type); + + link->hook_ops.hook = nf_hook_run_bpf; + link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; + link->hook_ops.priv = prog; + + link->hook_ops.pf = attr->link_create.netfilter.pf; + link->hook_ops.priority = attr->link_create.netfilter.priority; + link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; + + link->net = net; + link->dead = false; + link->defrag_hook = NULL; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { + err = bpf_nf_enable_defrag(link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + + err = nf_register_net_hook(net, &link->hook_ops); + if (err) { + bpf_nf_disable_defrag(link); + bpf_link_cleanup(&link_primer); + return err; + } + + get_net_track(net, &link->ns_tracker, GFP_KERNEL); + + return bpf_link_settle(&link_primer); +} + +const struct bpf_prog_ops netfilter_prog_ops = { + .test_run = bpf_prog_test_run_nf, +}; + +static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name) +{ + struct btf *btf; + s32 type_id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf)) + return false; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (WARN_ON_ONCE(type_id < 0)) + return false; + + info->btf = btf; + info->btf_id = type_id; + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + return true; +} + +static 
bool nf_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_nf_ctx)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) + return false; + + switch (off) { + case bpf_ctx_range(struct bpf_nf_ctx, skb): + if (size != sizeof_field(struct bpf_nf_ctx, skb)) + return false; + + return nf_ptr_to_btf_id(info, "sk_buff"); + case bpf_ctx_range(struct bpf_nf_ctx, state): + if (size != sizeof_field(struct bpf_nf_ctx, state)) + return false; + + return nf_ptr_to_btf_id(info, "nf_hook_state"); + default: + return false; + } + + return false; +} + +static const struct bpf_func_proto * +bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +const struct bpf_verifier_ops netfilter_verifier_ops = { + .is_valid_access = nf_is_valid_access, + .get_func_proto = bpf_nf_func_proto, +}; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 82f36beb2e76..f1be4dd5cf85 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -122,15 +122,68 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; + + if (ct && nf_ct_is_confirmed(ct)) { + if (refcounted) + nf_ct_put(ct); + return -EEXIST; + } + + if ((u32)jiffies == list->last_gc) + goto add_new_node; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { @@ -141,10 +194,10 @@ static int __nf_conncount_add(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -153,7 +206,7 @@ static int __nf_conncount_add(struct net *net, 
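nf_bpf_link.c above adds the BPF netfilter link type and only lets programs read the two context pointers checked in nf_is_valid_access(). A minimal program of that type could look like the sketch below; the SEC("netfilter") convention and the vmlinux.h include follow the in-tree BPF selftests, and the NF_ACCEPT define stands in for the uapi value, which is not carried in vmlinux.h.

// SPDX-License-Identifier: GPL-2.0
/* Minimal BPF_PROG_TYPE_NETFILTER program: reads the two context fields
 * allowed by nf_is_valid_access() and accepts every packet.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

#define NF_ACCEPT 1	/* from uapi/linux/netfilter.h */

SEC("netfilter")
int nf_trace_and_accept(struct bpf_nf_ctx *ctx)
{
	const struct nf_hook_state *state = ctx->state;
	const struct sk_buff *skb = ctx->skb;

	bpf_printk("hook %u pf %u len %u", state->hook, state->pf, skb->len);
	return NF_ACCEPT;
}

char _license[] SEC("license") = "GPL";

Attachment goes through the new link type with pf, hooknum and priority supplied in link_create.netfilter; bpf_nf_check_pf_and_hooks() rejects NF_IP_PRI_FIRST and NF_IP_PRI_LAST, and requires a priority after NF_IP_PRI_CONNTRACK_DEFRAG when BPF_F_NETFILTER_IP_DEFRAG is requested.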
found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -162,7 +215,7 @@ static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -177,6 +230,7 @@ static int __nf_conncount_add(struct net *net, nf_ct_put(found_ct); } +add_new_node: if (WARN_ON_ONCE(list->count > INT_MAX)) return -EOVERFLOW; @@ -184,42 +238,48 @@ static int __nf_conncount_add(struct net *net, if (conn == NULL) return -ENOMEM; - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; + list->last_gc = (u32)jiffies; + +out_put: + if (refcounted) + nf_ct_put(ct); return 0; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; + list->last_gc = (u32)jiffies; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. 
*/ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -227,8 +287,8 @@ bool nf_conncount_gc_list(struct net *net, unsigned int collected = 0; bool ret = false; - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) + /* don't bother if we just did GC */ + if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { @@ -258,7 +318,22 @@ bool nf_conncount_gc_list(struct net *net, if (!list->count) ret = true; - spin_unlock(&list->list_lock); + list->last_gc = (u32)jiffies; + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } @@ -298,20 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - u8 keylen = data->keylen; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -322,7 +399,7 @@ restart: rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { @@ -330,8 +407,8 @@ restart: } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -354,28 +431,35 @@ restart: goto restart; } - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } - conn->tuple = *tuple; - conn->zone = *zone; - memcpy(rbconn->key, key, sizeof(u32) * keylen); + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * 
data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + + if (refcounted) + nf_ct_put(ct); + } out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; @@ -383,16 +467,15 @@ out_unlock: static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int hash; - u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; @@ -403,7 +486,7 @@ count_tree(struct net *net, rbconn = rb_entry(parent, struct nf_conncount_rb, node); - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { @@ -411,7 +494,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -426,19 +509,23 @@ count_tree(struct net *net, } /* same source network -> be counted! */ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -500,24 +587,24 @@ next: } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. 
*/ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen) +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; - int ret, i; + int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || @@ -530,12 +617,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family if (!data) return ERR_PTR(-ENOMEM); - ret = nf_ct_netns_get(net, family); - if (ret < 0) { - kfree(data); - return ERR_PTR(ret); - } - for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; @@ -572,13 +653,11 @@ static void destroy_tree(struct rb_root *r) } } -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data) +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); - nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); @@ -594,15 +673,11 @@ static int __init nf_conncount_modinit(void) for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); - conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", - sizeof(struct nf_conncount_tuple), - 0, 0, NULL); + conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; - conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", - sizeof(struct nf_conncount_rb), - 0, 0, NULL); + conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 2ccda8ace796..385a5f458aba 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Accouting handling for netfilter. */ +/* Accounting handling for netfilter. 
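With the conversion above, conncount callers hand over the skb and layer-3 protocol number and let nf_conncount resolve the conntrack tuple and zone internally. A hedged sketch of such a caller follows; the helper name, the /24 key derivation and the limit value are invented for illustration, and the nf_conncount_data is assumed to have been created with nf_conncount_init() and a 4-byte key.

#include <linux/ip.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conncount.h>

/* Hypothetical per-source-network limit check on top of the skb-based API;
 * a return value of 0 from nf_conncount_count_skb() keeps the old "hotdrop"
 * meaning, i.e. an internal failure rather than a genuinely empty list.
 */
static bool my_connlimit_exceeded(struct net *net, const struct sk_buff *skb,
				  struct nf_conncount_data *data,
				  unsigned int limit)
{
	const struct iphdr *iph = ip_hdr(skb);
	unsigned int count;
	u32 key[1];

	key[0] = (__force u32)(iph->saddr & htonl(0xffffff00)); /* /24 source net */
	count = nf_conncount_count_skb(net, skb, NFPROTO_IPV4, data, key);

	return count == 0 || count > limit;
}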
*/ /* * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl> @@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); -static const struct nf_ct_ext_type acct_extend = { - .len = sizeof(struct nf_conn_acct), - .align = __alignof__(struct nf_conn_acct), - .id = NF_CT_EXT_ACCT, -}; - void nf_conntrack_acct_pernet_init(struct net *net) { net->ct.sysctl_acct = nf_ct_acct; } - -int nf_conntrack_acct_init(void) -{ - int ret = nf_ct_extend_register(&acct_extend); - if (ret < 0) - pr_err("Unable to register extension\n"); - return ret; -} - -void nf_conntrack_acct_fini(void) -{ - nf_ct_extend_unregister(&acct_extend); -} diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index d011d2eb0848..7be4c35e4795 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, skb, master_timeout * HZ); + nf_ct_refresh(ct, master_timeout * HZ); /* No data? */ dataoff = protoff + sizeof(struct udphdr); diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c new file mode 100644 index 000000000000..4a136fc3a9c0 --- /dev/null +++ b/net/netfilter/nf_conntrack_bpf.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable Conntrack Helpers for XDP and TC-BPF hook + * + * These are called from the XDP and SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/btf_ids.h> +#include <linux/net_namespace.h> +#include <net/xdp.h> +#include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netfilter/nf_conntrack_core.h> + +/* bpf_ct_opts - Options for CT lookup helpers + * + * Members: + * @netns_id - Specify the network namespace for lookup + * Values: + * BPF_F_CURRENT_NETNS (-1) + * Use namespace associated with ctx (xdp_md, __sk_buff) + * [0, S32_MAX] + * Network Namespace ID + * @error - Out parameter, set for any errors encountered + * Values: + * -EINVAL - Passed NULL for bpf_tuple pointer + * -EINVAL - opts->reserved is not 0 + * -EINVAL - netns_id is less than -1 + * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 + * -EINVAL - opts->ct_zone_id set when + opts__sz isn't NF_BPF_CT_OPTS_SZ (16) + * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP + * -ENONET - No network namespace found for netns_id + * -ENOENT - Conntrack lookup could not find entry for tuple + * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4) + * or sizeof(tuple->ipv6) + * @l4proto - Layer 4 protocol + * Values: + * IPPROTO_TCP, IPPROTO_UDP + * @dir: - connection tracking tuple direction. + * @ct_zone_id - connection tracking zone id. + * @ct_zone_dir - connection tracking zone direction. 
+ * @reserved - Reserved member, will be reused for more options in future + * Values: + * 0 + */ +struct bpf_ct_opts { + s32 netns_id; + s32 error; + u8 l4proto; + u8 dir; + u16 ct_zone_id; + u8 ct_zone_dir; + u8 reserved[3]; +}; + +enum { + NF_BPF_CT_OPTS_SZ = 16, +}; + +static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, + u32 tuple_len, u8 protonum, u8 dir, + struct nf_conntrack_tuple *tuple) +{ + union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3; + union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3; + union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u + : &tuple->src.u; + union nf_conntrack_man_proto *dport = dir ? &tuple->src.u + : (void *)&tuple->dst.u; + + if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP)) + return -EPROTO; + + memset(tuple, 0, sizeof(*tuple)); + + switch (tuple_len) { + case sizeof(bpf_tuple->ipv4): + tuple->src.l3num = AF_INET; + src->ip = bpf_tuple->ipv4.saddr; + sport->tcp.port = bpf_tuple->ipv4.sport; + dst->ip = bpf_tuple->ipv4.daddr; + dport->tcp.port = bpf_tuple->ipv4.dport; + break; + case sizeof(bpf_tuple->ipv6): + tuple->src.l3num = AF_INET6; + memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr)); + sport->tcp.port = bpf_tuple->ipv6.sport; + memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr)); + dport->tcp.port = bpf_tuple->ipv6.dport; + break; + default: + return -EAFNOSUPPORT; + } + tuple->dst.protonum = protonum; + tuple->dst.dir = dir; + + return 0; +} + +static struct nf_conn * +__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, + u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len, + u32 timeout) +{ + struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_zone ct_zone; + struct nf_conn *ct; + int err; + + if (!opts || !bpf_tuple) + return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } + + if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) + return ERR_PTR(-EINVAL); + + err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, + IP_CT_DIR_ORIGINAL, &otuple); + if (err < 0) + return ERR_PTR(err); + + err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, + IP_CT_DIR_REPLY, &rtuple); + if (err < 0) + return ERR_PTR(err); + + if (opts->netns_id >= 0) { + net = get_net_ns_by_id(net, opts->netns_id); + if (unlikely(!net)) + return ERR_PTR(-ENONET); + } + + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, + GFP_ATOMIC); + if (IS_ERR(ct)) + goto out; + + memset(&ct->proto, 0, sizeof(ct->proto)); + __nf_ct_set_timeout(ct, timeout * HZ); + +out: + if (opts->netns_id >= 0) + put_net(net); + + return ct; +} + +static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, + struct bpf_sock_tuple *bpf_tuple, + u32 tuple_len, struct bpf_ct_opts *opts, + u32 opts_len) +{ + struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone ct_zone; + struct nf_conn *ct; + int err; + + if (!opts || !bpf_tuple) + return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 
12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } + if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) + return ERR_PTR(-EPROTO); + if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) + return ERR_PTR(-EINVAL); + + err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto, + IP_CT_DIR_ORIGINAL, &tuple); + if (err < 0) + return ERR_PTR(err); + + if (opts->netns_id >= 0) { + net = get_net_ns_by_id(net, opts->netns_id); + if (unlikely(!net)) + return ERR_PTR(-ENONET); + } + + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + hash = nf_conntrack_find_get(net, &ct_zone, &tuple); + if (opts->netns_id >= 0) + put_net(net); + if (!hash) + return ERR_PTR(-ENOENT); + + ct = nf_ct_tuplehash_to_ctrack(hash); + opts->dir = NF_CT_DIRECTION(hash); + + return ct; +} + +BTF_ID_LIST(btf_nf_conn_ids) +BTF_ID(struct, nf_conn) +BTF_ID(struct, nf_conn___init) + +/* Check writes into `struct nf_conn` */ +static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *ncit, *nct, *t; + size_t end; + + ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]); + nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t != nct && t != ncit) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + /* `struct nf_conn` and `struct nf_conn___init` have the same layout + * so we are safe to simply merge offset checks here + */ + switch (off) { +#if defined(CONFIG_NF_CONNTRACK_MARK) + case offsetof(struct nf_conn, mark): + end = offsetofend(struct nf_conn, mark); + break; +#endif + default: + bpf_log(log, "no write support to nf_conn at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", + off, size, end); + return -EACCES; + } + + return 0; +} + +__bpf_kfunc_start_defs(); + +/* bpf_xdp_ct_alloc - Allocate a new CT entry + * + * Parameters: + * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program + * Cannot be NULL + * @bpf_tuple - Pointer to memory representing the tuple to look up + * Cannot be NULL + * @tuple__sz - Length of the tuple structure + * Must be one of sizeof(bpf_tuple->ipv4) or + * sizeof(bpf_tuple->ipv6) + * @opts - Additional options for allocation (documented above) + * Cannot be NULL + * @opts__sz - Length of the bpf_ct_opts structure + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 + */ +__bpf_kfunc struct nf_conn___init * +bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, + u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) +{ + struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx; + struct nf_conn *nfct; + + nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz, + opts, opts__sz, 10); + if (IS_ERR(nfct)) { + if (opts) + opts->error = PTR_ERR(nfct); + return NULL; + } + + return (struct nf_conn___init *)nfct; +} + +/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a + * reference to it + * + * Parameters: + * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program + * Cannot be NULL + * @bpf_tuple - 
Pointer to memory representing the tuple to look up + * Cannot be NULL + * @tuple__sz - Length of the tuple structure + * Must be one of sizeof(bpf_tuple->ipv4) or + * sizeof(bpf_tuple->ipv6) + * @opts - Additional options for lookup (documented above) + * Cannot be NULL + * @opts__sz - Length of the bpf_ct_opts structure + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 + */ +__bpf_kfunc struct nf_conn * +bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, + u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) +{ + struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx; + struct net *caller_net; + struct nf_conn *nfct; + + caller_net = dev_net(ctx->rxq->dev); + nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); + if (IS_ERR(nfct)) { + if (opts) + opts->error = PTR_ERR(nfct); + return NULL; + } + return nfct; +} + +/* bpf_skb_ct_alloc - Allocate a new CT entry + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @bpf_tuple - Pointer to memory representing the tuple to look up + * Cannot be NULL + * @tuple__sz - Length of the tuple structure + * Must be one of sizeof(bpf_tuple->ipv4) or + * sizeof(bpf_tuple->ipv6) + * @opts - Additional options for allocation (documented above) + * Cannot be NULL + * @opts__sz - Length of the bpf_ct_opts structure + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 + */ +__bpf_kfunc struct nf_conn___init * +bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, + u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct nf_conn *nfct; + struct net *net; + + net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); + nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10); + if (IS_ERR(nfct)) { + if (opts) + opts->error = PTR_ERR(nfct); + return NULL; + } + + return (struct nf_conn___init *)nfct; +} + +/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a + * reference to it + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @bpf_tuple - Pointer to memory representing the tuple to look up + * Cannot be NULL + * @tuple__sz - Length of the tuple structure + * Must be one of sizeof(bpf_tuple->ipv4) or + * sizeof(bpf_tuple->ipv6) + * @opts - Additional options for lookup (documented above) + * Cannot be NULL + * @opts__sz - Length of the bpf_ct_opts structure + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 + */ +__bpf_kfunc struct nf_conn * +bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, + u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct net *caller_net; + struct nf_conn *nfct; + + caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); + nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); + if (IS_ERR(nfct)) { + if (opts) + opts->error = PTR_ERR(nfct); + return NULL; + } + return nfct; +} + +/* bpf_ct_insert_entry - Add the provided entry into a CT map + * + * This must be invoked for referenced PTR_TO_BTF_ID. + * + * @nfct - Pointer to referenced nf_conn___init object, obtained + * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc. 
+ */ +__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) +{ + struct nf_conn *nfct = (struct nf_conn *)nfct_i; + int err; + + if (!nf_ct_is_confirmed(nfct)) + nfct->timeout += nfct_time_stamp; + nfct->status |= IPS_CONFIRMED; + err = nf_conntrack_hash_check_insert(nfct); + if (err < 0) { + nf_conntrack_free(nfct); + return NULL; + } + return nfct; +} + +/* bpf_ct_release - Release acquired nf_conn object + * + * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects + * the program if any references remain in the program in all of the explored + * states. + * + * Parameters: + * @nf_conn - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_lookup or bpf_skb_ct_lookup. + */ +__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct) +{ + nf_ct_put(nfct); +} + +/* bpf_ct_set_timeout - Set timeout of allocated nf_conn + * + * Sets the default timeout of newly allocated nf_conn before insertion. + * This helper must be invoked for refcounted pointer to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @timeout - Timeout in msecs. + */ +__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) +{ + __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout)); +} + +/* bpf_ct_change_timeout - Change timeout of inserted nf_conn + * + * Change timeout associated of the inserted or looked up nf_conn. + * This helper must be invoked for refcounted pointer to nf_conn. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup. + * @timeout - New timeout in msecs. + */ +__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) +{ + return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout)); +} + +/* bpf_ct_set_status - Set status field of allocated nf_conn + * + * Set the status field of the newly allocated nf_conn before insertion. + * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @status - New status value. + */ +__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) +{ + return nf_ct_change_status_common((struct nf_conn *)nfct, status); +} + +/* bpf_ct_change_status - Change status of inserted nf_conn + * + * Change the status field of the provided connection tracking entry. + * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup. + * @status - New status value. 
+ */ +__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) +{ + return nf_ct_change_status_common(nfct, status); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(nf_ct_kfunc_set) +BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) +BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(nf_ct_kfunc_set) + +static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_ct_kfunc_set, +}; + +int register_nf_conntrack_bpf(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); + if (!ret) { + mutex_lock(&nf_conn_btf_access_lock); + nfct_btf_struct_access = _nf_conntrack_btf_struct_access; + mutex_unlock(&nf_conn_btf_access_lock); + } + + return ret; +} + +void cleanup_nf_conntrack_bpf(void) +{ + mutex_lock(&nf_conn_btf_access_lock); + nfct_btf_struct_access = NULL; + mutex_unlock(&nf_conn_btf_access_lock); +} diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 1ba6becc3079..a7552a46d6ac 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -20,6 +20,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, enum ip_conntrack_info ctinfo, unsigned int timeout) { + const struct nf_conntrack_helper *helper; struct nf_conntrack_expect *exp; struct iphdr *iph = ip_hdr(skb); struct rtable *rt = skb_rtable(skb); @@ -58,7 +59,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, goto out; exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + helper = rcu_dereference(help->helper); + if (helper) + exp->tuple.src.u.udp.port = helper->tuple.src.u.udp.port; exp->mask.src.u3.ip = mask; exp->mask.src.u.udp.port = htons(0xFFFF); @@ -71,10 +75,11 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); - nf_ct_refresh(ct, skb, timeout * HZ); + nf_ct_refresh(ct, timeout * HZ); out: return NF_ACCEPT; } EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcast connection tracking helper"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 79cd9dde457b..0b95f226f211 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -21,7 +21,6 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> @@ -35,10 +34,10 @@ #include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_core.h> 
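/*
 * [Editorial sketch, not part of this patch] A minimal example of how the
 * conntrack kfuncs added above in nf_conntrack_bpf.c (bpf_xdp_ct_lookup,
 * bpf_ct_release, and the 16-byte struct bpf_ct_opts) might be called from
 * an XDP program. It assumes the usual vmlinux.h/libbpf setup used by the
 * kernel BPF selftests; the names bpf_ct_opts___local and xdp_ct_demo are
 * illustrative only, and the local ___ copy must match the kernel layout.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

#ifndef IPPROTO_TCP
#define IPPROTO_TCP 6
#endif

struct bpf_ct_opts___local {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u16 ct_zone_id;
	u8 ct_zone_dir;
	u8 reserved[3];
} __attribute__((preserve_access_index));

/* kfunc prototypes; resolved against kernel BTF at program load time */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 tuple__sz,
					 struct bpf_ct_opts___local *opts,
					 u32 opts__sz) __ksym;
extern void bpf_ct_release(struct nf_conn *ct) __ksym;

SEC("xdp")
int xdp_ct_demo(struct xdp_md *ctx)
{
	struct bpf_sock_tuple tup = {};
	struct bpf_ct_opts___local opts = {
		.netns_id = -1,			/* BPF_F_CURRENT_NETNS */
		.l4proto  = IPPROTO_TCP,
	};
	struct nf_conn *ct;

	/* A real program would first parse saddr/daddr/sport/dport from the
	 * packet into tup.ipv4 (or tup.ipv6, passing sizeof(tup.ipv6)).
	 */
	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (ct)
		bpf_ct_release(ct);	/* acquired reference must be released */
	else
		bpf_printk("ct lookup failed: %d", opts.error);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";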
#include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> @@ -66,22 +65,39 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; - u32 last_bucket; + u32 next_bucket; + u32 avg_timeout; + u32 count; + u32 start_time; bool exiting; bool early_drop; - long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; -/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 128u -/* upper bound of full table scan */ -#define GC_MAX_SCAN_JIFFIES (16u * HZ) -/* desired ratio of entries found to be expired */ -#define GC_EVICT_RATIO 50u +/* serialize hash resizes and nf_ct_iterate_cleanup */ +static DEFINE_MUTEX(nf_conntrack_mutex); + +#define GC_SCAN_INTERVAL_MAX (60ul * HZ) +#define GC_SCAN_INTERVAL_MIN (1ul * HZ) + +/* clamp timeouts to this value (TCP unacked) */ +#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) + +/* Initial bias pretending we have 100 entries at the upper bound so we don't + * wakeup often just because we have three entries with a 1s timeout while still + * allowing non-idle machines to wakeup more often when needed. + */ +#define GC_SCAN_INITIAL_COUNT 100 +#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX + +#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) +#define GC_SCAN_EXPIRED_MAX (64000u / HZ) + +#define MIN_CHAINLEN 50u +#define MAX_CHAINLEN (80u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; @@ -120,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) } /* return true if we need to recompute hashes (in case hash table was resized) */ -static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, - unsigned int h2, unsigned int sequence) +static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2, + unsigned int sequence) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; @@ -149,7 +165,15 @@ static void nf_conntrack_all_lock(void) spin_lock(&nf_conntrack_locks_all_lock); - nf_conntrack_locks_all = true; + /* For nf_contrack_locks_all, only the latest time when another + * CPU will see an update is controlled, by the "release" of the + * spin_lock below. + * The earliest time is not controlled, an thus KCSAN could detect + * a race when nf_conntract_lock() reads the variable. + * WRITE_ONCE() is used to ensure the compiler will not + * optimize the write. + */ + WRITE_ONCE(nf_conntrack_locks_all, true); for (i = 0; i < CONNTRACK_LOCKS; i++) { spin_lock(&nf_conntrack_locks[i]); @@ -180,26 +204,25 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -seqcount_t nf_conntrack_generation __read_mostly; -static unsigned int nf_conntrack_hash_rnd __read_mostly; +seqcount_spinlock_t nf_conntrack_generation __read_mostly; +static siphash_aligned_key_t nf_conntrack_hash_rnd; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, const struct net *net) { - unsigned int n; - u32 seed; + siphash_key_t key; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, so we hash everything up to the - * destination ports (which is a multiple of 4) and treat the last - * three bytes manually. 
- */ - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, seed ^ - (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + key = nf_conntrack_hash_rnd; + + key.key[0] ^= zoneid; + key.key[1] ^= net_hash_mix(net); + + return siphash((void *)tuple, + offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend), + &key); } static u32 scale_hash(u32 hash) @@ -209,15 +232,17 @@ static u32 scale_hash(u32 hash) static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, unsigned int size) { - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + unsigned int zoneid) { - return scale_hash(hash_conntrack_raw(tuple, net)); + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, @@ -297,20 +322,15 @@ nf_ct_get_tuple(const struct sk_buff *skb, return gre_pkt_to_tuple(skb, dataoff, net, tuple); #endif case IPPROTO_TCP: - case IPPROTO_UDP: /* fallthrough */ - return nf_ct_get_tuple_ports(skb, dataoff, tuple); + case IPPROTO_UDP: #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: - return nf_ct_get_tuple_ports(skb, dataoff, tuple); #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: - return nf_ct_get_tuple_ports(skb, dataoff, tuple); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: + /* fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); -#endif default: break; } @@ -463,7 +483,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); */ u32 nf_ct_get_id(const struct nf_conn *ct) { - static __read_mostly siphash_key_t ct_id_seed; + static siphash_aligned_key_t ct_id_seed; unsigned long a, b, c, d; net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); @@ -482,10 +502,14 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { - pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); @@ -493,53 +517,9 @@ clean_from_lists(struct nf_conn *ct) nf_ct_remove_expectations(ct); } -/* must be called with local_bh_disable */ -static void nf_ct_add_to_dying_list(struct nf_conn *ct) -{ - struct ct_pcpu *pcpu; - - /* add this conntrack to the (per cpu) dying list */ - ct->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->dying); - spin_unlock(&pcpu->lock); -} - -/* must be called with local_bh_disable */ -static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) -{ - struct ct_pcpu *pcpu; - - /* add this conntrack to the (per cpu) unconfirmed list */ - ct->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->unconfirmed); - spin_unlock(&pcpu->lock); -} - -/* must be called with local_bh_disable */ -static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) -{ 
- struct ct_pcpu *pcpu; - - /* We overload first tuple to link into unconfirmed or dying list.*/ - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - spin_unlock(&pcpu->lock); -} - #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) -/* Released via destroy_conntrack() */ +/* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) @@ -553,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) @@ -566,7 +544,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); - atomic_set(&tmpl->ct_general.use, 0); + refcount_set(&tmpl->ct_general.use, 1); return tmpl; } @@ -574,7 +552,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { - nf_ct_ext_destroy(tmpl); + kfree(tmpl->ext); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -593,13 +571,11 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } -static void -destroy_conntrack(struct nf_conntrack *nfct) +void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("destroy_conntrack(%p)\n", ct); - WARN_ON(atomic_read(&nfct->use) != 0); + WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -609,7 +585,6 @@ destroy_conntrack(struct nf_conntrack *nfct) if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) destroy_gre_conntrack(ct); - local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, @@ -617,64 +592,90 @@ destroy_conntrack(struct nf_conntrack *nfct) */ nf_ct_remove_expectations(ct); - nf_ct_del_from_dying_or_unconfirmed_list(ct); - - local_bh_enable(); - if (ct->master) nf_ct_put(ct->master); - pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); nf_conntrack_free(ct); } +EXPORT_SYMBOL(nf_ct_destroy); -static void nf_ct_delete_from_lists(struct nf_conn *ct) +static void __nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; - nf_ct_helper_destroy(ct); - - local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); +} - nf_ct_add_to_dying_list(ct); +static void nf_ct_delete_from_lists(struct nf_conn *ct) +{ + nf_ct_helper_destroy(ct); + local_bh_disable(); + + 
__nf_ct_delete_from_lists(ct); local_bh_enable(); } +static void nf_ct_add_to_ecache_list(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); + + spin_lock(&cnet->ecache.dying_lock); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &cnet->ecache.dying_list); + spin_unlock(&cnet->ecache.dying_lock); +#endif +} + bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + struct net *net; if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) return false; tstamp = nf_conn_tstamp_find(ct); - if (tstamp && tstamp->stop == 0) + if (tstamp) { + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; + tstamp->stop = ktime_get_real_ns(); + if (timeout < 0) + tstamp->stop -= jiffies_to_nsecs(-timeout); + } if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { /* destroy event was not delivered. nf_ct_put will * be done by event cache worker on redelivery. */ - nf_ct_delete_from_lists(ct); - nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); + nf_ct_helper_destroy(ct); + local_bh_disable(); + __nf_ct_delete_from_lists(ct); + nf_ct_add_to_ecache_list(ct); + local_bh_enable(); + + nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } - nf_conntrack_ecache_work(nf_ct_net(ct)); + net = nf_ct_net(ct); + if (nf_conntrack_ecache_dwork_pending(net)) + nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; @@ -713,9 +714,12 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) return; + /* load ->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + if (nf_ct_should_gc(ct)) nf_ct_kill(ct); @@ -773,17 +777,18 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - rcu_read_lock(); - h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); - if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { + /* re-check key after refcount */ + smp_acquire__after_ctrl_dep(); + if (likely(nf_ct_key_equal(h, tuple, zone, net))) - goto found; + return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); @@ -791,8 +796,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, h = NULL; } -found: - rcu_read_unlock(); return h; } @@ -801,8 +804,25 @@ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, net)); + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + struct nf_conntrack_tuple_hash *thash; + + rcu_read_lock(); + + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone_id, net)); + + if (thash) + goto out_unlock; + + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (rid != zone_id) + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + +out_unlock: + 
rcu_read_unlock(); + return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -816,6 +836,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &nf_conntrack_hash[reply_hash]); } +static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) +{ + /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions + * may contain stale pointers to e.g. helper that has been removed. + * + * The helper can't clear this because the nf_conn object isn't in + * any hash and synchronize_rcu() isn't enough because associated skb + * might sit in a queue. + */ + return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); +} + +static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) +{ + if (!ext) + return true; + + if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) + return false; + + /* inserted into conntrack table, nf_ct_iterate_cleanup() + * will find it. Disable nf_ct_ext_find() id check. + */ + WRITE_ONCE(ext->gen_id, 0); + return true; +} + int nf_conntrack_hash_check_insert(struct nf_conn *ct) { @@ -824,44 +871,77 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int max_chainlen; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); + if (!nf_ct_ext_valid_pre(ct->ext)) + return -EAGAIN; + local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); + + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > max_chainlen) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) + goto chaintoolong; + } + + /* If genid has changed, we can't insert anymore because ct + * extensions could have stale pointers and nf_ct_iterate_destroy + * might have completed its table scan already. + * + * Increment of the ext genid right after this check is fine: + * nf_ct_iterate_destroy blocks until locks are released. 
+ */ + if (!nf_ct_ext_valid_post(ct->ext)) { + err = -EAGAIN; + goto out; + } smp_wmb(); /* The caller holds a reference to this object */ - atomic_set(&ct->ct_general.use, 2); + refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); local_bh_enable(); - return 0; + return 0; +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -900,8 +980,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; - atomic_inc(&ct->ct_general.use); - ct->status |= IPS_CONFIRMED; + refcount_inc(&ct->ct_general.use); /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); @@ -909,6 +988,57 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +/** + * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow + * @ct1: conntrack in hash table to check against + * @ct2: merge candidate + * + * returns true if ct1 and ct2 happen to refer to the same flow, but + * in opposing directions, i.e. + * ct1: a:b -> c:d + * ct2: c:d -> a:b + * for both directions. If so, @ct2 should not have been created + * as the skb should have been picked up as ESTABLISHED flow. + * But ct1 was not yet committed to hash table before skb that created + * ct2 had arrived. + * + * Note we don't compare netns because ct entries in different net + * namespace cannot clash to begin with. + * + * @return: true if ct1 and ct2 are identical when swapping origin/reply. 
+ */ +static bool +nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + u16 id1, id2; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) + return false; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); + if (id1 != id2) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); + + return id1 == id2; +} + +static int nf_ct_can_merge(const struct nf_conn *ct, + const struct nf_conn *loser_ct) +{ + return nf_ct_match(ct, loser_ct) || + nf_ct_match_reverse(ct, loser_ct); +} + +/* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { @@ -919,26 +1049,19 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, loser_ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_dying(ct)) - return NF_DROP; - - if (!atomic_inc_not_zero(&ct->ct_general.use)) - return NF_DROP; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { + if (nf_ct_can_merge(ct, loser_ct)) { struct net *net = nf_ct_net(ct); + nf_conntrack_get(&ct->ct_general); + nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_ct_add_to_dying_list(loser_ct); - nf_conntrack_put(&loser_ct->ct_general); + nf_ct_put(loser_ct); nf_ct_set(skb, ct, ctinfo); - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } - nf_ct_put(ct); return NF_DROP; } @@ -979,7 +1102,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) } /* We want the clashing entry to go away real soon: 1 second timeout. */ - loser_ct->timeout = nfct_time_stamp + HZ; + WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); /* IPS_NAT_CLASH removes the entry automatically on the first * reply. Also prevents UDP tracker from moving the entry to @@ -998,6 +1121,14 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + /* confirmed bit must be set after hlist add, not before: + * loser_ct can still be visible to other cpu due to + * SLAB_TYPESAFE_BY_RCU. + */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &loser_ct->status); + + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -1006,12 +1137,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table - * @hash_reply: hash slot for reply direction + * @reply_hash: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. * - * If there is one, @skb (and the assocated, unconfirmed conntrack) has + * If there is one, @skb (and the associated, unconfirmed conntrack) has * to be dropped. In case @skb is retransmitted, next conntrack lookup * will find the already-existing entry. * @@ -1027,10 +1158,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. 
- * The new entry will be added after the existing one in the hash list, + * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- - * due to the collision, it will not see bidirectional traffic. + * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. */ @@ -1062,7 +1193,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, return ret; drop: - nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); NF_CT_STAT_INC(net, insert_failed); return NF_DROP; @@ -1072,6 +1202,7 @@ drop: int __nf_conntrack_confirm(struct sk_buff *skb) { + unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; @@ -1080,7 +1211,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1102,9 +1232,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related * connections for unconfirmed conns. But packet copies and @@ -1123,34 +1253,48 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_DROP; } - pr_debug("Confirming conntrack %p\n", ct); + if (!nf_ct_ext_valid_pre(ct->ext)) { + NF_CT_STAT_INC(net, insert_failed); + goto dying; + } + /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ - nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) { - nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, insert_failed); goto dying; } + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) { +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } - /* Timer relative to confirmation time, not original + /* Timeout is relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
*/ ct->timeout += nfct_time_stamp; @@ -1158,14 +1302,34 @@ __nf_conntrack_confirm(struct sk_buff *skb) __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after - * starting the timer and setting the CONFIRMED bit. The RCU barriers - * guarantee that no other CPU can find the conntrack before the above - * stores are visible. + * setting ct->timeout. The RCU barriers guarantee that no other CPU + * can find the conntrack before the above stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); + + /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups + * skip entries that lack this bit. This happens when a CPU is looking + * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU + * or when another CPU encounters this entry right after the insertion + * but before the set-confirm-bit below. This bit must not be set until + * after __nf_conntrack_hash_insert(). + */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); + /* ext area is still valid (rcu read lock is held, + * but will go out of scope soon, we need to remove + * this conntrack again. + */ + if (!nf_ct_ext_valid_post(ct->ext)) { + nf_ct_kill(ct); + NF_CT_STAT_INC_ATOMIC(net, drop); + return NF_DROP; + } + help = nfct_help(ct); if (help && help->helper) nf_conntrack_event_cache(IPCT_HELPER, ct); @@ -1183,7 +1347,7 @@ dying: } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); -/* Returns true if a connection correspondings to the tuple (required +/* Returns true if a connection corresponds to the tuple (required for NAT). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, @@ -1202,7 +1366,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); - hash = __hash_conntrack(net, tuple, hsize); + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -1229,7 +1393,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * Let nf_ct_resolve_clash() deal with this later. */ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); @@ -1264,9 +1429,6 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) - continue; - if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -1277,9 +1439,12 @@ static unsigned int early_drop_list(struct net *net, nf_ct_is_dying(tmp)) continue; - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; + /* load ->ct_net and ->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + /* kill only if still in same netns -- might have moved due to * SLAB_TYPESAFE_BY_RCU rules. 
* @@ -1333,68 +1498,78 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct) static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + u8 protonum = nf_ct_protonum(ct); if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(protonum); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; return false; } -#define DAY (86400 * HZ) - -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < DAY / 2) - ct->timeout = nfct_time_stamp + DAY; -} - static void gc_worker(struct work_struct *work) { - unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); - unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned int nf_conntrack_max95 = 0; + unsigned int i, hashsz, nf_conntrack_max95 = 0; + u32 end_time, start_time = nfct_time_stamp; struct conntrack_gc_work *gc_work; - unsigned int ratio, scanned = 0; + unsigned int expired_count = 0; unsigned long next_run; + s32 delta_time; + long count; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; - i = gc_work->last_bucket; + i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; + if (i == 0) { + gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; + gc_work->count = GC_SCAN_INITIAL_COUNT; + gc_work->start_time = start_time; + } + + next_run = gc_work->avg_timeout; + count = gc_work->count; + + end_time = start_time + GC_SCAN_MAX_DURATION; + do { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int hashsz; struct nf_conn *tmp; - i++; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); - if (i >= hashsz) - i = 0; + if (i >= hashsz) { + rcu_read_unlock(); + break; + } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + struct nf_conntrack_net *cnet; struct net *net; + long expires; tmp = nf_ct_tuplehash_to_ctrack(h); - scanned++; - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { - nf_ct_offload_timeout(tmp); - continue; + if (expired_count > GC_SCAN_EXPIRED_MAX) { + rcu_read_unlock(); + + gc_work->next_bucket = i; + gc_work->avg_timeout = next_run; + gc_work->count = count; + + delta_time = nfct_time_stamp - gc_work->start_time; + + /* re-sched immediately if total cycle time is exceeded */ + next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; + goto early_exit; } if (nf_ct_is_expired(tmp)) { @@ -1403,24 +1578,34 @@ static void gc_worker(struct work_struct *work) continue; } + expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); + expires = (expires - (long)next_run) / ++count; + next_run += expires; + if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) continue; net = nf_ct_net(tmp); - if (atomic_read(&net->ct.count) < nf_conntrack_max95) + cnet = nf_ct_pernet(net); + if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; /* need to take reference to avoid possible races */ - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; + /* load ->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + if (gc_worker_skip_ct(tmp)) { nf_ct_put(tmp); continue; } - if (gc_worker_can_early_drop(tmp)) + if 
(gc_worker_can_early_drop(tmp)) { nf_ct_kill(tmp); + expired_count++; + } nf_ct_put(tmp); } @@ -1431,51 +1616,41 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched(); - } while (++buckets < goal); + i++; - if (gc_work->exiting) - return; + delta_time = nfct_time_stamp - end_time; + if (delta_time > 0 && i < hashsz) { + gc_work->avg_timeout = next_run; + gc_work->count = count; + gc_work->next_bucket = i; + next_run = 0; + goto early_exit; + } + } while (i < hashsz); - /* - * Eviction will normally happen from the packet path, and not - * from this gc worker. - * - * This worker is only here to reap expired entries when system went - * idle after a busy period. - * - * The heuristics below are supposed to balance conflicting goals: - * - * 1. Minimize time until we notice a stale entry - * 2. Maximize scan intervals to not waste cycles - * - * Normally, expire ratio will be close to 0. - * - * As soon as a sizeable fraction of the entries have expired - * increase scan frequency. - */ - ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio > GC_EVICT_RATIO) { - gc_work->next_gc_run = min_interval; - } else { - unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; + gc_work->next_bucket = 0; - BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); + next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); - gc_work->next_gc_run += min_interval; - if (gc_work->next_gc_run > max) - gc_work->next_gc_run = max; - } + delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); + if (next_run > (unsigned long)delta_time) + next_run -= delta_time; + else + next_run = 1; + +early_exit: + if (gc_work->exiting) + return; + + if (next_run) + gc_work->early_drop = false; - next_run = gc_work->next_gc_run; - gc_work->last_bucket = i; - gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { - INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = HZ; + INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); gc_work->exiting = false; } @@ -1486,18 +1661,23 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + unsigned int ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ - atomic_inc(&net->ct.count); + ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && - unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; - atomic_dec(&net->ct.count); - net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + atomic_dec(&cnet->count); + if (net == &init_net) + net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + else + net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n", + net->ns.inum); return ERR_PTR(-ENOMEM); } } @@ -1517,21 +1697,19 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; - ct->timeout = 0; + WRITE_ONCE(ct->timeout, 0); write_pnet(&ct->ct_net, net); - memset(&ct->__nfct_init_offset, 0, - offsetof(struct nf_conn, proto) - - offsetof(struct nf_conn, __nfct_init_offset)); + memset_after(ct, 0, 
__nfct_init_offset); nf_ct_zone_add(ct, zone); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ - atomic_set(&ct->ct_general.use, 0); + refcount_set(&ct->ct_general.use, 0); return ct; out: - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); return ERR_PTR(-ENOMEM); } @@ -1548,16 +1726,29 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_net *cnet; /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ - WARN_ON(atomic_read(&ct->ct_general.use) != 0); + WARN_ON(refcount_read(&ct->ct_general.use) != 0); + + if (ct->status & IPS_SRC_NAT_DONE) { + const struct nf_nat_hook *nat_hook; + + rcu_read_lock(); + nat_hook = rcu_dereference(nf_nat_hook); + if (nat_hook) + nat_hook->remove_nat_bysrc(ct); + rcu_read_unlock(); + } - nf_ct_ext_destroy(ct); + kfree(ct->ext); kmem_cache_free(nf_conntrack_cachep, ct); + cnet = nf_ct_pernet(net); + smp_mb__before_atomic(); - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -1573,22 +1764,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; +#ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; +#endif struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; + struct nf_conntrack_net *cnet; - if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { - pr_debug("Can't invert tuple.\n"); + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; - } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) - return (struct nf_conntrack_tuple_hash *)ct; + return ERR_CAST(ct); if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); @@ -1605,18 +1797,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_labels_ext_add(ct); +#ifdef CONFIG_NF_CONNTRACK_EVENTS ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; - nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, - ecache ? ecache->expmask : 0, - GFP_ATOMIC); - local_bh_disable(); - if (net->ct.expect_count) { - spin_lock(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + if ((ecache || net->ct.sysctl_events) && + !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); + } +#endif + + cnet = nf_ct_pernet(net); + if (cnet->expect_count) { + spin_lock_bh(&nf_conntrack_expect_lock); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { - pr_debug("expectation arrives ct=%p exp=%p\n", - ct, exp); /* Welcome, Mr. Bond. We've been expecting you... 
*/ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ @@ -1628,23 +1825,30 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; #endif NF_CT_STAT_INC(net, expect_new); } - spin_unlock(&nf_conntrack_expect_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } - if (!exp) + if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - /* Now it is inserted into the unconfirmed list, bump refcount */ - nf_conntrack_get(&ct->ct_general); - nf_ct_add_to_unconfirmed_list(ct); + /* Other CPU might have obtained a pointer to this object before it was + * released. Because refcount is 0, refcount_inc_not_zero() will fail. + * + * After refcount_set(1) it will succeed; ensure that zeroing of + * ct->status and the correct ct->net pointer are visible; else other + * core might observe CONFIRMED bit which means the entry is valid and + * in the hash table, but its not (anymore). + */ + smp_wmb(); - local_bh_enable(); + /* Now it is going to be associated with an sk_buff, set refcount to 1. */ + refcount_set(&ct->ct_general.use, 1); if (exp) { if (exp->expectfn) @@ -1668,20 +1872,30 @@ resolve_normal_ct(struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + u32 hash, zone_id, rid; struct nf_conn *ct; - u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple)) { - pr_debug("Can't get tuple\n"); + &tuple)) return 0; - } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, state->net); + + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); + + if (!h) { + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (zone_id != rid) { + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); + + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); + } + } + if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); @@ -1696,17 +1910,15 @@ resolve_normal_ct(struct nf_conn *tmpl, if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { + unsigned long status = READ_ONCE(ct->status); + /* Once we've had two way comms, always ESTABLISHED. 
*/ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("normal packet for %p\n", ct); + if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("related packet for %p\n", ct); + else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; - } else { - pr_debug("new packet for %p\n", ct); + else ctinfo = IP_CT_NEW; - } } nf_ct_set(skb, ct, ctinfo); return 0; @@ -1737,10 +1949,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl, else return NF_ACCEPT; - if (ret <= 0) { + if (ret <= 0) NF_CT_STAT_INC_ATOMIC(state->net, error); - NF_CT_STAT_INC_ATOMIC(state->net, invalid); - } return ret; } @@ -1787,11 +1997,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct, return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: - return nf_conntrack_dccp_packet(ct, skb, dataoff, - ctinfo, state); -#endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, @@ -1814,18 +2019,14 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || - ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(state->net, ignore); + ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; - } skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { - pr_debug("not prepared to track yet or error occurred\n"); - NF_CT_STAT_INC_ATOMIC(state->net, error); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; @@ -1864,18 +2065,19 @@ repeat: if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - pr_debug("nf_conntrack_in: Can't track with proto module\n"); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); skb->_nfct = 0; - NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) - NF_CT_STAT_INC_ATOMIC(state->net, drop); /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. */ if (ret == -NF_REPEAT) goto repeat; + + NF_CT_STAT_INC_ATOMIC(state->net, invalid); + if (ret == NF_DROP) + NF_CT_STAT_INC_ATOMIC(state->net, drop); + ret = -ret; goto out; } @@ -1891,35 +2093,11 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_in); -/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is - implicitly racy: see __nf_conntrack_confirm */ -void nf_conntrack_alter_reply(struct nf_conn *ct, - const struct nf_conntrack_tuple *newreply) -{ - struct nf_conn_help *help = nfct_help(ct); - - /* Should be unconfirmed, so not in hash table yet */ - WARN_ON(nf_ct_is_confirmed(ct)); - - pr_debug("Altering reply tuple of %p to ", ct); - nf_ct_dump_tuple(newreply); - - ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (ct->master || (help && !hlist_empty(&help->expectations))) - return; - - rcu_read_lock(); - __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); - /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, u32 extra_jiffies, - bool do_acct) + unsigned int bytes) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) @@ -1932,8 +2110,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: - if (do_acct) - nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); + if (bytes) + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -2025,74 +2203,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - struct nf_nat_hook *nat_hook; - unsigned int status; - int dataoff; - u16 l3num; - u8 l4num; - - l3num = nf_ct_l3num(ct); - - dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); - if (dataoff <= 0) - return -1; - - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple)) - return -1; - - if (ct->status & IPS_SRC_NAT) { - memcpy(tuple.src.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, - sizeof(tuple.src.u3.all)); - tuple.src.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; - } - - if (ct->status & IPS_DST_NAT) { - memcpy(tuple.dst.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, - sizeof(tuple.dst.u3.all)); - tuple.dst.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; - } - - h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); - if (!h) - return 0; - - /* Store status bits of the conntrack that is clashing to re-do NAT - * mangling according to what it has been done already to this packet. - */ - status = ct->status; - - nf_ct_put(ct); - ct = nf_ct_tuplehash_to_ctrack(h); - nf_ct_set(skb, ct, ctinfo); - - nat_hook = rcu_dereference(nf_nat_hook); - if (!nat_hook) - return 0; - - if (status & IPS_SRC_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; - - if (status & IPS_DST_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; - - return 0; -} - /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). 
*/ @@ -2105,11 +2215,14 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, help = nfct_help(ct); if (!help) - return 0; + return NF_ACCEPT; helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) - return 0; + return NF_ACCEPT; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: @@ -2124,41 +2237,34 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return 0; + return NF_ACCEPT; break; } #endif default: - return 0; + return NF_ACCEPT; } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return -1; + return NF_DROP; } } /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; + return nf_conntrack_confirm(skb); } static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; - int err; ct = nf_ct_get(skb, &ctinfo); if (!ct) - return 0; - - if (!nf_ct_is_confirmed(ct)) { - err = __nf_conntrack_update(net, skb, ct, ctinfo); - if (err < 0) - return err; - } + return NF_ACCEPT; return nf_confirm_cthelper(skb, ct, ctinfo); } @@ -2201,7 +2307,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), - void *data, unsigned int *bucket) + const struct nf_ct_iter_data *iter_data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -2209,28 +2315,36 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) - continue; - /* All nf_conn objects are added to hash table twice, one - * for original direction tuple, once for the reply tuple. - * - * Exception: In the IPS_NAT_CLASH case, only the reply - * tuple is added (the original tuple already existed for - * a different object). - * - * We only need to call the iterator once for each - * conntrack, so we just use the 'reply' direction - * tuple while iterating. - */ - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) + continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. 
+ */ + ct = nf_ct_tuplehash_to_ctrack(h); + + if (iter_data->net && + !net_eq(iter_data->net, nf_ct_net(ct))) + continue; + + if (iter(ct, iter_data->data)) + goto found; } spin_unlock(lockp); local_bh_enable(); @@ -2239,109 +2353,43 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), return NULL; found: - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; } static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report) + const struct nf_ct_iter_data *iter_data) { - unsigned int bucket = 0, sequence; + unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); - for (;;) { - sequence = read_seqcount_begin(&nf_conntrack_generation); - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } - - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) - break; - bucket = 0; - } -} - -struct iter_data { - int (*iter)(struct nf_conn *i, void *data); - void *data; - struct net *net; -}; - -static int iter_net_only(struct nf_conn *i, void *data) -{ - struct iter_data *d = data; - - if (!net_eq(d->net, nf_ct_net(i))) - return 0; - - return d->iter(i, d->data); -} - -static void -__nf_ct_unconfirmed_destroy(struct net *net) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct nf_conntrack_tuple_hash *h; - struct hlist_nulls_node *n; - struct ct_pcpu *pcpu; - - pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { - struct nf_conn *ct; - - ct = nf_ct_tuplehash_to_ctrack(h); + mutex_lock(&nf_conntrack_mutex); + while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { + /* Time to push up daises... */ - /* we cannot call iter() on unconfirmed list, the - * owning cpu can reallocate ct->ext at any time. 
- */ - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&pcpu->lock); + nf_ct_delete(ct, iter_data->portid, iter_data->report); + nf_ct_put(ct); cond_resched(); } + mutex_unlock(&nf_conntrack_mutex); } -void nf_ct_unconfirmed_destroy(struct net *net) +void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), + const struct nf_ct_iter_data *iter_data) { - might_sleep(); - - if (atomic_read(&net->ct.count) > 0) { - __nf_ct_unconfirmed_destroy(net); - nf_queue_nf_hook_drop(net); - synchronize_net(); - } -} -EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); - -void nf_ct_iterate_cleanup_net(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report) -{ - struct iter_data d; + struct net *net = iter_data->net; + struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); - if (atomic_read(&net->ct.count) == 0) + if (atomic_read(&cnet->count) == 0) return; - d.iter = iter; - d.data = data; - d.net = net; - - nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); + nf_ct_iterate_cleanup(iter, iter_data); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); @@ -2359,43 +2407,56 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { + struct nf_ct_iter_data iter_data = {}; struct net *net; down_read(&net_rwsem); for_each_net(net) { - if (atomic_read(&net->ct.count) == 0) + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + if (atomic_read(&cnet->count) == 0) continue; - __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from - * the global list, so our __nf_ct_unconfirmed_destroy() might - * not have affected all namespaces. + * the global list, so hook drop above might not have + * affected all namespaces. */ net_ns_barrier(); - /* a conntrack could have been unlinked from unconfirmed list - * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). + /* a skb w. unconfirmed conntrack could have been reinjected just + * before we called nf_queue_nf_hook_drop(). + * * This makes sure its inserted into conntrack table. */ synchronize_net(); - nf_ct_iterate_cleanup(iter, data, 0, 0); + nf_ct_ext_bump_genid(); + iter_data.data = data; + nf_ct_iterate_cleanup(iter, &iter_data); + + /* Another cpu might be in a rcu read section with + * rcu protected pointer cleared in iter callback + * or hidden via nf_ct_ext_bump_genid() above. + * + * Wait until those are done. 
+ */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); static int kill_all(struct nf_conn *i, void *data) { - return net_eq(nf_ct_net(i), data); + return 1; } void nf_conntrack_cleanup_start(void) { + cleanup_nf_conntrack_bpf(); conntrack_gc_work.exiting = true; - RCU_INIT_POINTER(ip_ct_attach, NULL); } void nf_conntrack_cleanup_end(void) @@ -2405,13 +2466,7 @@ void nf_conntrack_cleanup_end(void) kvfree(nf_conntrack_hash); nf_conntrack_proto_fini(); - nf_conntrack_seqadj_fini(); - nf_conntrack_labels_fini(); nf_conntrack_helper_fini(); - nf_conntrack_timeout_fini(); - nf_conntrack_ecache_fini(); - nf_conntrack_tstamp_fini(); - nf_conntrack_acct_fini(); nf_conntrack_expect_fini(); kmem_cache_destroy(nf_conntrack_cachep); @@ -2431,20 +2486,24 @@ void nf_conntrack_cleanup_net(struct net *net) void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { - int busy; + struct nf_ct_iter_data iter_data = {}; struct net *net; + int busy; /* * This makes sure all current packets have passed through * netfilter framework. Roll on, two-stage module * delete... */ - synchronize_net(); + synchronize_rcu_expedited(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_iterate_cleanup(kill_all, net, 0, 0); - if (atomic_read(&net->ct.count) != 0) + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + iter_data.net = net; + nf_ct_iterate_cleanup_net(kill_all, &iter_data); + if (atomic_read(&cnet->count) != 0) busy = 1; } if (busy) { @@ -2453,11 +2512,9 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_conntrack_proto_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); - free_percpu(net->ct.pcpu_lists); } } @@ -2466,12 +2523,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) @@ -2497,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize) if (!hash) return -ENOMEM; + mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { + mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } @@ -2515,16 +2577,19 @@ int nf_conntrack_hash_resize(unsigned int hashsize) for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + unsigned int zone_id; + h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), - &h->tuple, hashsize); + &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = nf_conntrack_htable_size; old_hash = nf_conntrack_hash; nf_conntrack_hash = hash; @@ -2534,6 +2599,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize) nf_conntrack_all_unlock(); local_bh_enable(); + mutex_unlock(&nf_conntrack_mutex); + synchronize_net(); kvfree(old_hash); return 0; @@ -2558,36 +2625,6 
@@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return nf_conntrack_hash_resize(hashsize); } -static __always_inline unsigned int total_extension_size(void) -{ - /* remember to add new extensions below */ - BUILD_BUG_ON(NF_CT_EXT_NUM > 9); - - return sizeof(struct nf_ct_ext) + - sizeof(struct nf_conn_help) -#if IS_ENABLED(CONFIG_NF_NAT) - + sizeof(struct nf_conn_nat) -#endif - + sizeof(struct nf_conn_seqadj) - + sizeof(struct nf_conn_acct) -#ifdef CONFIG_NF_CONNTRACK_EVENTS - + sizeof(struct nf_conntrack_ecache) -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP - + sizeof(struct nf_conn_tstamp) -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - + sizeof(struct nf_conn_timeout) -#endif -#ifdef CONFIG_NF_CONNTRACK_LABELS - + sizeof(struct nf_conn_labels) -#endif -#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) - + sizeof(struct nf_conn_synproxy) -#endif - ; -}; - int nf_conntrack_init_start(void) { unsigned long nr_pages = totalram_pages(); @@ -2595,35 +2632,31 @@ int nf_conntrack_init_start(void) int ret = -ENOMEM; int i; - /* struct nf_ct_ext uses u8 to store offsets/size */ - BUILD_BUG_ON(total_extension_size() > 255u); - - seqcount_init(&nf_conntrack_generation); + seqcount_spinlock_init(&nf_conntrack_generation, + &nf_conntrack_locks_all_lock); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { - /* Idea from tcp.c: use 1/16384 of memory. - * On i386: 32MB machine has 512 buckets. - * >= 1GB machines have 16384 buckets. - * >= 4GB machines have 65536 buckets. - */ nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) - nf_conntrack_htable_size = 65536; + if (BITS_PER_LONG >= 64 && + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; + nf_conntrack_htable_size = 65536; - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + if (nf_conntrack_htable_size < 1024) + nf_conntrack_htable_size = 1024; + /* Use a max. factor of one by default to keep the average + * hash chain length at 2 entries. Each entry has to be added + * twice (once for original direction, once for reply). + * When a table size is given we use the old value of 8 to + * avoid implicit reduction of the max entries setting. 
+ */ + max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); @@ -2643,34 +2676,10 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_expect; - ret = nf_conntrack_acct_init(); - if (ret < 0) - goto err_acct; - - ret = nf_conntrack_tstamp_init(); - if (ret < 0) - goto err_tstamp; - - ret = nf_conntrack_ecache_init(); - if (ret < 0) - goto err_ecache; - - ret = nf_conntrack_timeout_init(); - if (ret < 0) - goto err_timeout; - ret = nf_conntrack_helper_init(); if (ret < 0) goto err_helper; - ret = nf_conntrack_labels_init(); - if (ret < 0) - goto err_labels; - - ret = nf_conntrack_seqadj_init(); - if (ret < 0) - goto err_seqadj; - ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; @@ -2678,23 +2687,18 @@ int nf_conntrack_init_start(void) conntrack_gc_work_init(&conntrack_gc_work); queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); + ret = register_nf_conntrack_bpf(); + if (ret < 0) + goto err_kfunc; + return 0; +err_kfunc: + cancel_delayed_work_sync(&conntrack_gc_work.dwork); + nf_conntrack_proto_fini(); err_proto: - nf_conntrack_seqadj_fini(); -err_seqadj: - nf_conntrack_labels_fini(); -err_labels: nf_conntrack_helper_fini(); err_helper: - nf_conntrack_timeout_fini(); -err_timeout: - nf_conntrack_ecache_fini(); -err_ecache: - nf_conntrack_tstamp_fini(); -err_tstamp: - nf_conntrack_acct_fini(); -err_acct: nf_conntrack_expect_fini(); err_expect: kmem_cache_destroy(nf_conntrack_cachep); @@ -2703,16 +2707,29 @@ err_cachep: return ret; } -static struct nf_ct_hook nf_conntrack_hook = { +static void nf_conntrack_set_closing(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = nf_ct_to_nf_conn(nfct); + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + nf_conntrack_tcp_set_closing(ct); + break; + } +} + +static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, - .destroy = destroy_conntrack, + .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, + .attach = nf_conntrack_attach, + .set_closing = nf_conntrack_set_closing, + .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) { - /* For use by REJECT target */ - RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); } @@ -2720,32 +2737,19 @@ void nf_conntrack_init_end(void) * We need to use special "null" values, not used in hash table */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) -#define DYING_NULLS_VAL ((1<<30)+1) int nf_conntrack_init_net(struct net *net) { + struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; - int cpu; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); - atomic_set(&net->ct.count, 0); - - net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); - if (!net->ct.pcpu_lists) - goto err_stat; - - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_init(&pcpu->lock); - INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - } + atomic_set(&cnet->count, 0); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) - goto err_pcpu_lists; + return ret; ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) @@ -2754,15 +2758,67 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); - nf_conntrack_helper_pernet_init(net); 
nf_conntrack_proto_pernet_init(net); return 0; err_expect: free_percpu(net->ct.stat); -err_pcpu_lists: - free_percpu(net->ct.pcpu_lists); -err_stat: return ret; } + +/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ + +int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) +{ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + return -EPERM; + + __nf_ct_set_timeout(ct, timeout); + + if (test_bit(IPS_DYING_BIT, &ct->status)) + return -ETIME; + + return 0; +} +EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); + +void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, &ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} +EXPORT_SYMBOL_GPL(__nf_ct_change_status); + +int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) +{ + unsigned long d; + + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EBUSY; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EBUSY; + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EBUSY; + + __nf_ct_change_status(ct, status, 0); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_change_status_common); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 7956c9f19899..81baf2082604 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -16,7 +16,6 @@ #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> -#include <linux/percpu.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> @@ -29,8 +28,9 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -#define ECACHE_RETRY_WAIT (HZ/10) -#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) +#define DYING_NULLS_VAL ((1 << 30) + 1) +#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) +#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, @@ -38,150 +38,173 @@ enum retry_state { STATE_DONE, }; -static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) +struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { - struct nf_conn *refs[ECACHE_STACK_ALLOC]; + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + return &cnet->ecache; +} +#if IS_MODULE(CONFIG_NF_CT_NETLINK) +EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); +#endif + +static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) +{ + unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; + struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int evicted = 0; + unsigned int sent; - spin_lock(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { +next: + sent = 0; + spin_lock_bh(&cnet->ecache.dying_lock); + + hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - struct nf_conntrack_ecache *e; - - if (!nf_ct_is_confirmed(ct)) - continue; - - /* This ecache access is safe because the ct is on the - * pcpu dying list and we hold the spinlock -- the entry - * cannot be free'd until after the lock is released. 
- * - * This is true even if ct has a refcount of 0: the - * cpu that is about to free the entry must remove it - * from the dying list and needs the lock to do so. - */ - e = nf_ct_ecache_find(ct); - if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) - continue; - /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means - * the worker owns this entry: the ct will remain valid - * until the worker puts its ct reference. + /* The worker owns all entries, ct remains valid until nf_ct_put + * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } - e->state = NFCT_ECACHE_DESTROY_SENT; - refs[evicted] = ct; + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); - if (++evicted >= ARRAY_SIZE(refs)) { + if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } + + if (sent++ > 16) { + spin_unlock_bh(&cnet->ecache.dying_lock); + cond_resched(); + goto next; + } } - spin_unlock(&pcpu->lock); + spin_unlock_bh(&cnet->ecache.dying_lock); - /* can't _put while holding lock */ - while (evicted) - nf_ct_put(refs[--evicted]); + hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); + nf_ct_put(ct); + + cond_resched(); + } return ret; } static void ecache_work(struct work_struct *work) { - struct netns_ct *ctnet = - container_of(work, struct netns_ct, ecache_dwork.work); - int cpu, delay = -1; - struct ct_pcpu *pcpu; + struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); + int ret, delay = -1; + + ret = ecache_work_evict_list(cnet); + switch (ret) { + case STATE_CONGESTED: + delay = ECACHE_RETRY_JIFFIES; + break; + case STATE_RESTART: + delay = 0; + break; + case STATE_DONE: + break; + } - local_bh_disable(); + if (delay >= 0) + schedule_delayed_work(&cnet->ecache.dwork, delay); +} - for_each_possible_cpu(cpu) { - enum retry_state ret; +static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, + const u32 events, + const u32 missed, + const struct nf_ct_event *item) +{ + struct net *net = nf_ct_net(item->ct); + struct nf_ct_event_notifier *notify; + u32 old, want; + int ret; - pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); + if (!((events | missed) & e->ctmask)) + return 0; - ret = ecache_work_evict_list(pcpu); + rcu_read_lock(); - switch (ret) { - case STATE_CONGESTED: - delay = ECACHE_RETRY_WAIT; - goto out; - case STATE_RESTART: - delay = 0; - break; - case STATE_DONE: - break; - } + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) { + rcu_read_unlock(); + return 0; } - out: - local_bh_enable(); + ret = notify->ct_event(events | missed, item); + rcu_read_unlock(); - ctnet->ecache_dwork_pending = delay > 0; - if (delay >= 0) - schedule_delayed_work(&ctnet->ecache_dwork, delay); + if (likely(ret >= 0 && missed == 0)) + return 0; + + do { + old = READ_ONCE(e->missed); + if (ret < 0) + want = old | events; + else + want = old & ~missed; + } while (cmpxchg(&e->missed, old, want) != old); + + return ret; +} + +static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (local64_read(&e->timestamp)) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif } -int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, +int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 
portid, int report) { - int ret = 0; - struct net *net = nf_ct_net(ct); - struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; + struct nf_ct_event item; + unsigned int missed; + int ret; - rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (!notify) - goto out_unlock; + if (!nf_ct_is_confirmed(ct)) + return 0; e = nf_ct_ecache_find(ct); if (!e) - goto out_unlock; + return 0; - if (nf_ct_is_confirmed(ct)) { - struct nf_ct_event item = { - .ct = ct, - .portid = e->portid ? e->portid : portid, - .report = report - }; - /* This is a resent of a destroy event? If so, skip missed */ - unsigned long missed = e->portid ? 0 : e->missed; - - if (!((eventmask | missed) & e->ctmask)) - goto out_unlock; - - ret = notify->fcn(eventmask | missed, &item); - if (unlikely(ret < 0 || missed)) { - spin_lock_bh(&ct->lock); - if (ret < 0) { - /* This is a destroy event that has been - * triggered by a process, we store the PORTID - * to include it in the retransmission. - */ - if (eventmask & (1 << IPCT_DESTROY)) { - if (e->portid == 0 && portid != 0) - e->portid = portid; - e->state = NFCT_ECACHE_DESTROY_FAIL; - } else { - e->missed |= eventmask; - } - } else { - e->missed &= ~missed; - } - spin_unlock_bh(&ct->lock); - } + memset(&item, 0, sizeof(item)); + + item.ct = ct; + item.portid = e->portid ? e->portid : portid; + item.report = report; + + /* This is a resent of a destroy event? If so, skip missed */ + missed = e->portid ? 0 : e->missed; + + nf_ct_ecache_tstamp_refresh(e); + + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); + if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { + /* This is a destroy event that has been triggered by a process, + * we store the PORTID to include it in the retransmission. + */ + if (e->portid == 0 && portid != 0) + e->portid = portid; } -out_unlock: - rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); @@ -190,53 +213,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - unsigned long events, missed; - struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; struct nf_ct_event item; - int ret; - - rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (notify == NULL) - goto out_unlock; + unsigned int events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; + return; e = nf_ct_ecache_find(ct); if (e == NULL) - goto out_unlock; + return; events = xchg(&e->cache, 0); - /* We make a copy of the missed event cache without taking - * the lock, thus we may send missed events twice. However, - * this does not harm and it happens very rarely. */ - missed = e->missed; - - if (!((events | missed) & e->ctmask)) - goto out_unlock; - item.ct = ct; item.portid = 0; item.report = 0; - ret = notify->fcn(events | missed, &item); - - if (likely(ret == 0 && !missed)) - goto out_unlock; - - spin_lock_bh(&ct->lock); - if (ret < 0) - e->missed |= events; - else - e->missed &= ~missed; - spin_unlock_bh(&ct->lock); - -out_unlock: - rcu_read_unlock(); + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. 
+ */ + __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); @@ -246,11 +244,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, { struct net *net = nf_ct_exp_net(exp); - struct nf_exp_event_notifier *notify; + struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_expect_event_cb); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; @@ -264,118 +262,120 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, .portid = portid, .report = report }; - notify->fcn(1 << event, &item); + notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } -int nf_conntrack_register_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_register_notifier(struct net *net, + const struct nf_ct_event_notifier *new) { - int ret; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; - } + WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); - ret = 0; - -out_unlock: mutex_unlock(&nf_ct_ecache_mutex); - return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -void nf_conntrack_unregister_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_unregister_notifier(struct net *net) { - struct nf_ct_event_notifier *notify; - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. 
*/ + /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct net *net, - struct nf_exp_event_notifier *new) +void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { - int ret; - struct nf_exp_event_notifier *notify; - - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + if (state == NFCT_ECACHE_DESTROY_FAIL && + !delayed_work_pending(&cnet->ecache.dwork)) { + schedule_delayed_work(&cnet->ecache.dwork, HZ); + net->ct.ecache_dwork_pending = true; + } else if (state == NFCT_ECACHE_DESTROY_SENT) { + if (!hlist_nulls_empty(&cnet->ecache.dying_list)) + mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); + else + net->ct.ecache_dwork_pending = false; } - rcu_assign_pointer(net->ct.nf_expect_event_cb, new); - ret = 0; - -out_unlock: - mutex_unlock(&nf_ct_ecache_mutex); - return ret; } -EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); -void nf_ct_expect_unregister_notifier(struct net *net, - struct nf_exp_event_notifier *new) +static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) { - struct nf_exp_event_notifier *notify; +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + u64 ts = 0; - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); - RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); - mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. */ + if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + ts = ktime_get_real_ns(); + + local64_set(&e->timestamp, ts); +#endif } -EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); -#define NF_CT_EVENTS_DEFAULT 1 -static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; +bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; -static const struct nf_ct_ext_type event_extend = { - .len = sizeof(struct nf_conntrack_ecache), - .align = __alignof__(struct nf_conntrack_ecache), - .id = NF_CT_EXT_ECACHE, -}; + switch (net->ct.sysctl_events) { + case 0: + /* assignment via template / ruleset? ignore sysctl. */ + if (ctmask || expmask) + break; + return true; + case 2: /* autodetect: no event listener, don't allocate extension. */ + if (!READ_ONCE(nf_ctnetlink_has_listener)) + return true; + fallthrough; + case 1: + /* always allocate an extension. 
*/ + if (!ctmask && !expmask) { + ctmask = ~0; + expmask = ~0; + } + break; + default: + WARN_ON_ONCE(1); + return true; + } -void nf_conntrack_ecache_pernet_init(struct net *net) -{ - net->ct.sysctl_events = nf_ct_events; - INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); -} + e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + if (e) { + nf_ct_ecache_tstamp_new(ct, e); + e->ctmask = ctmask; + e->expmask = expmask; + } -void nf_conntrack_ecache_pernet_fini(struct net *net) -{ - cancel_delayed_work_sync(&net->ct.ecache_dwork); + return e != NULL; } +EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); -int nf_conntrack_ecache_init(void) +#define NF_CT_EVENTS_DEFAULT 2 +static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; + +void nf_conntrack_ecache_pernet_init(struct net *net) { - int ret = nf_ct_extend_register(&event_extend); - if (ret < 0) - pr_err("Unable to register event extension\n"); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); - BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ + net->ct.sysctl_events = nf_ct_events; - return ret; + INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); + INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); + spin_lock_init(&cnet->ecache.dying_lock); + + BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } -void nf_conntrack_ecache_fini(void) +void nf_conntrack_ecache_pernet_fini(struct net *net) { - nf_ct_extend_unregister(&event_extend); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + cancel_delayed_work_sync(&cnet->ecache.dwork); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 42557d2b6a90..cfc2daa3fc7f 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -17,7 +17,7 @@ #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static unsigned int nf_ct_expect_hashrnd __read_mostly; +static siphash_aligned_key_t nf_ct_expect_hashrnd; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -49,12 +49,15 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - net->ct.expect_count--; + + cnet = nf_ct_pernet(net); + cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; @@ -68,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { - struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); + struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); @@ -78,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t) static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash, seed; + struct { + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u8 l3num; + u8 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; + u32 hash; 
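As an illustrative aside, a minimal sketch of the hashing pattern this expectation-hash hunk switches to: hash a fixed-layout, zeroed struct with a lazily seeded siphash key, then map the result onto the table size with reciprocal_scale(). All names prefixed example_ are invented for illustration and are not part of the patch.

#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/once.h>
#include <linux/siphash.h>
#include <linux/string.h>
#include <linux/types.h>

static siphash_aligned_key_t example_hash_key;

static unsigned int example_dst_hash(const union nf_inet_addr *daddr,
				     __be16 dport, u8 l3num, u8 protonum,
				     unsigned int hsize)
{
	struct {
		union nf_inet_addr dst_addr;
		u16 dport;
		u8 l3num;
		u8 protonum;
	} __aligned(SIPHASH_ALIGNMENT) combined;
	u32 hash;

	/* seed the key once, on first use */
	get_random_once(&example_hash_key, sizeof(example_hash_key));

	/* zero first so padding bytes never feed indeterminate data into the hash */
	memset(&combined, 0, sizeof(combined));
	combined.dst_addr = *daddr;
	combined.dport = (__force u16)dport;
	combined.l3num = l3num;
	combined.protonum = protonum;

	hash = siphash(&combined, sizeof(combined), &example_hash_key);

	/* fold the hash onto [0, hsize) without a modulo */
	return reciprocal_scale(hash, hsize);
}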
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + memset(&combined, 0, sizeof(combined)); + + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(n); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.l3num = tuple->src.l3num; + combined.protonum = tuple->dst.protonum; - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ seed); + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } @@ -104,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; @@ -118,10 +132,11 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -156,12 +171,13 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -187,22 +203,22 @@ nf_ct_find_expectation(struct net *net, * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * - * The atomic_inc_not_zero() check tells: If that fails, we + * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. 
*/ if (unlikely(nf_ct_is_dying(exp->master) || - !atomic_inc_not_zero(&exp->master->ct_general.use))) + !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; - } else if (del_timer(&exp->timeout)) { + } else if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } - /* Undo exp->master refcnt increase, if del_timer() failed */ + /* Undo exp->master refcnt increase, if timer_delete() failed */ nf_ct_put(exp->master); return NULL; @@ -368,6 +384,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { + struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); @@ -389,7 +406,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - net->ct.expect_count++; + cnet = nf_ct_pernet(net); + cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } @@ -415,6 +433,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; + struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; @@ -458,7 +477,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - if (net->ct.expect_count >= nf_ct_expect_max) { + cnet = nf_ct_pernet(net); + if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } @@ -500,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } @@ -530,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net, if (!net_eq(nf_ct_exp_net(exp), net)) continue; - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } @@ -686,7 +706,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - net->ct.expect_count = 0; return exp_proc_init(net); } @@ -703,9 +722,7 @@ int nf_conntrack_expect_init(void) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL); + nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0); if (!nf_ct_expect_cachep) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 3dbe2329c3f1..dd62cc12e775 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -13,40 +13,92 @@ #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_extend.h> -static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; -static DEFINE_MUTEX(nf_ct_ext_type_mutex); +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include 
<net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_synproxy.h> +#include <net/netfilter/nf_conntrack_act_ct.h> +#include <net/netfilter/nf_nat.h> + #define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */ -void nf_ct_ext_destroy(struct nf_conn *ct) +atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1); + +static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { + [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help), +#if IS_ENABLED(CONFIG_NF_NAT) + [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat), +#endif + [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj), + [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct), +#ifdef CONFIG_NF_CONNTRACK_EVENTS + [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache), +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp), +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout), +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels), +#endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy), +#endif +#if IS_ENABLED(CONFIG_NET_ACT_CT) + [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext), +#endif +}; + +static __always_inline unsigned int total_extension_size(void) { - unsigned int i; - struct nf_ct_ext_type *t; - - for (i = 0; i < NF_CT_EXT_NUM; i++) { - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[i]); - - /* Here the nf_ct_ext_type might have been unregisterd. - * I.e., it has responsible to cleanup private - * area in all conntracks when it is unregisterd. - */ - if (t && t->destroy) - t->destroy(ct); - rcu_read_unlock(); - } - - kfree(ct->ext); + /* remember to add new extensions below */ + BUILD_BUG_ON(NF_CT_EXT_NUM > 10); + + return sizeof(struct nf_ct_ext) + + sizeof(struct nf_conn_help) +#if IS_ENABLED(CONFIG_NF_NAT) + + sizeof(struct nf_conn_nat) +#endif + + sizeof(struct nf_conn_seqadj) + + sizeof(struct nf_conn_acct) +#ifdef CONFIG_NF_CONNTRACK_EVENTS + + sizeof(struct nf_conntrack_ecache) +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + sizeof(struct nf_conn_tstamp) +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + + sizeof(struct nf_conn_timeout) +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + + sizeof(struct nf_conn_labels) +#endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + + sizeof(struct nf_conn_synproxy) +#endif +#if IS_ENABLED(CONFIG_NET_ACT_CT) + + sizeof(struct nf_conn_act_ct_ext) +#endif + ; } void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext_type *t; struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. 
*/ WARN_ON(nf_ct_is_confirmed(ct)); + /* struct nf_ct_ext uses u8 to store offsets/size */ + BUILD_BUG_ON(total_extension_size() > 255u); if (ct->ext) { const struct nf_ct_ext *old = ct->ext; @@ -58,24 +110,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) oldlen = sizeof(*new); } - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[id]); - if (!t) { - rcu_read_unlock(); - return NULL; - } - - newoff = ALIGN(oldlen, t->align); - newlen = newoff + t->len; - rcu_read_unlock(); + newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext)); + newlen = newoff + nf_ct_ext_type_len[id]; alloc = max(newlen, NF_CT_EXT_PREALLOC); new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!ct->ext) + if (!ct->ext) { memset(new->offset, 0, sizeof(new->offset)); + new->gen_id = atomic_read(&nf_conntrack_ext_genid); + } new->offset[id] = newoff; new->len = newlen; @@ -86,30 +132,28 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) } EXPORT_SYMBOL(nf_ct_ext_add); -/* This MUST be called in process context. */ -int nf_ct_extend_register(const struct nf_ct_ext_type *type) +/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */ +void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id) { - int ret = 0; + unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid); + unsigned int this_id = READ_ONCE(ext->gen_id); - mutex_lock(&nf_ct_ext_type_mutex); - if (nf_ct_ext_types[type->id]) { - ret = -EBUSY; - goto out; - } + if (!__nf_ct_ext_exist(ext, id)) + return NULL; - rcu_assign_pointer(nf_ct_ext_types[type->id], type); -out: - mutex_unlock(&nf_ct_ext_type_mutex); - return ret; + if (this_id == 0 || ext->gen_id == gen_id) + return (void *)ext + ext->offset[id]; + + return NULL; } -EXPORT_SYMBOL_GPL(nf_ct_extend_register); +EXPORT_SYMBOL(__nf_ct_ext_find); -/* This MUST be called in process context. */ -void nf_ct_extend_unregister(const struct nf_ct_ext_type *type) +void nf_ct_ext_bump_genid(void) { - mutex_lock(&nf_ct_ext_type_mutex); - RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); - mutex_unlock(&nf_ct_ext_type_mutex); - synchronize_rcu(); + unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid); + + if (value == UINT_MAX) + atomic_set(&nf_conntrack_ext_genid, 1); + + msleep(HZ); } -EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 9eca90414bb7..617f744a2e3a 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -33,10 +33,6 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); - -/* This is slow, but it's simple. 
--RR */ -static char *ftp_buffer; - static DEFINE_SPINLOCK(nf_ftp_lock); #define MAX_PORTS 8 @@ -382,7 +378,7 @@ static int help(struct sk_buff *skb, int ret; u32 seq; int dir = CTINFO2DIR(ctinfo); - unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + unsigned int matchlen, matchoff; struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; union nf_inet_addr *daddr; @@ -398,6 +394,9 @@ static int help(struct sk_buff *skb, return NF_ACCEPT; } + if (unlikely(skb_linearize(skb))) + return NF_DROP; + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return NF_ACCEPT; @@ -411,9 +410,9 @@ static int help(struct sk_buff *skb, } datalen = skb->len - dataoff; + /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */ spin_lock_bh(&nf_ftp_lock); - fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); - BUG_ON(fb_ptr == NULL); + fb_ptr = skb->data + dataoff; ends_in_nl = (fb_ptr[datalen - 1] == '\n'); seq = ntohl(th->seq) + datalen; @@ -568,7 +567,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { static void __exit nf_conntrack_ftp_fini(void) { nf_conntrack_helpers_unregister(ftp, ports_c * 2); - kfree(ftp_buffer); } static int __init nf_conntrack_ftp_init(void) @@ -577,10 +575,6 @@ static int __init nf_conntrack_ftp_init(void) NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master)); - ftp_buffer = kmalloc(65536, GFP_KERNEL); - if (!ftp_buffer) - return -ENOMEM; - if (ports_c == 0) ports[ports_c++] = FTP_PORT; @@ -600,7 +594,6 @@ static int __init nf_conntrack_ftp_init(void) ret = nf_conntrack_helpers_register(ftp, ports_c * 2); if (ret < 0) { pr_err("failed to register helpers\n"); - kfree(ftp_buffer); return ret; } diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 573cb4481481..540d97715bd2 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -257,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b) case 4: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 3: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 2: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 1: v |= *bs->cur++; break; @@ -533,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get fields bitmap */ if (nf_h323_error_boundary(bs, 0, f->sz)) return H323_ERROR_BOUND; + if (f->sz > 32) + return H323_ERROR_RANGE; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -589,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, bmp2_len = get_bits(bs, 7) + 1; if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; + if (bmp2_len > 32) + return H323_ERROR_RANGE; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 8ba037b76ad3..14f73872f647 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -34,6 +34,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_h323.h> +#define H323_MAX_SIZE 65535 + /* Parameters */ static unsigned int default_rrq_ttl __read_mostly = 300; module_param(default_rrq_ttl, uint, 0600); @@ -49,64 +51,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " "if both endpoints are on different sides " "(determined by routing 
information)"); -/* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff, - unsigned char **data, int dataoff, - H245_TransportAddress *taddr, - union nf_inet_addr *addr, __be16 port) - __read_mostly; -int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff, - unsigned char **data, int dataoff, - TransportAddress *taddr, - union nf_inet_addr *addr, __be16 port) - __read_mostly; -int (*set_sig_addr_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, unsigned char **data, - TransportAddress *taddr, int count) __read_mostly; -int (*set_ras_addr_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, unsigned char **data, - TransportAddress *taddr, int count) __read_mostly; -int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned char **data, int dataoff, - H245_TransportAddress *taddr, - __be16 port, __be16 rtp_port, - struct nf_conntrack_expect *rtp_exp, - struct nf_conntrack_expect *rtcp_exp) __read_mostly; -int (*nat_t120_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned char **data, int dataoff, - H245_TransportAddress *taddr, __be16 port, - struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_h245_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned char **data, int dataoff, - TransportAddress *taddr, __be16 port, - struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_callforwarding_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned char **data, int dataoff, - TransportAddress *taddr, __be16 port, - struct nf_conntrack_expect *exp) __read_mostly; -int (*nat_q931_hook) (struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned char **data, TransportAddress *taddr, int idx, - __be16 port, struct nf_conntrack_expect *exp) - __read_mostly; +const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfct_h323_nat_hook); static DEFINE_SPINLOCK(nf_h323_lock); static char *h323_buffer; @@ -142,11 +88,15 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, if (tcpdatalen <= 0) /* No TCP data */ goto clear_out; + if (tcpdatalen > H323_MAX_SIZE) + tcpdatalen = H323_MAX_SIZE; + if (*data == NULL) { /* first TPKT */ /* Get first TPKT pointer */ tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, h323_buffer); - BUG_ON(tpkt == NULL); + if (!tpkt) + goto clear_out; /* Validate TPKT identifier */ if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { @@ -193,7 +143,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ pr_debug("nf_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", + "there will be TPKT data of %d bytes\n", tpktlen - 4); info->tpkt_len[dir] = tpktlen - 4; return 0; @@ -258,6 +208,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; @@ -265,7 +216,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, 
struct nf_conn *ct, union nf_inet_addr addr; struct nf_conntrack_expect *rtp_exp; struct nf_conntrack_expect *rtcp_exp; - typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; /* Read RTP or RTCP address */ if (!get_h245_addr(ct, *data, taddr, &addr, &port) || @@ -295,15 +245,16 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_UDP, NULL, &rtcp_port); + nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && - (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, - taddr, port, rtp_port, rtp_exp, rtcp_exp); + ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff, + taddr, port, rtp_port, rtp_exp, rtcp_exp); } else { /* Conntrack only */ if (nf_ct_expect_related(rtp_exp, 0) == 0) { if (nf_ct_expect_related(rtcp_exp, 0) == 0) { @@ -332,12 +283,12 @@ static int expect_t120(struct sk_buff *skb, unsigned char **data, int dataoff, H245_TransportAddress *taddr) { + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; union nf_inet_addr addr; struct nf_conntrack_expect *exp; - typeof(nat_t120_hook) nat_t120; /* Read T.120 address */ if (!get_h245_addr(ct, *data, taddr, &addr, &port) || @@ -354,15 +305,16 @@ static int expect_t120(struct sk_buff *skb, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */ + nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && - (nat_t120 = rcu_dereference(nat_t120_hook)) && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr, - port, exp); + ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data, + dataoff, taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_h323: expect T.120 "); @@ -663,18 +615,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data, return 1; } +EXPORT_SYMBOL_GPL(get_h225_addr); static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned char **data, int dataoff, TransportAddress *taddr) { + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; union nf_inet_addr addr; struct nf_conntrack_expect *exp; - typeof(nat_h245_hook) nat_h245; /* Read h245Address */ if (!get_h225_addr(ct, *data, taddr, &addr, &port) || @@ -691,15 +644,16 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, IPPROTO_TCP, NULL, &port); exp->helper = &nf_conntrack_helper_h245; + nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && - (nat_h245 = rcu_dereference(nat_h245_hook)) && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* NAT needed */ - ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr, - port, exp); + ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data, + dataoff, taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_q931: expect H.245 "); 
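As an illustrative aside, a rough sketch of the other half of this conversion: instead of the removed per-function hook pointers, the NAT module publishes a single RCU-protected ops structure, which the conntrack side picks up with rcu_dereference() as in the hunks above. Everything prefixed example_ is hypothetical, only one callback is filled in, and the struct and hook-pointer declarations are assumed to live in <linux/netfilter/nf_conntrack_h323.h>.

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_h323.h>

/* signature taken from the nat_t120 call site above */
static int example_nat_t120(struct sk_buff *skb, struct nf_conn *ct,
			    enum ip_conntrack_info ctinfo,
			    unsigned int protoff,
			    unsigned char **data, int dataoff,
			    H245_TransportAddress *taddr, __be16 port,
			    struct nf_conntrack_expect *exp)
{
	/* a real implementation rewrites the embedded address/port here */
	return 0;
}

/* only one member shown for brevity; a real module fills them all in */
static const struct nfct_h323_nat_hooks example_h323_nat_hooks = {
	.nat_t120 = example_nat_t120,
};

static int __init example_h323_nat_init(void)
{
	RCU_INIT_POINTER(nfct_h323_nat_hook, &example_h323_nat_hooks);
	return 0;
}

static void __exit example_h323_nat_exit(void)
{
	RCU_INIT_POINTER(nfct_h323_nat_hook, NULL);
	synchronize_rcu();	/* wait for readers still inside rcu_dereference() sections */
}

module_init(example_h323_nat_init);
module_exit(example_h323_nat_exit);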
@@ -784,13 +738,13 @@ static int expect_callforwarding(struct sk_buff *skb, unsigned char **data, int dataoff, TransportAddress *taddr) { + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; union nf_inet_addr addr; struct nf_conntrack_expect *exp; struct net *net = nf_ct_net(ct); - typeof(nat_callforwarding_hook) nat_callforwarding; /* Read alternativeAddress */ if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0) @@ -814,16 +768,17 @@ static int expect_callforwarding(struct sk_buff *skb, IPPROTO_TCP, NULL, &port); exp->helper = nf_conntrack_helper_q931; + nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(ct->tuplehash[dir].tuple.src.u3)) && - (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_callforwarding(skb, ct, ctinfo, - protoff, data, dataoff, - taddr, port, exp); + ret = nathook->nat_callforwarding(skb, ct, ctinfo, + protoff, data, dataoff, + taddr, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_q931: expect Call Forwarding "); @@ -843,12 +798,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, int dataoff, Setup_UUIE *setup) { + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret; int i; __be16 port; union nf_inet_addr addr; - typeof(set_h225_addr_hook) set_h225_addr; pr_debug("nf_ct_q931: Setup\n"); @@ -859,9 +814,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, return -1; } - set_h225_addr = rcu_dereference(set_h225_addr_hook); + nathook = rcu_dereference(nfct_h323_nat_hook); if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->destCallSignalAddress, &addr, &port) && @@ -869,16 +824,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n", &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3, ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); - ret = set_h225_addr(skb, protoff, data, dataoff, - &setup->destCallSignalAddress, - &ct->tuplehash[!dir].tuple.src.u3, - ct->tuplehash[!dir].tuple.src.u.tcp.port); + ret = nathook->set_h225_addr(skb, protoff, data, dataoff, + &setup->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.src.u3, + ct->tuplehash[!dir].tuple.src.u.tcp.port); if (ret < 0) return -1; } if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && - (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->sourceCallSignalAddress, &addr, &port) && @@ -886,10 +841,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n", &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3, ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); - ret = set_h225_addr(skb, protoff, data, dataoff, - &setup->sourceCallSignalAddress, - &ct->tuplehash[!dir].tuple.dst.u3, - ct->tuplehash[!dir].tuple.dst.u.tcp.port); + ret = nathook->set_h225_addr(skb, protoff, data, dataoff, + &setup->sourceCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + 
ct->tuplehash[!dir].tuple.dst.u.tcp.port); if (ret < 0) return -1; } @@ -1219,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NULL; *datalen = skb->len - dataoff; + if (*datalen > H323_MAX_SIZE) + *datalen = H323_MAX_SIZE; + return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); } @@ -1248,13 +1206,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, TransportAddress *taddr, int count) { struct nf_ct_h323_master *info = nfct_help_data(ct); + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret = 0; int i; __be16 port; union nf_inet_addr addr; struct nf_conntrack_expect *exp; - typeof(nat_q931_hook) nat_q931; /* Look for the first related address */ for (i = 0; i < count; i++) { @@ -1278,11 +1236,11 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, exp->helper = nf_conntrack_helper_q931; exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ - nat_q931 = rcu_dereference(nat_q931_hook); - if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* Need NAT */ - ret = nat_q931(skb, ct, ctinfo, protoff, data, - taddr, i, port, exp); + ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data, + taddr, i, port, exp); } else { /* Conntrack only */ if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1304,15 +1262,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct, unsigned int protoff, unsigned char **data, GatekeeperRequest *grq) { - typeof(set_ras_addr_hook) set_ras_addr; + const struct nfct_h323_nat_hooks *nathook; pr_debug("nf_ct_ras: GRQ\n"); - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) /* NATed */ - return set_ras_addr(skb, ct, ctinfo, protoff, data, - &grq->rasAddress, 1); + return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data, + &grq->rasAddress, 1); return 0; } @@ -1366,8 +1324,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, RegistrationRequest *rrq) { struct nf_ct_h323_master *info = nfct_help_data(ct); + const struct nfct_h323_nat_hooks *nathook; int ret; - typeof(set_ras_addr_hook) set_ras_addr; pr_debug("nf_ct_ras: RRQ\n"); @@ -1377,12 +1335,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, if (ret < 0) return -1; - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(skb, ct, ctinfo, protoff, data, - rrq->rasAddress.item, - rrq->rasAddress.count); + ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data, + rrq->rasAddress.item, + rrq->rasAddress.count); if (ret < 0) return -1; } @@ -1402,19 +1360,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, RegistrationConfirm *rcf) { struct nf_ct_h323_master *info = nfct_help_data(ct); + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret; struct nf_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; pr_debug("nf_ct_ras: RCF\n"); - set_sig_addr = 
rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, protoff, data, - rcf->callSignalAddress.item, - rcf->callSignalAddress.count); + ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data, + rcf->callSignalAddress.item, + rcf->callSignalAddress.count); if (ret < 0) return -1; } @@ -1427,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, if (info->timeout > 0) { pr_debug("nf_ct_ras: set RAS connection timeout to " "%u seconds\n", info->timeout); - nf_ct_refresh(ct, skb, info->timeout * HZ); + nf_ct_refresh(ct, info->timeout * HZ); /* Set expect timeout */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -1453,18 +1411,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, UnregistrationRequest *urq) { struct nf_ct_h323_master *info = nfct_help_data(ct); + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); int ret; - typeof(set_sig_addr_hook) set_sig_addr; pr_debug("nf_ct_ras: URQ\n"); - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, protoff, data, - urq->callSignalAddress.item, - urq->callSignalAddress.count); + ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data, + urq->callSignalAddress.item, + urq->callSignalAddress.count); if (ret < 0) return -1; } @@ -1475,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, skb, 30 * HZ); + nf_ct_refresh(ct, 30 * HZ); return 0; } @@ -1486,39 +1444,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, unsigned char **data, AdmissionRequest *arq) { const struct nf_ct_h323_master *info = nfct_help_data(ct); + const struct nfct_h323_nat_hooks *nathook; int dir = CTINFO2DIR(ctinfo); __be16 port; union nf_inet_addr addr; - typeof(set_h225_addr_hook) set_h225_addr; pr_debug("nf_ct_ras: ARQ\n"); - set_h225_addr = rcu_dereference(set_h225_addr_hook); + nathook = rcu_dereference(nfct_h323_nat_hook); + if (!nathook) + return 0; + if ((arq->options & eAdmissionRequest_destCallSignalAddress) && get_h225_addr(ct, *data, &arq->destCallSignalAddress, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && port == info->sig_port[dir] && nf_ct_l3num(ct) == NFPROTO_IPV4 && - set_h225_addr && ct->status & IPS_NAT_MASK) { + ct->status & IPS_NAT_MASK) { /* Answering ARQ */ - return set_h225_addr(skb, protoff, data, 0, - &arq->destCallSignalAddress, - &ct->tuplehash[!dir].tuple.dst.u3, - info->sig_port[!dir]); + return nathook->set_h225_addr(skb, protoff, data, 0, + &arq->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir]); } if ((arq->options & eAdmissionRequest_srcCallSignalAddress) && get_h225_addr(ct, *data, &arq->srcCallSignalAddress, &addr, &port) && !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && - set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { /* Calling ARQ */ - return set_h225_addr(skb, protoff, data, 0, - &arq->srcCallSignalAddress, - &ct->tuplehash[!dir].tuple.dst.u3, - 
port); + return nathook->set_h225_addr(skb, protoff, data, 0, + &arq->srcCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + port); } return 0; @@ -1534,7 +1495,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, __be16 port; union nf_inet_addr addr; struct nf_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; pr_debug("nf_ct_ras: ACF\n"); @@ -1543,12 +1503,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, return 0; if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) { + const struct nfct_h323_nat_hooks *nathook; + /* Answering ACF */ - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) - return set_sig_addr(skb, ct, ctinfo, protoff, data, - &acf->destCallSignalAddress, 1); + return nathook->set_sig_addr(skb, ct, ctinfo, protoff, + data, + &acf->destCallSignalAddress, 1); return 0; } @@ -1577,15 +1540,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, unsigned int protoff, unsigned char **data, LocationRequest *lrq) { - typeof(set_ras_addr_hook) set_ras_addr; + const struct nfct_h323_nat_hooks *nathook; pr_debug("nf_ct_ras: LRQ\n"); - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) - return set_ras_addr(skb, ct, ctinfo, protoff, data, - &lrq->replyAddress, 1); + return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data, + &lrq->replyAddress, 1); return 0; } @@ -1633,27 +1596,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct, unsigned int protoff, unsigned char **data, InfoRequestResponse *irr) { + const struct nfct_h323_nat_hooks *nathook; int ret; - typeof(set_ras_addr_hook) set_ras_addr; - typeof(set_sig_addr_hook) set_sig_addr; pr_debug("nf_ct_ras: IRR\n"); - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && + nathook = rcu_dereference(nfct_h323_nat_hook); + if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) { - ret = set_ras_addr(skb, ct, ctinfo, protoff, data, - &irr->rasAddress, 1); + ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data, + &irr->rasAddress, 1); if (ret < 0) return -1; - } - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 && - ct->status & IPS_NAT_MASK) { - ret = set_sig_addr(skb, ct, ctinfo, protoff, data, - irr->callSignalAddress.item, - irr->callSignalAddress.count); + ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data, + irr->callSignalAddress.item, + irr->callSignalAddress.count); if (ret < 0) return -1; } @@ -1820,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void) NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master)); - h323_buffer = kmalloc(65536, GFP_KERNEL); + h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL); if (!h323_buffer) return -ENOMEM; ret = h323_helper_init(); @@ -1836,17 +1794,6 @@ err1: module_init(nf_conntrack_h323_init); module_exit(nf_conntrack_h323_fini); -EXPORT_SYMBOL_GPL(get_h225_addr); -EXPORT_SYMBOL_GPL(set_h245_addr_hook); -EXPORT_SYMBOL_GPL(set_h225_addr_hook); -EXPORT_SYMBOL_GPL(set_sig_addr_hook); -EXPORT_SYMBOL_GPL(set_ras_addr_hook); -EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); 
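Two of the changes above work together: get_udp_data() now clamps the amount of payload it will linearize to H323_MAX_SIZE, and h323_buffer is allocated as H323_MAX_SIZE + 1 bytes instead of a hard-coded 65536, so the copy can never outgrow the scratch buffer. A condensed sketch of that bounded-read pattern; the H323_MAX_SIZE value below is only a placeholder, the real definition lives elsewhere in the patch.

/* Bounded linearization: never hand skb_header_pointer() a length larger
 * than the preallocated scratch buffer.
 */
#ifndef H323_MAX_SIZE
#define H323_MAX_SIZE 65535		/* placeholder value for this sketch */
#endif

static unsigned char *h323_buffer;	/* kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL) */

static unsigned char *get_bounded_data(struct sk_buff *skb,
				       unsigned int dataoff, int *datalen)
{
	if (dataoff >= skb->len)
		return NULL;

	*datalen = skb->len - dataoff;
	if (*datalen > H323_MAX_SIZE)
		*datalen = H323_MAX_SIZE;	/* clamp to buffer size */

	/* copies into h323_buffer only when the range is not already linear */
	return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
}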
-EXPORT_SYMBOL_GPL(nat_t120_hook); -EXPORT_SYMBOL_GPL(nat_h245_hook); -EXPORT_SYMBOL_GPL(nat_callforwarding_hook); -EXPORT_SYMBOL_GPL(nat_q931_hook); - MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); MODULE_DESCRIPTION("H.323 connection tracking helper"); MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 118f415928ae..ceb48c3ca0a4 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -26,7 +26,9 @@ #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_log.h> +#include <net/ip.h> static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; @@ -35,11 +37,6 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = false; -module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); -MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 0)"); - static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; @@ -51,24 +48,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } -static struct nf_conntrack_helper * -__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) -{ - struct nf_conntrack_helper *helper; - struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - unsigned int h; - - if (!nf_ct_helper_count) - return NULL; - - h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) - return helper; - } - return NULL; -} - struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { @@ -165,7 +144,7 @@ nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum) if (!nat) { snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name); rcu_read_unlock(); - request_module(mod_name); + request_module("%s", mod_name); rcu_read_lock(); nat = nf_conntrack_nat_helper_find(mod_name); @@ -209,59 +188,31 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -static struct nf_conntrack_helper * -nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) -{ - if (!net->ct.sysctl_auto_assign_helper) { - if (net->ct.auto_assign_helper_warned) - return NULL; - if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) - return NULL; - pr_info("nf_conntrack: default automatic helper assignment " - "has been turned off for security reasons and CT-based " - " firewall rule not found. Use the iptables CT target " - "to attach helpers instead.\n"); - net->ct.auto_assign_helper_warned = 1; - return NULL; - } - - return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -} - - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - struct net *net = nf_ct_net(ct); - /* We already got a helper explicitly attached. The function - * nf_conntrack_alter_reply - in case NAT is in use - asks for looking - * the helper up again. 
Since now the user is in full control of - * making consistent helper configurations, skip this automatic - * re-lookup, otherwise we'll lose the helper. - */ + /* We already got a helper explicitly attached (e.g. nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; - if (tmpl != NULL) { - help = nfct_help(tmpl); - if (help != NULL) { - helper = help->helper; - set_bit(IPS_HELPER_BIT, &ct->status); - } + if (WARN_ON_ONCE(!tmpl)) + return 0; + + help = nfct_help(tmpl); + if (help != NULL) { + helper = rcu_dereference(help->helper); + set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { - helper = nf_ct_lookup_helper(ct, net); - if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; - } + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; } if (help == NULL) { @@ -404,6 +355,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (!nf_ct_helper_hash) + return -ENOENT; + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; @@ -414,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -425,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -467,11 +421,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); - - /* Maybe someone has gotten the helper already when unhelp above. - * So need to wait it. 
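With the automatic helper lookup gone, __nf_ct_try_assign_helper() above only copies a helper from the conntrack template, i.e. from a helper the administrator attached explicitly (iptables CT target or an nftables ct helper object). A condensed restatement of the new control flow, reconstructed from the hunk; the trailing extension handling is elided.

/* Template-only helper assignment, condensed from the hunk above. */
static int try_assign_helper_sketch(struct nf_conn *ct, struct nf_conn *tmpl)
{
	struct nf_conntrack_helper *helper = NULL;
	struct nf_conn_help *help;

	if (test_bit(IPS_HELPER_BIT, &ct->status))
		return 0;			/* already attached explicitly */

	if (WARN_ON_ONCE(!tmpl))
		return 0;			/* no template, nothing to copy */

	help = nfct_help(tmpl);
	if (help) {
		helper = rcu_dereference(help->helper);
		set_bit(IPS_HELPER_BIT, &ct->status);
	}

	help = nfct_help(ct);
	if (!helper) {				/* template carries no helper */
		if (help)
			RCU_INIT_POINTER(help->helper, NULL);
		return 0;
	}

	/* ...allocate the help extension if needed and point it at helper... */
	return 0;
}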
- */ - synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); @@ -549,42 +498,20 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); -static const struct nf_ct_ext_type helper_extend = { - .len = sizeof(struct nf_conn_help), - .align = __alignof__(struct nf_conn_help), - .id = NF_CT_EXT_HELPER, -}; - -void nf_conntrack_helper_pernet_init(struct net *net) -{ - net->ct.auto_assign_helper_warned = false; - net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; -} - int nf_conntrack_helper_init(void) { - int ret; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; - ret = nf_ct_extend_register(&helper_extend); - if (ret < 0) { - pr_err("nf_ct_helper: Unable to register helper extension.\n"); - goto out_extend; - } - INIT_LIST_HEAD(&nf_ct_nat_helpers); return 0; -out_extend: - kvfree(nf_ct_helper_hash); - return ret; } void nf_conntrack_helper_fini(void) { - nf_ct_extend_unregister(&helper_extend); kvfree(nf_ct_helper_hash); + nf_ct_helper_hash = NULL; } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index e40988a2f22f..5703846bea3b 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_irc_hook); #define HELPER_NAME "irc" +#define MAX_SEARCH_SIZE 4095 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); @@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, int i, ret = NF_ACCEPT; char *addr_beg_p, *addr_end_p; typeof(nf_nat_irc_hook) nf_nat_irc; + unsigned int datalen; /* If packet is coming from IRC server */ if (dir == IP_CT_DIR_REPLY) @@ -140,23 +142,52 @@ static int help(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; + datalen = skb->len - dataoff; + if (datalen > MAX_SEARCH_SIZE) + datalen = MAX_SEARCH_SIZE; + spin_lock_bh(&irc_buffer_lock); - ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, + ib_ptr = skb_header_pointer(skb, dataoff, datalen, irc_buffer); - BUG_ON(ib_ptr == NULL); + if (!ib_ptr) { + spin_unlock_bh(&irc_buffer_lock); + return NF_ACCEPT; + } data = ib_ptr; - data_limit = ib_ptr + skb->len - dataoff; + data_limit = ib_ptr + datalen; + + /* Skip any whitespace */ + while (data < data_limit - 10) { + if (*data == ' ' || *data == '\r' || *data == '\n') + data++; + else + break; + } + + /* strlen("PRIVMSG x ")=10 */ + if (data < data_limit - 10) { + if (strncasecmp("PRIVMSG ", data, 8)) + goto out; + data += 8; + } - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < data_limit - (19 + MINMATCHLEN)) { - if (memcmp(data, "\1DCC ", 5)) { + /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26 + * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26 + */ + while (data < data_limit - (21 + MINMATCHLEN)) { + /* Find first " :", the start of message */ + if (memcmp(data, " :", 2)) { data++; continue; } + data += 2; + + /* then check that place only for the DCC command */ + if (memcmp(data, "\1DCC ", 5)) + goto out; data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */ iph = ip_hdr(skb); pr_debug("DCC found in master %pI4:%u %pI4:%u\n", @@ -172,7 +203,7 @@ 
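The IRC helper changes above bound the searched payload to MAX_SEARCH_SIZE and stop scanning the whole message for "\1DCC ": the request must now follow a "PRIVMSG <target> " command and sit right at the start of the message body introduced by " :". The following standalone sketch mirrors that scan with simplified bounds; it is not a copy of the helper's code.

/* Simplified restatement of the stricter DCC detection shown above. */
#include <stdbool.h>
#include <string.h>
#include <strings.h>

static bool looks_like_dcc(const char *data, const char *data_limit)
{
	/* skip leading whitespace, as the helper does */
	while (data < data_limit - 10 &&
	       (*data == ' ' || *data == '\r' || *data == '\n'))
		data++;

	/* require the PRIVMSG command, strlen("PRIVMSG x ") == 10 */
	if (data >= data_limit - 10 || strncasecmp(data, "PRIVMSG ", 8))
		return false;
	data += 8;

	/* find " :", the start of the message body; the DCC marker must
	 * appear right there and nowhere else
	 */
	while (data < data_limit - 7) {
		if (memcmp(data, " :", 2)) {
			data++;
			continue;
		}
		data += 2;
		return !memcmp(data, "\1DCC ", 5);
	}
	return false;
}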
static int help(struct sk_buff *skb, unsigned int protoff, pr_debug("DCC %s detected\n", dccprotos[i]); /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid * data left (== 14/13 bytes) */ if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { @@ -185,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != dcc_ip && - tuple->dst.u3.ip != dcc_ip) { + if ((tuple->src.u3.ip != dcc_ip && + ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || + dcc_port == 0) { net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); @@ -248,7 +280,7 @@ static int __init nf_conntrack_irc_init(void) irc_exp_policy.max_expected = max_dcc_channels; irc_exp_policy.timeout = dcc_timeout; - irc_buffer = kmalloc(65536, GFP_KERNEL); + irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL); if (!irc_buffer) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 522792556632..6c46aad23313 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -11,8 +11,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> -static DEFINE_SPINLOCK(nf_connlabels_lock); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -60,12 +58,15 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { + int v; + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; - spin_lock(&nf_connlabels_lock); - net->ct.labels_used++; - spin_unlock(&nf_connlabels_lock); + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + + v = atomic_inc_return_relaxed(&net->ct.labels_used); + WARN_ON_ONCE(v <= 0); return 0; } @@ -73,26 +74,8 @@ EXPORT_SYMBOL_GPL(nf_connlabels_get); void nf_connlabels_put(struct net *net) { - spin_lock(&nf_connlabels_lock); - net->ct.labels_used--; - spin_unlock(&nf_connlabels_lock); -} -EXPORT_SYMBOL_GPL(nf_connlabels_put); - -static const struct nf_ct_ext_type labels_extend = { - .len = sizeof(struct nf_conn_labels), - .align = __alignof__(struct nf_conn_labels), - .id = NF_CT_EXT_LABELS, -}; - -int nf_conntrack_labels_init(void) -{ - BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + int v = atomic_dec_return_relaxed(&net->ct.labels_used); - return nf_ct_extend_register(&labels_extend); -} - -void nf_conntrack_labels_fini(void) -{ - nf_ct_extend_unregister(&labels_extend); + WARN_ON_ONCE(v < 0); } +EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 7f19ee259609..55415f011943 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -20,13 +20,14 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#define HELPER_NAME "netbios-ns" #define NMBD_PORT 137 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_netbios_ns"); -MODULE_ALIAS_NFCT_HELPER("netbios_ns"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); static unsigned int timeout __read_mostly = 3; module_param(timeout, uint, 0400); @@ -44,7 +45,7 @@ static int 
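nf_connlabels_get()/nf_connlabels_put() above drop the global nf_connlabels_lock: the per-netns labels_used field is only a usage count, so a relaxed atomic increment/decrement is sufficient, with WARN_ON_ONCE() catching get/put imbalance. The sketch below shows just that counting pattern; the surrounding per-netns structure is elided.

/* Relaxed-atomic usage counting, as in the connlabels change above. */
static int labels_get_sketch(atomic_t *labels_used)
{
	int v = atomic_inc_return_relaxed(labels_used);

	WARN_ON_ONCE(v <= 0);		/* overflow or a prior underflow */
	return 0;
}

static void labels_put_sketch(atomic_t *labels_used)
{
	int v = atomic_dec_return_relaxed(labels_used);

	WARN_ON_ONCE(v < 0);		/* more puts than gets */
}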
netbios_ns_help(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper helper __read_mostly = { - .name = "netbios-ns", + .name = HELPER_NAME, .tuple.src.l3num = NFPROTO_IPV4, .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d7bd8b1f27d5..3a04665adf99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -57,6 +57,13 @@ #include "nf_internals.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("List and change connection tracking table"); + +struct ctnetlink_list_dump_ctx { + unsigned long last_id; + unsigned int cpu; + bool done; +}; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, @@ -167,9 +174,18 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, + bool skip_zero) { - long timeout = nf_ct_expires(ct) / HZ; + long timeout; + + if (nf_ct_is_confirmed(ct)) + timeout = nf_ct_expires(ct) / HZ; + else + timeout = ct->timeout / HZ; + + if (skip_zero && timeout == 0) + return 0; if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; @@ -179,7 +195,8 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, + bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -193,7 +210,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) if (!nest_proto) goto nla_put_failure; - ret = l4proto->to_nlattr(skb, nest_proto, ct); + ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); @@ -213,6 +230,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb, if (!help) return 0; + rcu_read_lock(); helper = rcu_dereference(help->helper); if (!helper) goto out; @@ -228,9 +246,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb, nla_nest_end(skb, nest_helper); out: + rcu_read_unlock(); return 0; nla_put_failure: + rcu_read_unlock(); return -1; } @@ -314,9 +334,15 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct, + bool dump) { - if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + u32 mark = READ_ONCE(ct->mark); + + if (!mark && !dump) + return 0; + + if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; @@ -324,18 +350,18 @@ nla_put_failure: return -1; } #else -#define ctnetlink_dump_mark(a, b) (0) +#define ctnetlink_dump_mark(a, b, c) (0) #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return 0; ret = -1; @@ -343,20 +369,37 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, 
nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif -#ifdef CONFIG_NF_CONNTRACK_LABELS +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + + if (e) { + u64 ts = local64_read(&e->timestamp); + + if (ts) + return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, + cpu_to_be64(ts), CTA_TIMESTAMP_PAD); + } +#endif + return 0; +} + static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -365,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) return 0; return nla_total_size(sizeof(labels->bits)); } +#endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) @@ -385,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -#else -#define ctnetlink_dump_labels(a, b) (0) -#define ctnetlink_label_size(a) (0) -#endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -500,7 +540,7 @@ nla_put_failure: static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; @@ -529,7 +569,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct, true) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -537,8 +577,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + (ctnetlink_dump_timeout(skb, ct, false) < 0 || + ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; @@ -550,22 +590,17 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -631,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } -#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { @@ -646,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK - int len, ret; + int ret; - ret = security_secid_to_secctx(ct->secmark, NULL, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, NULL); + if (ret < 0) 
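Several message builders above replace the open-coded pair of nlmsg_put() plus a by-hand fill of struct nfgenmsg (nfgen_family, version, res_id) with a single nfnl_msg_put() call. A condensed fill function in the new style; apart from the header call, which is taken from the hunks, the attribute and error handling are illustrative only.

/* Building an nfnetlink message with nfnl_msg_put(); sketch only. */
static int fill_example(struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int event, unsigned int flags,
			const struct nf_conn *ct)
{
	struct nlmsghdr *nlh;

	/* writes the netlink header and the nfgenmsg header in one go;
	 * the last argument is res_id, e.g. htons(cpu) for per-cpu stats
	 */
	nlh = nfnl_msg_put(skb, portid, seq, event, flags,
			   nf_ct_l3num(ct), NFNETLINK_V0, 0);
	if (!nlh)
		return -EMSGSIZE;

	if (nla_put_be32(skb, CTA_USE,
			 htonl(refcount_read(&ct->ct_general.use))))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}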
return 0; return nla_total_size(0) /* CTA_SECCTX */ - + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ + + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif @@ -669,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) return 0; #endif } +#endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) @@ -699,16 +734,18 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ +#endif ; } static int -ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) +ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) { const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; @@ -738,15 +775,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -780,15 +813,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_timeout(skb, ct, true) < 0) + goto nla_put_failure; + if (ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0) + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct, true) < 0) goto nla_put_failure; } else { - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; - if (events & (1 << IPCT_PROTOINFO) - && ctnetlink_dump_protoinfo(skb, ct) < 0) + if (events & (1 << IPCT_PROTOINFO) && + ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) @@ -818,10 +855,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & (1 << IPCT_MARK) || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) + if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif + + if (ctnetlink_dump_event_timestamp(skb, ct)) + goto nla_put_failure; + nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); @@ -844,15 +884,18 @@ errout: static int ctnetlink_done(struct netlink_callback *cb) { - if (cb->args[1]) - nf_ct_put((struct nf_conn *)cb->args[1]); kfree(cb->data); return 0; } +struct ctnetlink_filter_u32 { + u32 val; + u32 mask; +}; + struct ctnetlink_filter { - u_int32_t cta_flags; u8 family; + bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; @@ -861,10 +904,8 @@ struct ctnetlink_filter { struct nf_conntrack_tuple reply; struct nf_conntrack_zone zone; - struct { - u_int32_t val; - u_int32_t mask; - } mark; + struct ctnetlink_filter_u32 mark; + struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { @@ -906,9 
+947,45 @@ static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_zone *zone, u_int32_t flags); -/* applied on filters */ -#define CTA_FILTER_F_CTA_MARK (1 << 0) -#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) +static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, + const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK]) { + mark->val = ntohl(nla_get_be32(cda[CTA_MARK])); + + if (cda[CTA_MARK_MASK]) + mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + else + mark->mask = 0xffffffff; + } else if (cda[CTA_MARK_MASK]) { + return -EINVAL; + } +#endif + return 0; +} + +static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, + const struct nlattr * const cda[]) +{ + if (cda[CTA_STATUS]) { + status->val = ntohl(nla_get_be32(cda[CTA_STATUS])); + if (cda[CTA_STATUS_MASK]) + status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); + else + status->mask = status->val; + + /* status->val == 0? always true, else always false. */ + if (status->mask == 0) + return -EINVAL; + } else if (cda[CTA_STATUS_MASK]) { + return -EINVAL; + } + + /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ + BUILD_BUG_ON(__IPS_MAX_BIT >= 32); + return 0; +} static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) @@ -927,35 +1004,33 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; -#ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK]) { - filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + err = ctnetlink_filter_parse_mark(&filter->mark, cda); + if (err) + goto err_filter; - if (cda[CTA_MARK_MASK]) { - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); - filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); - } else { - filter->mark.mask = 0xffffffff; - } - } else if (cda[CTA_MARK_MASK]) { - return ERR_PTR(-EINVAL); + err = ctnetlink_filter_parse_status(&filter->status, cda); + if (err) + goto err_filter; + + if (cda[CTA_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + goto err_filter; + filter->zone_filter = true; } -#endif + if (!cda[CTA_FILTER]) return filter; - err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); - if (err < 0) - return ERR_PTR(err); - err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) - return ERR_PTR(err); + goto err_filter; if (filter->orig_flags) { - if (!cda[CTA_TUPLE_ORIG]) - return ERR_PTR(-EINVAL); + if (!cda[CTA_TUPLE_ORIG]) { + err = -EINVAL; + goto err_filter; + } err = ctnetlink_parse_tuple_filter(cda, &filter->orig, CTA_TUPLE_ORIG, @@ -963,28 +1038,35 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) &filter->zone, filter->orig_flags); if (err < 0) - return ERR_PTR(err); + goto err_filter; } if (filter->reply_flags) { - if (!cda[CTA_TUPLE_REPLY]) - return ERR_PTR(-EINVAL); + if (!cda[CTA_TUPLE_REPLY]) { + err = -EINVAL; + goto err_filter; + } err = ctnetlink_parse_tuple_filter(cda, &filter->reply, CTA_TUPLE_REPLY, filter->family, &filter->zone, - filter->orig_flags); + filter->reply_flags); if (err < 0) - return ERR_PTR(err); + goto err_filter; } return filter; + +err_filter: + kfree(filter); + + return ERR_PTR(err); } static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || 
cda[CTA_ZONE]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1077,6 +1159,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; + u32 status; if (filter == NULL) goto out; @@ -1088,6 +1171,10 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->zone_filter && + !nf_ct_zone_equal_any(ct, &filter->zone)) + goto ignore_entry; + if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, @@ -1105,13 +1192,12 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && - (ct->mark & filter->mark.mask) != filter->mark.val) - goto ignore_entry; - else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && - ct->mark != filter->mark.val) + if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif + status = (u32)READ_ONCE(ct->status); + if ((status & filter->status.mask) != filter->status.val) + goto ignore_entry; out: return 1; @@ -1120,19 +1206,26 @@ ignore_entry: return 0; } +static unsigned long ctnetlink_get_id(const struct nf_conn *ct) +{ + unsigned long id = nf_ct_get_id(ct); + + return id ? id : 1; +} + static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); - struct nf_conn *ct, *last; + unsigned long last_id = cb->args[1]; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; + struct nf_conn *ct; int res, i; spinlock_t *lockp; - last = (struct nf_conn *)cb->args[1]; i = 0; local_bh_disable(); @@ -1153,12 +1246,11 @@ restart: } hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { + /* need to defer nf_ct_kill() until lock is released */ if (i < ARRAY_SIZE(nf_ct_evict) && - atomic_inc_not_zero(&ct->ct_general.use)) + refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } @@ -1166,8 +1258,11 @@ restart: if (!net_eq(net, nf_ct_net(ct))) continue; + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + if (cb->args[1]) { - if (ct != last) + if (ctnetlink_get_id(ct) != last_id) continue; cb->args[1] = 0; } @@ -1180,8 +1275,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; + cb->args[1] = ctnetlink_get_id(ct); spin_unlock(lockp); goto out; } @@ -1194,12 +1288,10 @@ restart: } out: local_bh_enable(); - if (last) { + if (last_id) { /* nf ct hash resize happened, now clear the leftover. 
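The new CTA_STATUS/CTA_STATUS_MASK filter above uses the usual mask/value convention: a conntrack entry matches when (status & mask) == val, the mask defaults to the value itself, and an all-zero mask is rejected because it could only ever match everything or nothing. A small sketch of that logic with invented function names:

/* Mask/value matching as used by the status filter above. */
struct u32_filter_sketch {
	u32 val;
	u32 mask;
};

static int parse_status_filter(struct u32_filter_sketch *f,
			       u32 val, bool have_mask, u32 mask)
{
	f->val = val;
	f->mask = have_mask ? mask : val;	/* default: the set bits themselves */

	return f->mask ? 0 : -EINVAL;		/* a zero mask cannot filter anything */
}

static bool status_matches(const struct u32_filter_sketch *f, unsigned long status)
{
	return ((u32)status & f->mask) == f->val;
}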
*/ - if ((struct nf_conn *)cb->args[1] == last) + if (cb->args[1] == last_id) cb->args[1] = 0; - - nf_ct_put(last); } while (i) { @@ -1261,15 +1353,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, + cta_ip_nla_policy, NULL); if (ret < 0) return ret; - ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); - if (ret) - return ret; - switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); @@ -1392,7 +1480,8 @@ ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], if (err < 0) return err; - + if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6) + return -EOPNOTSUPP; tuple->src.l3num = l3num; if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || @@ -1493,13 +1582,12 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, + [CTA_STATUS_MASK] = { .type = NLA_U32 }, + [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) { - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) - return 0; - return ctnetlink_filter_match(ct, data); } @@ -1508,68 +1596,64 @@ static int ctnetlink_flush_conntrack(struct net *net, u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; + struct nf_ct_iter_data iter = { + .net = net, + .portid = portid, + .report = report, + }; if (ctnetlink_needs_filter(family, cda)) { - if (cda[CTA_FILTER]) - return -EOPNOTSUPP; - filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); + + iter.data = filter; } - nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter, - portid, report); + nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter); kfree(filter); return 0; } -static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; - if (cda[CTA_TUPLE_ORIG]) + if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - nfmsg->nfgen_family, &zone); - else if (cda[CTA_TUPLE_REPLY]) + family, &zone); + else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - nfmsg->nfgen_family, &zone); + family, &zone); else { - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? 
family : AF_UNSPEC; - return ctnetlink_flush_conntrack(net, cda, + return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, - nlmsg_report(nlh), u3); + nlmsg_report(info->nlh), u3); } if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { - nf_ct_put(ct); - return -EBUSY; - } - if (cda[CTA_ID]) { __be32 id = nla_get_be32(cda[CTA_ID]); @@ -1579,28 +1663,25 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); return 0; } -static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct sk_buff *skb2 = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; + struct nf_conn *ct; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = ctnetlink_start, .dump = ctnetlink_dump_table, @@ -1608,7 +1689,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, .data = (void *)cda, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1627,158 +1708,144 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, + true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); +} - return 0; +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int ctnetlink_dump_one_entry(struct sk_buff *skb, + struct netlink_callback *cb, + struct nf_conn *ct, + bool dying) +{ + struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u8 l3proto = nfmsg->nfgen_family; + int res; -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? 
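The request handlers above switch from the old six-argument callback (net, ctnl, skb, nlh, cda, extack) to the nfnl_info bundle, from which the handler pulls the namespace, socket, nlmsghdr and nfgenmsg as needed. A skeletal handler in the new style; everything except the info-> accesses, which mirror the hunks, is illustrative.

/* Skeleton of an nfnetlink handler using the nfnl_info convention. */
static int example_get(struct sk_buff *skb, const struct nfnl_info *info,
		       const struct nlattr * const cda[])
{
	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = ctnetlink_dump_table,	/* dump callback from the patch */
		};

		return netlink_dump_start(info->sk, skb, info->nlh, &c);
	}

	/* non-dump path: the family is info->nfmsg->nfgen_family, the
	 * namespace is info->net, and replies go via nfnetlink_unicast()
	 */
	return -EOPNOTSUPP;
}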
-ENOBUFS : err; + if (l3proto && nf_ct_l3num(ct) != l3proto) + return 0; + + if (ctx->last_id) { + if (ctnetlink_get_id(ct) != ctx->last_id) + return 0; + + ctx->last_id = 0; + } + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ + res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct, dying, 0); + if (res < 0) + ctx->last_id = ctnetlink_get_id(ct); + + return res; } +#endif -static int ctnetlink_done_list(struct netlink_callback *cb) +static int +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) { - if (cb->args[1]) - nf_ct_put((struct nf_conn *)cb->args[1]); return 0; } static int -ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conn *ct, *last; + struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + const struct net *net = sock_net(skb->sk); + struct nf_conntrack_net_ecache *ecache_net; + unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - u_int8_t l3proto = nfmsg->nfgen_family; - int res; - int cpu; - struct hlist_nulls_head *list; - struct net *net = sock_net(skb->sk); +#endif - if (cb->args[2]) + if (ctx->done) return 0; - last = (struct nf_conn *)cb->args[1]; + ctx->last_id = 0; - for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { - struct ct_pcpu *pcpu; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + ecache_net = nf_conn_pernet_ecache(net); + spin_lock_bh(&ecache_net->dying_lock); - if (!cpu_possible(cpu)) - continue; + hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) { + struct nf_conn *ct; + int res; - pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - spin_lock_bh(&pcpu->lock); - list = dying ? &pcpu->dying : &pcpu->unconfirmed; -restart: - hlist_nulls_for_each_entry(h, n, list, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (l3proto && nf_ct_l3num(ct) != l3proto) - continue; - if (cb->args[1]) { - if (ct != last) - continue; - cb->args[1] = 0; - } + ct = nf_ct_tuplehash_to_ctrack(h); + if (last_id && last_id != ctnetlink_get_id(ct)) + continue; - /* We can't dump extension info for the unconfirmed - * list because unconfirmed conntracks can have - * ct->ext reallocated (and thus freed). - * - * In the dying list case ct->ext can't be free'd - * until after we drop pcpu->lock. - */ - res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? 
true : false, 0); - if (res < 0) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) - continue; - cb->args[0] = cpu; - cb->args[1] = (unsigned long)ct; - spin_unlock_bh(&pcpu->lock); - goto out; - } - } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; + res = ctnetlink_dump_one_entry(skb, cb, ct, true); + if (res < 0) { + spin_unlock_bh(&ecache_net->dying_lock); + return skb->len; } - spin_unlock_bh(&pcpu->lock); + + last_id = 0; } - cb->args[2] = 1; -out: - if (last) - nf_ct_put(last); - return skb->len; -} + spin_unlock_bh(&ecache_net->dying_lock); +#endif + ctx->done = true; -static int -ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) -{ - return ctnetlink_dump_list(skb, cb, true); + return skb->len; } -static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_dying(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, - .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; } -static int -ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) -{ - return ctnetlink_dump_list(skb, cb, false); -} - -static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, - .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; @@ -1791,7 +1858,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, const struct nlattr *attr) __must_hold(RCU) { - struct nf_nat_hook *nat_hook; + const struct nf_nat_hook *nat_hook; int err; nat_hook = rcu_dereference(nf_nat_hook); @@ -1833,45 +1900,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static void -__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, - unsigned long off) -{ - unsigned int bit; - - /* Ignore these unchangable bits */ - on &= ~IPS_UNCHANGEABLE_MASK; - off &= ~IPS_UNCHANGEABLE_MASK; - - for (bit = 0; bit < __IPS_MAX_BIT; bit++) { - if (on & (1 << bit)) - set_bit(bit, &ct->status); - else if (off & (1 << bit)) - clear_bit(bit, &ct->status); - } -} - static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { - unsigned long d; - unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); - d = ct->status ^ status; - - if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) - /* unchangeable */ - return -EBUSY; - - if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) - /* SEEN_REPLY bit can only be set */ - return -EBUSY; - - if (d & IPS_ASSURED && !(status & IPS_ASSURED)) - /* ASSURED bit can only be set */ - return -EBUSY; - - __ctnetlink_change_status(ct, status, 0); - return 0; + return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS]))); } static int @@ 
-1947,7 +1979,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct, } if (help) { - if (help->helper == helper) { + if (rcu_access_pointer(help->helper) == helper) { /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); @@ -1966,16 +1998,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct, static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - - if (timeout > INT_MAX) - timeout = INT_MAX; - ct->timeout = nfct_time_stamp + (u32)timeout; - - if (test_bit(IPS_DYING_BIT, &ct->status)) - return -ETIME; - - return 0; + return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ); } #if defined(CONFIG_NF_CONNTRACK_MARK) @@ -1988,15 +2011,14 @@ static void ctnetlink_change_mark(struct nf_conn *ct, mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); - newmark = (ct->mark & mask) ^ mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (READ_ONCE(ct->mark) & mask) ^ mark; + if (newmark != READ_ONCE(ct->mark)) + WRITE_ONCE(ct->mark, newmark); } #endif static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, - [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; @@ -2234,11 +2256,6 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - if (timeout > INT_MAX) - timeout = INT_MAX; - ct->timeout = (u32)timeout + nfct_time_stamp; - rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; @@ -2282,14 +2299,10 @@ ctnetlink_create_conntrack(struct net *net, if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - /* not in hash table yet so not strictly necessary */ + /* disable helper auto-assignment for this entry */ + ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } - } else { - /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - if (err < 0) - goto err2; } err = ctnetlink_setup_nat(ct, cda); @@ -2306,6 +2319,9 @@ ctnetlink_create_conntrack(struct net *net, /* we must add conntrack extensions before confirmation. 
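ctnetlink_change_mark() above now reads and writes ct->mark with READ_ONCE()/WRITE_ONCE(), since other code may access the mark without holding the conntrack lock; the arithmetic itself is unchanged: the bits named by CTA_MARK_MASK are cleared from the current mark and CTA_MARK is xor-merged in. A small sketch with a worked example; the function name is invented.

/*
 * Mark update as in ctnetlink_change_mark() above.
 *
 * Example: old mark 0x00ff00aa, CTA_MARK_MASK 0x000000ff, CTA_MARK 0x00000011
 *     keep = 0x00ff00aa & ~0x000000ff = 0x00ff0000
 *     new  = 0x00ff0000 ^ 0x00000011  = 0x00ff0011
 */
static void change_mark_sketch(u32 *ct_mark, u32 attr_mark, u32 attr_mask)
{
	u32 mask = ~attr_mask;		/* bits to keep from the old mark */
	u32 newmark = (READ_ONCE(*ct_mark) & mask) ^ attr_mark;

	if (newmark != READ_ONCE(*ct_mark))
		WRITE_ONCE(*ct_mark, newmark);
}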
*/ ct->status |= IPS_CONFIRMED; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + __nf_ct_set_timeout(ct, timeout); + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) @@ -2362,12 +2378,15 @@ ctnetlink_create_conntrack(struct net *net, err = nf_conntrack_hash_check_insert(ct); if (err < 0) - goto err2; + goto err3; rcu_read_unlock(); return ct; +err3: + if (ct->master) + nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: @@ -2375,18 +2394,15 @@ err1: return ERR_PTR(err); } -static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_conn *ct; - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -2408,13 +2424,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, &zone, &otuple); + h = nf_conntrack_find_get(info->net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, &zone, &rtuple); + h = nf_conntrack_find_get(info->net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) @@ -2422,8 +2438,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, if (otuple.dst.protonum != rtuple.dst.protonum) return -EINVAL; - ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, - &rtuple, u3); + ct = ctnetlink_create_conntrack(info->net, &zone, cda, + &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2446,7 +2462,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_SYNPROXY) | events, ct, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_put(ct); } @@ -2456,7 +2472,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) { err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | @@ -2468,7 +2484,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } } @@ -2481,23 +2497,17 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || - nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || @@ -2505,7 +2515,11 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, - htonl(st->search_restart))) + htonl(st->search_restart)) || + nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -2544,17 +2558,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_ct_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; @@ -2564,21 +2576,17 @@ static int ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
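/*
 * A minimal sketch (not part of the patch) of what the nfnl_msg_put() calls
 * above consolidate: the removed lines open-coded the same nlmsghdr plus
 * nfgenmsg header setup in every fill function.  Name here is hypothetical.
 */
static inline struct nlmsghdr *
sketch_nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type,
		    int flags, u8 family, u8 version, __be16 res_id)
{
	struct nfgenmsg *nfmsg;
	struct nlmsghdr *nlh;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*nfmsg), flags);
	if (!nlh)
		return NULL;

	nfmsg = nlmsg_data(nlh);
	nfmsg->nfgen_family = family;
	nfmsg->version = version;
	nfmsg->res_id = res_id;

	return nlh;
}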
NLM_F_MULTI : 0, event; - unsigned int nr_conntracks = atomic_read(&net->ct.count); + unsigned int nr_conntracks; + struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - + nr_conntracks = nf_conntrack_count(net); if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; @@ -2594,10 +2602,8 @@ nlmsg_failure: return -1; } -static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; @@ -2607,23 +2613,15 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, return -ENOMEM; err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -2662,6 +2660,8 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) + + ctnetlink_acct_size(ct) + + ctnetlink_timestamp_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ @@ -2676,12 +2676,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) ; } -static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, - enum ip_conntrack_info *ctinfo) -{ - return nf_ct_get(skb, ctinfo); -} - static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; @@ -2719,10 +2713,14 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; - if (ctnetlink_dump_protoinfo(skb, ct) < 0) + if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_helpinfo(skb, ct) < 0) @@ -2743,7 +2741,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + if (ctnetlink_dump_mark(skb, ct, true) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) @@ -2799,7 +2797,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) 
* unchangeable bits but do not error out. Also user programs * are allowed to clear the bits that they are allowed to change. */ - __ctnetlink_change_status(ct, status, ~status); + __nf_ct_change_status(ct, status, ~status); return 0; } @@ -2914,8 +2912,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); } -static struct nfnl_ct_hook ctnetlink_glue_hook = { - .get_ct = ctnetlink_glue_get_ct, +static const struct nfnl_ct_hook ctnetlink_glue_hook = { .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, @@ -2959,6 +2956,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); m.src.u.all = mask->src.u.all; + m.src.l3num = tuple->src.l3num; m.dst.protonum = tuple->dst.protonum; nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK); @@ -2984,11 +2982,13 @@ nla_put_failure: return -1; } +#if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; +#endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { - static __read_mostly siphash_key_t exp_id_seed; + static siphash_aligned_key_t exp_id_seed; unsigned long a, b, c, d; net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); @@ -3081,19 +3081,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3108,12 +3103,11 @@ nla_put_failure: #ifdef CONFIG_NF_CONNTRACK_EVENTS static int -ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) +ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) { struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *skb; unsigned int type, group; int flags = 0; @@ -3136,15 +3130,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3161,23 +3151,27 @@ errout: return 0; } #endif -static int ctnetlink_exp_done(struct netlink_callback *cb) + +static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) { - if (cb->args[1]) - nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); - return 0; + unsigned long id = (unsigned long)exp; + + id += nf_ct_get_id(exp->master); + id += exp->class; + + return id ? 
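/*
 * For reference, a sketch (not part of the patch) of the semantics behind the
 * __nf_ct_change_status(ct, status, ~status) call above, reconstructed from
 * the removed __ctnetlink_change_status(): requested bits are set or cleared
 * while the unchangeable ones are masked out.  Assumes the new core helper
 * keeps these semantics; the name below is hypothetical.
 */
static void sketch_change_status(struct nf_conn *ct, unsigned long on,
				 unsigned long off)
{
	unsigned int bit;

	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}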
id : 1; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], @@ -3189,7 +3183,7 @@ restart: continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3198,9 +3192,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3211,32 +3203,30 @@ restart: } out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; if (cb->args[0]) return 0; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; + restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3244,9 +3234,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3257,9 +3245,6 @@ restart: cb->args[0] = 1; out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } @@ -3278,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, - .done = ctnetlink_exp_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, @@ -3309,29 +3293,27 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, return err; } -static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct sk_buff *skb2; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) - return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda, - extack); + return ctnetlink_dump_exp_ct(info->net, info->sk, skb, + info->nlh, cda, + info->extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, 
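/*
 * Illustrative sketch (not part of the patch) of the dump-resume pattern used
 * in the expectation dumpers above: instead of pinning the last object with a
 * reference, a stable cursor id is stored in cb->args[] and the walk skips
 * forward until that id is seen again.  Types and names are hypothetical.
 */
struct sketch_obj {
	unsigned long id;
	struct sketch_obj *next;
};

static int sketch_dump(struct sketch_obj *head, unsigned long *cursor,
		       int (*fill)(struct sketch_obj *obj))
{
	struct sketch_obj *obj;

	for (obj = head; obj; obj = obj->next) {
		if (*cursor) {
			if (obj->id != *cursor)
				continue;	/* not at the resume point yet */
			*cursor = 0;		/* resume from this entry */
		}
		if (fill(obj) < 0) {		/* skb full, stop here */
			*cursor = obj->id;	/* remember where to resume */
			return -1;
		}
	}
	return 0;
}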
&c); } } @@ -3351,7 +3333,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (err < 0) return err; - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; @@ -3364,42 +3346,39 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, } } - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) { + struct nf_conntrack_helper *helper; const struct nf_conn_help *m_help; const char *name = data; m_help = nfct_help(exp->master); - return strcmp(m_help->helper->name, name) == 0; + helper = rcu_dereference(m_help->helper); + if (!helper) + return false; + + return strcmp(helper->name, name) == 0; } static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) @@ -3407,15 +3386,13 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) return true; } -static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3431,13 +3408,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } @@ -3445,9 +3423,9 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3457,14 +3435,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); - nf_ct_expect_iterate_net(net, expect_iter_name, name, + nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } else { /* This basically means we have to flush everything*/ - 
nf_ct_expect_iterate_net(net, expect_iter_all, NULL, + nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return 0; @@ -3474,7 +3452,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { - if (!del_timer(&x->timeout)) + if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + @@ -3484,10 +3462,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, return 0; } +#if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; +#endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, @@ -3660,15 +3640,13 @@ err_ct: return err; } -static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3687,20 +3665,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, &zone, &tuple); + exp = __nf_ct_expect_find(info->net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, &zone, cda, u3, + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(info->net, &zone, cda, u3, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return err; } err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3712,20 +3690,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) @@ -3766,17 +3739,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_exp_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; @@ -3784,44 +3755,77 @@ static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { - .fcn = ctnetlink_conntrack_event, -}; - -static struct nf_exp_event_notifier ctnl_notifier_exp = { - .fcn = ctnetlink_expect_event, + .ct_event = ctnetlink_conntrack_event, + .exp_event = ctnetlink_expect_event, }; #endif static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { - [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, - [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, - [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, - [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, + [IPCTNL_MSG_CT_NEW] = { + .call = ctnetlink_new_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_DELETE] = { + .call = ctnetlink_del_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_STATS_CPU] = { + .call = ctnetlink_stat_ct_cpu, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_STATS] = { + .call = ctnetlink_stat_ct, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_DYING] = { + .call = ctnetlink_get_ct_dying, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { + .call = ctnetlink_get_ct_unconfirmed, + .type = NFNL_CB_MUTEX, + }, }; static const 
struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { - [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu }, + [IPCTNL_MSG_EXP_GET] = { + .call = ctnetlink_get_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_NEW] = { + .call = ctnetlink_new_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_DELETE] = { + .call = ctnetlink_del_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_GET_STATS_CPU] = { + .call = ctnetlink_stat_exp_cpu, + .type = NFNL_CB_MUTEX, + }, }; static const struct nfnetlink_subsystem ctnl_subsys = { @@ -3845,58 +3849,29 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); static int __net_init ctnetlink_net_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - int ret; - - ret = nf_conntrack_register_notifier(net, &ctnl_notifier); - if (ret < 0) { - pr_err("ctnetlink_init: cannot register notifier.\n"); - goto err_out; - } - - ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); - if (ret < 0) { - pr_err("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } + nf_conntrack_register_notifier(net, &ctnl_notifier); #endif return 0; - -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: - nf_conntrack_unregister_notifier(net, &ctnl_notifier); -err_out: - return ret; -#endif } -static void ctnetlink_net_exit(struct net *net) +static void ctnetlink_net_pre_exit(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); - nf_conntrack_unregister_notifier(net, &ctnl_notifier); + nf_conntrack_unregister_notifier(net); #endif } -static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) -{ - struct net *net; - - list_for_each_entry(net, net_exit_list, exit_list) - ctnetlink_net_exit(net); - - /* wait for other cpus until they are done with ctnl_notifiers */ - synchronize_rcu(); -} - static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, - .exit_batch = ctnetlink_net_exit_batch, + .pre_exit = ctnetlink_net_pre_exit, }; static int __init ctnetlink_init(void) { int ret; + NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); + ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c new file mode 100644 index 000000000000..068e9489e1c2 --- /dev/null +++ b/net/netfilter/nf_conntrack_ovs.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support ct functions for openvswitch and used by OVS and TC conntrack. */ + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/ipv6_frag.h> +#include <net/ip.h> +#include <linux/netfilter_ipv6.h> + +/* 'skb' should already be pulled to nh_ofs. 
*/ +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + int err; + + if (ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + if (helper->tuple.src.l3num != NFPROTO_UNSPEC && + helper->tuple.src.l3num != proto) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + proto = ip_hdr(skb)->protocol; + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + proto = nexthdr; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + if (helper->tuple.dst.protonum != proto) + return NF_ACCEPT; + + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_ct_helper); + +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + int ret = 0; + + helper = nf_conntrack_helper_try_module_get(name, family, proto); + if (!helper) + return -EINVAL; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (!help) { + nf_conntrack_helper_put(helper); + return -ENOMEM; + } +#if IS_ENABLED(CONFIG_NF_NAT) + if (nat) { + ret = nf_nat_helper_try_module_get(name, family, proto); + if (ret) { + nf_conntrack_helper_put(helper); + return ret; + } + } +#endif + rcu_assign_pointer(help->helper, helper); + *hp = helper; + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_add_helper); + +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. + */ +int nf_ct_skb_network_trim(struct sk_buff *skb, int family) +{ + unsigned int len; + + switch (family) { + case NFPROTO_IPV4: + len = skb_ip_totlen(skb); + break; + case NFPROTO_IPV6: + len = ntohs(ipv6_hdr(skb)->payload_len); + if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) { + int err = nf_ip6_check_hbh_len(skb, &len); + + if (err) + return err; + } + len += sizeof(struct ipv6hdr); + break; + default: + len = skb->len; + } + + return pskb_trim_rcsum(skb, len); +} +EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim); + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. 
+ */ +int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u16 zone, u8 family, u8 *proto, u16 *mru) +{ + int err; + + if (family == NFPROTO_IPV4) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(net, skb, user); + local_bh_enable(); + if (err) + return err; + + *mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (family == NFPROTO_IPV6) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err) { + if (err != -EINPROGRESS) + kfree_skb(skb); + return err; + } + + *proto = ipv6_hdr(skb)->nexthdr; + *mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + skb_clear_hash(skb); + skb->ignore_df = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_handle_fragments); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 1f44d523b512..4c679638df06 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. + * PPTP is a protocol for creating virtual private networks. * It is a specification defined by Microsoft and some vendors * working with Microsoft. PPTP is built on top of a modified * version of the Internet Generic Routing Encapsulation Protocol. @@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp"); static DEFINE_SPINLOCK(nf_pptp_lock); -int -(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); - -int -(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); - -void -(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig, - struct nf_conntrack_expect *expect_reply) - __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre); - -void -(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, - struct nf_conntrack_expect *exp) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); +const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook); #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ @@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name); static void pptp_expectfn(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + const struct nf_nat_pptp_hook *hook; struct net *net = nf_ct_net(ct); - typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; pr_debug("increasing timeouts\n"); /* increase timeout of GRE data channel conntrack entry */ @@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct, /* Can you see how rusty this code is, compared with the pre-2.6.11 * one? 
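/*
 * Hypothetical caller sketch (not part of the patch) for the defrag helper
 * added above.  Per its contract, -EINPROGRESS means the skb was queued by
 * the defrag engine and any other error means it has already been freed, so
 * the caller must not touch the skb again in either case.
 */
static int sketch_defrag_ipv4(struct net *net, struct sk_buff *skb, u16 zone)
{
	u8 proto = ip_hdr(skb)->protocol;
	u16 mru = 0;
	int err;

	err = nf_ct_handle_fragments(net, skb, zone, NFPROTO_IPV4, &proto, &mru);
	if (err)
		return err;	/* queued (-EINPROGRESS) or already freed */

	/* skb now holds the reassembled packet; continue with conntrack */
	return 0;
}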
That's what happened to my shiny newnat of 2002 ;( -HW */ - nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn); - if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK) - nf_nat_pptp_expectfn(ct, exp); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->master->status & IPS_NAT_MASK) + hook->expectfn(ct, exp); else { struct nf_conntrack_tuple inv_t; struct nf_conntrack_expect *exp_other; @@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct) static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) { struct nf_conntrack_expect *exp_orig, *exp_reply; + const struct nf_nat_pptp_hook *hook; enum ip_conntrack_dir dir; int ret = 1; - typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre; exp_orig = nf_ct_expect_alloc(ct); if (exp_orig == NULL) @@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) IPPROTO_GRE, &callid, &peer_callid); exp_reply->expectfn = pptp_expectfn; - nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); - if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) - nf_nat_pptp_exp_gre(exp_orig, exp_reply); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + hook->exp_gre(exp_orig, exp_reply); if (nf_ct_expect_related(exp_orig, 0) != 0) goto out_put_both; if (nf_ct_expect_related(exp_reply, 0) != 0) @@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo) { struct nf_ct_pptp_master *info = nfct_help_data(ct); + const struct nf_nat_pptp_hook *hook; u_int16_t msg; __be16 cid = 0, pcid = 0; - typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); pr_debug("inbound control message %s\n", pptp_msg_name(msg)); @@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; } - nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); - if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_inbound(skb, ct, ctinfo, - protoff, ctlh, pptpReq); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: @@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo) { struct nf_ct_pptp_master *info = nfct_help_data(ct); + const struct nf_nat_pptp_hook *hook; u_int16_t msg; __be16 cid = 0, pcid = 0; - typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); pr_debug("outbound control message %s\n", pptp_msg_name(msg)); @@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; } - nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); - if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_outbound(skb, ct, ctinfo, - protoff, ctlh, pptpReq); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: @@ -544,7 +520,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, nexthdr_off = protoff; tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); - BUG_ON(!tcph); + if (!tcph) + return NF_ACCEPT; + nexthdr_off += tcph->doff * 4; datalen = tcplen - tcph->doff * 4; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a0560d175a7f..bc1d96686b9c 100644 --- 
a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -42,17 +42,16 @@ #include <net/ipv6.h> #include <net/inet_frag.h> -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL -__printf(5, 6) +__printf(4, 5) void nf_l4proto_log_invalid(const struct sk_buff *skb, - struct net *net, - u16 pf, u8 protonum, + const struct nf_hook_state *state, + u8 protonum, const char *fmt, ...) { + struct net *net = state->net; struct va_format vaf; va_list args; @@ -64,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_proto_%d: %pV ", protonum, &vaf); + nf_log_packet(net, state->pf, 0, skb, state->in, state->out, + NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); -__printf(3, 4) +__printf(4, 5) void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...) { struct va_format vaf; @@ -87,7 +87,7 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_l4proto_log_invalid(skb, state, nf_ct_protonum(ct), "%pV", &vaf); va_end(args); } @@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif @@ -121,17 +118,64 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + +unsigned int nf_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + bool seqadj_needed; + __be16 frag_off; + int start; + u8 pnum; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || in_vrf_postrouting(state)) + return NF_ACCEPT; help = nfct_help(ct); + + seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb); + if (!help && !seqadj_needed) + return nf_conntrack_confirm(skb); + + /* helper->help() do not expect ICMP packets */ + if (ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case NFPROTO_IPV6: + pnum = ipv6_hdr(skb)->nexthdr; + start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (start < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + + protoff = start; + break; + default: + return nf_conntrack_confirm(skb); + } + if (help) { const struct nf_conntrack_helper *helper; int ret; - /* rcu_read_lock()ed by nf_hook_thresh */ + /* rcu_read_lock()ed by nf_hook */ helper = rcu_dereference(help->helper); if (helper) { ret = 
helper->help(skb, @@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } + if (seqadj_needed && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } /* We've seen it coming out the other side: confirm it */ @@ -155,22 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - return nf_confirm(skb, - skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -217,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -255,16 +281,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) /* We only do TCP and SCTP at the moment: is there a better way? */ if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; - } - if ((unsigned int)*len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); + if ((unsigned int)*len < sizeof(struct sockaddr_in)) return -EINVAL; - } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { @@ -278,17 +299,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } @@ -331,12 +347,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + if (!h) return -ENOENT; - } ct = nf_ct_tuplehash_to_ctrack(h); @@ -360,30 +372,6 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - 
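/*
 * Sketch (not part of the patch) of the IPv6 transport-offset lookup that
 * nf_confirm() and nf_ct_helper() above both perform before calling into a
 * helper: walk the extension headers and bail out on non-first fragments.
 * The function name is hypothetical.
 */
static int sketch_ipv6_transport_offset(const struct sk_buff *skb, u8 *proto)
{
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	__be16 frag_off;
	int ofs;

	ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
	if (ofs < 0 || (frag_off & htons(~0x7)) != 0)
		return -1;	/* no transport header reachable */

	*proto = nexthdr;
	return ofs;
}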
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return nf_conntrack_confirm(skb); - } - - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -412,13 +400,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, @@ -446,7 +434,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info; static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); bool fixup_needed = false, retry = true; int err = 0; retry: @@ -522,29 +510,37 @@ retry: out_unlock: mutex_unlock(&nf_ct_proto_mutex); - if (fixup_needed) - nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup, - (void *)(unsigned long)nfproto, 0, 0); + if (fixup_needed) { + struct nf_ct_iter_data iter_data = { + .net = net, + .data = (void *)(unsigned long)nfproto, + }; + nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data); + } return err; } static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { case NFPROTO_IPV4: - if (cnet->users4 && (--cnet->users4 == 0)) + if (cnet->users4 && (--cnet->users4 == 0)) { nf_unregister_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_defrag_ipv4_disable(net); + } break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: - if (cnet->users6 && (--cnet->users6 == 0)) + if (cnet->users6 && (--cnet->users6 == 0)) { nf_unregister_net_hooks(net, ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + nf_defrag_ipv6_disable(net); + } break; #endif case NFPROTO_BRIDGE: @@ -565,6 +561,7 @@ static int nf_ct_netns_inet_get(struct net *net) int err; err = nf_ct_netns_do_get(net, NFPROTO_IPV4); +#if IS_ENABLED(CONFIG_IPV6) if (err < 0) goto err1; err = nf_ct_netns_do_get(net, NFPROTO_IPV6); @@ -575,6 +572,7 @@ static int nf_ct_netns_inet_get(struct net *net) err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: +#endif return err; } @@ -610,7 +608,7 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) switch (nfproto) { case NFPROTO_BRIDGE: nf_ct_netns_do_put(net, NFPROTO_BRIDGE); - /* fall through */ + fallthrough; case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); @@ -658,7 +656,7 @@ int nf_conntrack_proto_init(void) #if IS_ENABLED(CONFIG_IPV6) cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst6); + nf_unregister_sockopt(&so_getorigdst); #endif return ret; } @@ -680,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #if IS_ENABLED(CONFIG_IPV6) nf_conntrack_icmpv6_init_net(net); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - nf_conntrack_dccp_init_net(net); -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP nf_conntrack_sctp_init_net(net); #endif @@ -691,13 +686,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #endif } -void nf_conntrack_proto_pernet_fini(struct net *net) -{ -#ifdef CONFIG_NF_CT_PROTO_GRE - 
nf_ct_gre_keymap_flush(net); -#endif -} - module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); @@ -705,3 +693,4 @@ MODULE_ALIAS("ip_conntrack"); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking"); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c deleted file mode 100644 index b3f4a334f9d7..000000000000 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ /dev/null @@ -1,770 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DCCP connection tracking protocol helper - * - * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sysctl.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/dccp.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/netns/generic.h> - -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_conntrack_timeout.h> -#include <net/netfilter/nf_log.h> - -/* Timeouts are based on values from RFC4340: - * - * - REQUEST: - * - * 8.1.2. Client Request - * - * A client MAY give up on its DCCP-Requests after some time - * (3 minutes, for example). - * - * - RESPOND: - * - * 8.1.3. Server Response - * - * It MAY also leave the RESPOND state for CLOSED after a timeout of - * not less than 4MSL (8 minutes); - * - * - PARTOPEN: - * - * 8.1.5. Handshake Completion - * - * If the client remains in PARTOPEN for more than 4MSL (8 minutes), - * it SHOULD reset the connection with Reset Code 2, "Aborted". - * - * - OPEN: - * - * The DCCP timestamp overflows after 11.9 hours. If the connection - * stays idle this long the sequence number won't be recognized - * as valid anymore. - * - * - CLOSEREQ/CLOSING: - * - * 8.3. Termination - * - * The retransmission timer should initially be set to go off in two - * round-trip times and should back off to not less than once every - * 64 seconds ... - * - * - TIMEWAIT: - * - * 4.3. States - * - * A server or client socket remains in this state for 2MSL (4 minutes) - * after the connection has been town down, ... - */ - -#define DCCP_MSL (2 * 60 * HZ) - -static const char * const dccp_state_names[] = { - [CT_DCCP_NONE] = "NONE", - [CT_DCCP_REQUEST] = "REQUEST", - [CT_DCCP_RESPOND] = "RESPOND", - [CT_DCCP_PARTOPEN] = "PARTOPEN", - [CT_DCCP_OPEN] = "OPEN", - [CT_DCCP_CLOSEREQ] = "CLOSEREQ", - [CT_DCCP_CLOSING] = "CLOSING", - [CT_DCCP_TIMEWAIT] = "TIMEWAIT", - [CT_DCCP_IGNORE] = "IGNORE", - [CT_DCCP_INVALID] = "INVALID", -}; - -#define sNO CT_DCCP_NONE -#define sRQ CT_DCCP_REQUEST -#define sRS CT_DCCP_RESPOND -#define sPO CT_DCCP_PARTOPEN -#define sOP CT_DCCP_OPEN -#define sCR CT_DCCP_CLOSEREQ -#define sCG CT_DCCP_CLOSING -#define sTW CT_DCCP_TIMEWAIT -#define sIG CT_DCCP_IGNORE -#define sIV CT_DCCP_INVALID - -/* - * DCCP state transition table - * - * The assumption is the same as for TCP tracking: - * - * We are the man in the middle. All the packets go through us but might - * get lost in transit to the destination. It is assumed that the destination - * can't receive segments we haven't seen. 
- * - * The following states exist: - * - * NONE: Initial state, expecting Request - * REQUEST: Request seen, waiting for Response from server - * RESPOND: Response from server seen, waiting for Ack from client - * PARTOPEN: Ack after Response seen, waiting for packet other than Response, - * Reset or Sync from server - * OPEN: Packet other than Response, Reset or Sync seen - * CLOSEREQ: CloseReq from server seen, expecting Close from client - * CLOSING: Close seen, expecting Reset - * TIMEWAIT: Reset seen - * IGNORE: Not determinable whether packet is valid - * - * Some states exist only on one side of the connection: REQUEST, RESPOND, - * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to - * the one it was in before. - * - * Packets are marked as ignored (sIG) if we don't know if they're valid - * (for example a reincarnation of a connection we didn't notice is dead - * already) and the server may send back a connection closing Reset or a - * Response. They're also used for Sync/SyncAck packets, which we don't - * care about. - */ -static const u_int8_t -dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { - [CT_DCCP_ROLE_CLIENT] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sRQ Regular Request - * sRQ -> sRQ Retransmitted Request or reincarnation - * sRS -> sRS Retransmitted Request (apparently Response - * got lost after we saw it) or reincarnation - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation - * - * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */ - sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, might be response to ignored Request - * sRS -> sIG Ignore, might be response to ignored Request - * sPO -> sIG Ignore, might be response to ignored Request - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, reincarnation in reverse direction - * goes through sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN - * sOP -> sOP Regular ACK, remain in OPEN - * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) - * sOP -> sOP Regular Data packet - * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) 
- * sPO -> sPO Remain in PARTOPEN state - * sOP -> sOP Regular DataAck packet in OPEN state - * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * CLOSEREQ may only be sent by the server. - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sCG Client-initiated close - * sOP -> sCG Client-initiated close - * sCR -> sCG Close in response to CloseReq (8.3.) - * sCG -> sCG Retransmit - * sTW -> sIV Late retransmit, already in TIME_WAIT - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) - * sRS -> sTW Response received without Request - * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) - * sOP -> sTW Connection reset - * sCR -> sTW Connection reset - * sCG -> sTW Connection reset - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, - [CT_DCCP_ROLE_SERVER] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, conntrack might be out of sync - * sRS -> sIG Ignore, conntrack might be out of sync - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation, must reverse roles - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Response without Request - * sRQ -> sRS Response to clients Request - * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) - * sPO -> sIG Response to an ignored Request or late retransmit - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, Request from client in sTW moves to sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Ack in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Data packet in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) 
- * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular DataAck in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) - * sOP -> sCR CloseReq in OPEN state - * sCR -> sCR Retransmit - * sCG -> sCR Simultaneous close, client sends another Close - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCG Move direcly to CLOSING - * sOP -> sCG Move to CLOSING - * sCR -> sIV Close after CloseReq is invalid - * sCG -> sCG Retransmit - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Reset in response to Request - * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sOP -> sTW - * sCR -> sTW - * sCG -> sTW - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, -}; - -static noinline bool -dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh) -{ - struct net *net = nf_ct_net(ct); - struct nf_dccp_net *dn; - const char *msg; - u_int8_t state; - - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; - switch (state) { - default: - dn = nf_dccp_pernet(net); - if (dn->dccp_loose == 0) { - msg = "not picking up existing connection "; - goto out_invalid; - } - case CT_DCCP_REQUEST: - break; - case CT_DCCP_INVALID: - msg = "invalid state transition "; - goto out_invalid; - } - - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.state = CT_DCCP_NONE; - ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; - ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; - ct->proto.dccp.handshake_seq = 0; - return true; - -out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); - return false; -} - -static u64 dccp_ack_seq(const struct dccp_hdr *dh) -{ - const struct dccp_hdr_ack_bits *dhack; - - dhack = (void *)dh + __dccp_basic_hdr_len(dh); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + - ntohl(dhack->dccph_ack_nr_low); -} - -static bool dccp_error(const struct dccp_hdr *dh, - struct sk_buff *skb, unsigned int dataoff, - const struct nf_hook_state *state) -{ - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - - if (dh->dccph_doff * 4 < sizeof(struct 
dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (state->hook == NF_INET_PRE_ROUTING && - state->net->ct.sysctl_checksum && - nf_checksum_partial(skb, state->hook, dataoff, cscov, - IPPROTO_DCCP, state->pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - if (dh->dccph_type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto out_invalid; - } - return false; -out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_DCCP, "%s", msg); - return true; -} - -int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct dccp_hdr _dh, *dh; - u_int8_t type, old_state, new_state; - enum ct_dccp_roles role; - unsigned int *timeouts; - - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - if (!dh) - return NF_DROP; - - if (dccp_error(dh, skb, dataoff, state)) - return -NF_ACCEPT; - - type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) - return -NF_ACCEPT; - - if (type == DCCP_PKT_RESET && - !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - /* Tear down connection immediately if only reply is a RESET */ - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_ACCEPT; - } - - spin_lock_bh(&ct->lock); - - role = ct->proto.dccp.role[dir]; - old_state = ct->proto.dccp.state; - new_state = dccp_state_table[role][type][old_state]; - - switch (new_state) { - case CT_DCCP_REQUEST: - if (old_state == CT_DCCP_TIMEWAIT && - role == CT_DCCP_ROLE_SERVER) { - /* Reincarnation in the reverse direction: reopen and - * reverse client/server roles. */ - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; - } - break; - case CT_DCCP_RESPOND: - if (old_state == CT_DCCP_REQUEST) - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - break; - case CT_DCCP_PARTOPEN: - if (old_state == CT_DCCP_RESPOND && - type == DCCP_PKT_ACK && - dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) - set_bit(IPS_ASSURED_BIT, &ct->status); - break; - case CT_DCCP_IGNORE: - /* - * Connection tracking might be out of sync, so we ignore - * packets that might establish a new connection and resync - * if the server responds with a valid Response. 
- */ - if (ct->proto.dccp.last_dir == !dir && - ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && - type == DCCP_PKT_RESPONSE) { - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - new_state = CT_DCCP_RESPOND; - break; - } - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); - return NF_ACCEPT; - case CT_DCCP_INVALID: - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); - return -NF_ACCEPT; - } - - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - ct->proto.dccp.state = new_state; - spin_unlock_bh(&ct->lock); - - if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - - timeouts = nf_ct_timeout_lookup(ct); - if (!timeouts) - timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; - nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - - return NF_ACCEPT; -} - -static bool dccp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.dccp.state) { - case CT_DCCP_CLOSEREQ: - case CT_DCCP_CLOSING: - case CT_DCCP_TIMEWAIT: - return true; - default: - break; - } - - return false; -} - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) -{ - seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) -{ - struct nlattr *nest_parms; - - spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); - if (!nest_parms) - goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || - nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || - nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq), - CTA_PROTOINFO_DCCP_PAD)) - goto nla_put_failure; - nla_nest_end(skb, nest_parms); - spin_unlock_bh(&ct->lock); - return 0; - -nla_put_failure: - spin_unlock_bh(&ct->lock); - return -1; -} - -static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { - [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, - [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, -}; - -#define DCCP_NLATTR_SIZE ( \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ - NLA_ALIGN(NLA_HDRLEN + 0)) - -static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) -{ - struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; - struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; - int err; - - if (!attr) - return 0; - - err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy, NULL); - if (err < 0) - return err; - - if (!tb[CTA_PROTOINFO_DCCP_STATE] || - !tb[CTA_PROTOINFO_DCCP_ROLE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { - return -EINVAL; - } - - spin_lock_bh(&ct->lock); - ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); - if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - 
} else { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; - } - if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { - ct->proto.dccp.handshake_seq = - be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); - } - spin_unlock_bh(&ct->lock); - return 0; -} -#endif - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - unsigned int *timeouts = data; - int i; - - if (!timeouts) - timeouts = dn->dccp_timeout; - - /* set default DCCP timeouts. */ - for (i=0; i<CT_DCCP_MAX; i++) - timeouts[i] = dn->dccp_timeout[i]; - - /* there's a 1:1 mapping between attributes and protocol states. */ - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (tb[i]) { - timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; - } - } - - timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; - return 0; -} - -static int -dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeouts = data; - int i; - - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) - goto nla_put_failure; - } - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { - [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - -void nf_conntrack_dccp_init_net(struct net *net) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - - /* default values */ - dn->dccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. 
- */ - dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { - .l4proto = IPPROTO_DCCP, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -}; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 5b05487a60d2..af369e686fc5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net) return &net->ct.nf_ct_proto.gre; } -void nf_ct_gre_keymap_flush(struct net *net) -{ - struct nf_gre_net *net_gre = gre_pernet(net); - struct nf_ct_gre_keymap *km, *tmp; - - spin_lock_bh(&keymap_lock); - list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { - list_del_rcu(&km->list); - kfree_rcu(km, rcu); - } - spin_unlock_bh(&keymap_lock); -} - static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) { @@ -218,8 +205,7 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { - if (state->pf != NFPROTO_IPV4) - return -NF_ACCEPT; + unsigned long status; if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); @@ -233,11 +219,17 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; } + status = READ_ONCE(ct->status); /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { + if (status & IPS_SEEN_REPLY) { nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); + + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe. 
*/ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); @@ -312,6 +304,7 @@ void nf_conntrack_gre_init_net(struct net *net) /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 4efd8741c105..b38b7164acd5 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -170,12 +170,12 @@ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { if (state->pf == AF_INET) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI4 != inner %pI4", &outer_daddr->ip, &ct_daddr->ip); } else if (state->pf == AF_INET6) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI6 != inner %pI6", &outer_daddr->ip6, &ct_daddr->ip6); @@ -197,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index facd8c64ec4e..327b8059025d 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = { [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 + [ICMPV6_MLD2_REPORT - 130] = 1, + [ICMPV6_MRDISC_ADV - 130] = 1, + [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, @@ -126,8 +128,57 @@ static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMPV6, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); +} + +static noinline_for_stack int +nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + u8 hl = ipv6_hdr(skb)->hop_limit; + union nf_inet_addr outer_daddr; + union { + struct nd_opt_hdr nd_opt; + struct rd_msg rd_msg; + } tmp; + const struct nd_opt_hdr *nd_opt; + const struct rd_msg *rd_msg; + + rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); + if (!rd_msg) { + icmpv6_error_log(skb, state, "short redirect"); + return -NF_ACCEPT; + } + + if (rd_msg->icmph.icmp6_code != 0) + return NF_ACCEPT; + + if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); + return -NF_ACCEPT; + } + + dataoff += sizeof(*rd_msg); + + /* warning: rd_msg no longer usable after this call */ + nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); + if (!nd_opt || nd_opt->nd_opt_len == 0) { + icmpv6_error_log(skb, state, "redirect without options"); + return -NF_ACCEPT; + } + + /* We could call 
ndisc_parse_options(), but it would need + * skb_linearize() and a bit more work. + */ + if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) + return NF_ACCEPT; + + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += 8; + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, @@ -160,6 +211,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, return NF_ACCEPT; } + if (icmp6h->icmp6_type == NDISC_REDIRECT) + return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 4f897b14b606..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -27,41 +27,31 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. --RR - - And so for me for SCTP :D -Kiran */ - static const char *const sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", - "HEARTBEAT_SENT", - "HEARTBEAT_ACKED", + [SCTP_CONNTRACK_NONE] = "NONE", + [SCTP_CONNTRACK_CLOSED] = "CLOSED", + [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", + [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", + [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", + [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", + [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", + [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, - [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; +#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 + #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT @@ -71,7 +61,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT -#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,15 +77,12 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. 
SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. */ /* TODO @@ -113,33 +99,33 @@ cookie echoed to closed. static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, 
sES, sSS, sSR, sSA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; @@ -151,6 +137,7 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ @@ -161,7 +148,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - unsigned long *map) + unsigned long *map, + const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; @@ -170,8 +158,6 @@ static int do_basic_checks(struct nf_conn *ct, flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) @@ -186,7 +172,9 @@ static int do_basic_checks(struct nf_conn *ct, sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { - pr_debug("Basic checks failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "%s failed. 
chunk num %d, type %d, len %d flag %d\n", + __func__, count, sch->type, sch->length, flag); return 1; } @@ -194,7 +182,6 @@ static int do_basic_checks(struct nf_conn *ct, set_bit(sch->type, map); } - pr_debug("Basic checks passed\n"); return count == 0; } @@ -204,64 +191,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir, { int i; - pr_debug("Chunk type: %d\n", chunk_type); - switch (chunk_type) { case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: - pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: - pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); + pr_debug("Unknown chunk type %d, Will stay in %s\n", + chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } - pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - return sctp_conntracks[dir][i][cur_state]; } @@ -308,7 +278,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } else { + } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", @@ -349,7 +319,7 @@ static bool sctp_error(struct sk_buff *skb, } return false; out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg); + nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } @@ -369,6 +339,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -377,7 +348,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (sh == NULL) goto out; - if (do_basic_checks(ct, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { @@ -400,7 +371,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "verification tag check failed %x vs %x for dir %d", + sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } @@ -409,33 +382,67 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 
(A) */ + /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir]) + /* (B) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir] && - sch->flags & SCTP_CHUNK_FLAG_T) + /* (C) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ + /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; - } else if (sch->type == SCTP_CID_HEARTBEAT || - sch->type == SCTP_CID_HEARTBEAT_ACK) { + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; + } else if (sch->type == SCTP_CID_HEARTBEAT) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.last_dir = dir; + ignore = true; + continue; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + } + } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out_unlock; + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || + ct->proto.sctp.last_dir == dir) + goto out_unlock; + + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.vtag[dir] = sh->vtag; + ct->proto.sctp.vtag[!dir] = 0; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } @@ -444,46 +451,76 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " - "conntrack=%u\n", - dir, sch->type, old_state); + nf_ct_l4proto_log_invalid(skb, ct, state, + "Invalid, old_state %d, dir %d, type %d", + old_state, dir, sch->type); + goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - 
pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; + + /* don't renew timeout on init retransmit so + * port reuse by client or NAT middlebox cannot + * keep entry alive indefinitely (incl. nat info). + */ + if (new_state == SCTP_CONNTRACK_CLOSED && + old_state == SCTP_CONNTRACK_CLOSED && + nf_ct_is_confirmed(ct)) + ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; - if (old_state != new_state) + if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + if (new_state == SCTP_CONNTRACK_ESTABLISHED && + !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } } spin_unlock_bh(&ct->lock); + /* allow but do not refresh timeout */ + if (ignore) + return NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && - dir == IP_CT_DIR_REPLY && - new_state == SCTP_CONNTRACK_ESTABLISHED) { - pr_debug("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } - return NF_ACCEPT; out_unlock: @@ -512,7 +549,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct) #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; @@ -521,15 +558,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || - nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1926fd56df56..0c1d086e96cb 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -14,7 +14,7 @@ #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> @@ -31,20 +31,6 @@ #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -/* "Be conservative in what you do, - be liberal in what you accept from others." 
- If it's non-zero, we mark only out of window RST segments as INVALID. */ -static int nf_ct_tcp_be_liberal __read_mostly = 0; - -/* If it is set to zero, we disable picking up already established - connections. */ -static int nf_ct_tcp_loose __read_mostly = 1; - -/* Max number of the retransmitted packets without receiving an (acceptable) - ACK from the destination. If this number is reached, a shorter timer - will be started. */ -static int nf_ct_tcp_max_retrans __read_mostly = 3; - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ @@ -61,6 +47,12 @@ static const char *const tcp_conntrack_names[] = { "SYN_SENT2", }; +enum nf_ct_tcp_action { + NFCT_TCP_IGNORE, + NFCT_TCP_INVALID, + NFCT_TCP_ACCEPT, +}; + #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS @@ -352,10 +344,11 @@ static void tcp_options(const struct sk_buff *skb, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; - state->td_scale = - state->flags = 0; + state->td_scale = 0; + state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; @@ -408,7 +401,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED @@ -458,23 +452,73 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static bool tcp_in_window(const struct nf_conn *ct, - struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - const struct tcphdr *tcph) +static void tcp_init_sender(struct ip_ct_tcp_state *sender, + struct ip_ct_tcp_state *receiver, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u32 end, u32 win, + enum ip_conntrack_dir dir) { - struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = nf_tcp_pernet(net); + /* SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (dir == IP_CT_DIR_REPLY && + !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { + sender->td_scale = 0; + receiver->td_scale = 0; + } +} + +__printf(6, 7) +static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const struct nf_hook_state *state, + const struct ip_ct_tcp_state *sender, + enum nf_ct_tcp_action ret, + const char *fmt, ...) 
+{ + const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct)); + struct va_format vaf; + va_list args; + bool be_liberal; + + be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal; + if (be_liberal) + return NFCT_TCP_ACCEPT; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf); + va_end(args); + + return ret; +} + +static enum nf_ct_tcp_action +tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, + unsigned int index, const struct sk_buff *skb, + unsigned int dataoff, const struct tcphdr *tcph, + const struct nf_hook_state *hook_state) +{ + struct ip_ct_tcp *state = &ct->proto.tcp; struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; - const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - u16 win_raw; + bool in_recv_win, seq_ok; s32 receiver_offset; - bool res, in_recv_win; + u16 win_raw; /* * Get the required data from the packet. @@ -493,44 +537,17 @@ static bool tcp_in_window(const struct nf_conn *ct, ack -= receiver_offset; sack -= receiver_offset; - pr_debug("tcp_in_window: START\n"); - pr_debug("tcp_in_window: "); - nf_ct_dump_tuple(tuple); - pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", - seq, ack, receiver_offset, sack, receiver_offset, win, end); - pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - if (sender->td_maxwin == 0) { /* * Initialize sender data. */ if (tcph->syn) { - /* - * SYN-ACK in reply to a SYN - * or SYN from reply direction in simultaneous open. - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, dataoff, tcph, sender); - /* - * RFC 1323: - * Both sides must send the Window Scale option - * to enable window scaling in either direction. - */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE - && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) - sender->td_scale = - receiver->td_scale = 0; + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win, dir); if (!tcph->ack) /* Simultaneous open */ - return true; + return NFCT_TCP_ACCEPT; } else { /* * We are in the middle of a connection, @@ -541,29 +558,39 @@ static bool tcp_in_window(const struct nf_conn *ct, swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; - /* - * We haven't seen traffic in the other direction yet - * but we have to tweak window tracking to pass III - * and IV until that happens. - */ - if (receiver->td_maxwin == 0) + if (receiver->td_maxwin == 0) { + /* We haven't seen traffic in the other + * direction yet but we have to tweak window + * tracking to pass III and IV until that + * happens. + */ receiver->td_end = receiver->td_maxend = sack; + } else if (sack == receiver->td_end + 1) { + /* Likely a reply to a keepalive. + * Needed for III. 
+ */ + receiver->td_end++; + } + } - } else if (((state->state == TCP_CONNTRACK_SYN_SENT - && dir == IP_CT_DIR_ORIGINAL) - || (state->state == TCP_CONNTRACK_SYN_RECV - && dir == IP_CT_DIR_REPLY)) - && after(end, sender->td_end)) { + } else if (tcph->syn && + after(end, sender->td_end) && + (state->state == TCP_CONNTRACK_SYN_SENT || + state->state == TCP_CONNTRACK_SYN_RECV)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." + * + * Re-init state for this direction, just like for the first + * syn(-ack) reply, it might differ in seq, ack or tcp options. */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); + tcp_init_sender(sender, receiver, + skb, dataoff, tcph, + end, win, dir); - tcp_options(skb, dataoff, tcph, sender); + if (dir == IP_CT_DIR_REPLY && !tcph->ack) + return NFCT_TCP_ACCEPT; } if (!(tcph->ack)) { @@ -587,113 +614,166 @@ static bool tcp_in_window(const struct nf_conn *ct, */ seq = end = sender->td_end; - pr_debug("tcp_in_window: "); - nf_ct_dump_tuple(tuple); - pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", - seq, ack, receiver_offset, sack, receiver_offset, win, end); - pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); + seq_ok = before(seq, sender->td_maxend + 1); + if (!seq_ok) { + u32 overshot = end - sender->td_maxend + 1; + bool ack_ok; + + ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); + in_recv_win = receiver->td_maxwin && + after(end, sender->td_end - receiver->td_maxwin - 1); + + if (in_recv_win && + ack_ok && + overshot <= receiver->td_maxwin && + before(sack, receiver->td_end + 1)) { + /* Work around TCPs that send more bytes than allowed by + * the receive window. + * + * If the (marked as invalid) packet is allowed to pass by + * the ruleset and the peer acks this data, then its possible + * all future packets will trigger 'ACK is over upper bound' check. + * + * Thus if only the sequence check fails then do update td_end so + * possible ACK for this data can update internal state. + */ + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "%u bytes more than expected", overshot); + } + + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, + "SEQ is over upper bound %u (over the window of the receiver)", + sender->td_maxend + 1); + } + + if (!before(sack, receiver->td_end + 1)) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, + "ACK is over upper bound %u (ACKed data not seen yet)", + receiver->td_end + 1); /* Is the ending sequence in the receive window (if available)? 
*/ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); + if (!in_recv_win) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "SEQ is under lower bound %u (already ACKed data retransmitted)", + sender->td_end - receiver->td_maxwin - 1); + if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "ignored ACK under lower bound %u (possible overly delayed)", + receiver->td_end - MAXACKWINDOW(sender) - 1); + + /* Take into account window scaling (RFC 1323). */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* Update sender data. */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) { + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) { + sender->td_maxack = ack; + } + } - pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", - before(seq, sender->td_maxend + 1), - (in_recv_win ? 1 : 0), - before(sack, receiver->td_end + 1), - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); + /* Update receiver data. */ + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* Check retransmissions. */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir && + state->last_seq == seq && + state->last_ack == ack && + state->last_end == end && + state->last_win == win_raw) { + state->retrans++; + } else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win_raw; + state->retrans = 0; + } + } - if (before(seq, sender->td_maxend + 1) && - in_recv_win && - before(sack, receiver->td_end + 1) && - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { - /* - * Take into account window scaling (RFC 1323). - */ - if (!tcph->syn) - win <<= sender->td_scale; + return NFCT_TCP_ACCEPT; +} - /* - * Update sender data. - */ - swin = win + (sack - ack); - if (sender->td_maxwin < swin) - sender->td_maxwin = swin; - if (after(end, sender->td_end)) { - sender->td_end = end; - sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; - } - if (tcph->ack) { - if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { - sender->td_maxack = ack; - sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; - } else if (after(ack, sender->td_maxack)) - sender->td_maxack = ack; - } +static void __cold nf_tcp_handle_invalid(struct nf_conn *ct, + enum ip_conntrack_dir dir, + int index, + const struct sk_buff *skb, + const struct nf_hook_state *hook_state) +{ + const unsigned int *timeouts; + const struct nf_tcp_net *tn; + unsigned int timeout; + u32 expires; - /* - * Update receiver data. 
- */ - if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) - receiver->td_maxwin += end - sender->td_maxend; - if (after(sack + win, receiver->td_maxend - 1)) { - receiver->td_maxend = sack + win; - if (win == 0) - receiver->td_maxend++; - } - if (ack == receiver->td_end) - receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + if (!test_bit(IPS_ASSURED_BIT, &ct->status) || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + return; - /* - * Check retransmissions. - */ - if (index == TCP_ACK_SET) { - if (state->last_dir == dir - && state->last_seq == seq - && state->last_ack == ack - && state->last_end == end - && state->last_win == win_raw) - state->retrans++; - else { - state->last_dir = dir; - state->last_seq = seq; - state->last_ack = ack; - state->last_end = end; - state->last_win = win_raw; - state->retrans = 0; - } - } - res = true; - } else { - res = false; - if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - tn->tcp_be_liberal) - res = true; - if (!res) { - nf_ct_l4proto_log_invalid(skb, ct, - "%s", - before(seq, sender->td_maxend + 1) ? - in_recv_win ? - before(sack, receiver->td_end + 1) ? - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" - : "ACK is under the lower bound (possible overly delayed ACK)" - : "ACK is over the upper bound (ACKed data not seen yet)" - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); - } + /* We don't want to have connections hanging around in ESTABLISHED + * state for long time 'just because' conntrack deemed a FIN/RST + * out-of-window. + * + * Shrink the timeout just like when there is unacked data. + * This speeds up eviction of 'dead' connections where the + * connection and conntracks internal state are out of sync. + */ + switch (index) { + case TCP_RST_SET: + case TCP_FIN_SET: + break; + default: + return; } - pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " - "receiver end=%u maxend=%u maxwin=%u\n", - res, sender->td_end, sender->td_maxend, sender->td_maxwin, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + if (ct->proto.tcp.last_dir != dir && + (ct->proto.tcp.last_index == TCP_FIN_SET || + ct->proto.tcp.last_index == TCP_RST_SET)) { + expires = nf_ct_expires(ct); + if (expires < 120 * HZ) + return; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = tn->timeouts; + + timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]); + if (expires > timeout) { + nf_ct_l4proto_log_invalid(skb, ct, hook_state, + "packet (index %d, dir %d) response for index %d lower timeout to %u", + index, dir, ct->proto.tcp.last_index, timeout); - return res; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + } + } else { + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + } } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ @@ -715,7 +795,7 @@ static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. 
*/ @@ -757,20 +837,19 @@ static bool tcp_error(const struct tcphdr *th, static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *th) + const struct tcphdr *th, + const struct nf_hook_state *state) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); const struct nf_tcp_net *tn = nf_tcp_pernet(net); - const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; - const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { - pr_debug("nf_ct_tcp: invalid new deleting.\n"); + tcp_error_log(skb, state, "invalid new"); return false; } @@ -816,21 +895,68 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, /* tcp_packet will set them */ ct->proto.tcp.last_index = TCP_NONE_SET; - - pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - __func__, - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); return true; } -static bool nf_conntrack_tcp_established(const struct nf_conn *ct) +static bool tcp_can_early_drop(const struct nf_conn *ct) { - return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && - test_bit(IPS_ASSURED_BIT, &ct->status); + switch (ct->proto.tcp.state) { + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + case TCP_CONNTRACK_CLOSE: + case TCP_CONNTRACK_CLOSE_WAIT: + return true; + default: + break; + } + + return false; +} + +void nf_conntrack_tcp_set_closing(struct nf_conn *ct) +{ + enum tcp_conntrack old_state; + const unsigned int *timeouts; + u32 timeout; + + if (!nf_ct_is_confirmed(ct)) + return; + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; + + if (old_state == TCP_CONNTRACK_CLOSE || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_unlock_bh(&ct->lock); + return; + } + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + const struct nf_tcp_net *tn; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = tn->timeouts; + } + + timeout = timeouts[TCP_CONNTRACK_CLOSE]; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + + spin_unlock_bh(&ct->lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); +} + +static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) +{ + state->td_end = 0; + state->td_maxend = 0; + state->td_maxwin = 0; + state->td_maxack = 0; + state->td_scale = 0; + state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Returns verdict for packet, or -1 for invalid. 
*/ @@ -842,9 +968,9 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; + enum nf_ct_tcp_action res; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; @@ -857,7 +983,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; - if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) + if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); @@ -865,7 +991,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: @@ -900,7 +1025,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, return -NF_REPEAT; return NF_DROP; } - /* Fall through */ + fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * @@ -939,8 +1064,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; - memset(&ct->proto.tcp.seen[dir], 0, - sizeof(struct ip_ct_tcp_state)); + nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; @@ -980,9 +1104,18 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } + + /* possible challenge ack reply to syn */ + if (old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && + dir == IP_CT_DIR_REPLY) + ct->proto.tcp.last_ack = ntohl(th->ack_seq); + spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, state, + "packet (index %d) in dir %d ignored, state %s", + index, dir, + tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or @@ -1001,10 +1134,11 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, } /* Invalid packet */ - pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", - dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "packet (index %d) in dir %d invalid, state %s", + index, dir, + tcp_conntrack_names[old_state]); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -1019,7 +1153,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); + nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -1038,13 +1172,33 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (index != TCP_RST_SET) break; - if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) { + /* If we are closing, tuple might have been re-used already. + * last_index, last_ack, and all other ct fields used for + * sequence/window validation are outdated in that case. 
+ * + * As the conntrack can already be expired by GC under pressure, + * just skip validation checks. + */ + if (tcp_can_early_drop(ct)) + goto in_window; + + /* td_maxack might be outdated if we let a SYN through earlier */ + if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && + ct->proto.tcp.last_index != TCP_SYN_SET) { u32 seq = ntohl(th->seq); - if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) { + /* If we are not in established state and SEQ=0 this is most + * likely an answer to a SYN we let go through above (last_index + * can be updated due to out-of-order ACKs). + */ + if (seq == 0 && !nf_conntrack_tcp_established(ct)) + break; + + if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && + !tn->tcp_ignore_invalid_rst) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); + nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } @@ -1082,29 +1236,38 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, * segments we ignored. */ goto in_window; } + + /* Reset in response to a challenge-ack we let through earlier */ + if (old_state == TCP_CONNTRACK_SYN_SENT && + ct->proto.tcp.last_index == TCP_ACK_SET && + ct->proto.tcp.last_dir == IP_CT_DIR_REPLY && + ntohl(th->seq) == ct->proto.tcp.last_ack) + goto in_window; + break; default: /* Keep compilers happy. */ break; } - if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th)) { + res = tcp_in_window(ct, dir, index, + skb, dataoff, th, state); + switch (res) { + case NFCT_TCP_IGNORE: + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + case NFCT_TCP_INVALID: + nf_tcp_handle_invalid(ct, dir, index, skb, state); spin_unlock_bh(&ct->lock); return -NF_ACCEPT; + case NFCT_TCP_ACCEPT: + break; } in_window: /* From now on we have got in-window packets */ ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; - pr_debug("tcp_conntracks: "); - nf_ct_dump_tuple(tuple); - pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) @@ -1142,6 +1305,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } + + if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { + /* do not renew timeout on SYN retransmit. + * + * Else port reuse by client or NAT middlebox can keep + * entry alive indefinitely (including nat info). + */ + return NF_ACCEPT; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ @@ -1152,7 +1325,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { - /* Set ASSURED if we see see valid ack in ESTABLISHED + /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. 
*/ set_bit(IPS_ASSURED_BIT, &ct->status); @@ -1163,29 +1336,13 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, return NF_ACCEPT; } -static bool tcp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.tcp.state) { - case TCP_CONNTRACK_FIN_WAIT: - case TCP_CONNTRACK_LAST_ACK: - case TCP_CONNTRACK_TIME_WAIT: - case TCP_CONNTRACK_CLOSE: - case TCP_CONNTRACK_CLOSE_WAIT: - return true; - default: - break; - } - - return false; -} - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; @@ -1195,8 +1352,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || - nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) @@ -1211,8 +1373,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; @@ -1428,9 +1590,30 @@ void nf_conntrack_tcp_init_net(struct net *net) * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; - tn->tcp_loose = nf_ct_tcp_loose; - tn->tcp_be_liberal = nf_ct_tcp_be_liberal; - tn->tcp_max_retrans = nf_ct_tcp_max_retrans; + + /* If it is set to zero, we disable picking up already established + * connections. + */ + tn->tcp_loose = 1; + + /* "Be conservative in what you do, + * be liberal in what you accept from others." + * If it's non-zero, we mark only out of window RST segments as INVALID. + */ + tn->tcp_be_liberal = 0; + + /* If it's non-zero, we turn off RST sequence number check */ + tn->tcp_ignore_invalid_rst = 0; + + /* Max number of the retransmitted packets without receiving an (acceptable) + * ACK from the destination. If this number is reached, a shorter timer + * will be started. 
+ */ + tn->tcp_max_retrans = 3; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + tn->offload_timeout = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 760ca2422816..0030fbe8885c 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -38,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, @@ -81,18 +80,6 @@ static bool udp_error(struct sk_buff *skb, return false; } -static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, - struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - u32 extra_jiffies) -{ - if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && - ct->status & IPS_NAT_CLASH)) - nf_ct_kill(ct); - else - nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); -} - /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -101,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { unsigned int *timeouts; + unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -109,27 +97,34 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); - if (!nf_ct_is_confirmed(ct)) + status = READ_ONCE(ct->status); + if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; + bool stream = false; /* Still active after two seconds? Extend timeout. 
*/ - if (time_after(jiffies, ct->proto.udp.stream_ts)) + if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; + stream = (status & IPS_ASSURED) == 0; + } nf_ct_refresh_acct(ct, ctinfo, skb, extra); + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -139,8 +134,7 @@ static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDPLITE, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, @@ -206,12 +200,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); + + if (unlikely((ct->status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -276,6 +273,10 @@ void nf_conntrack_udp_init_net(struct net *net) for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + un->offload_timeout = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 1aebd6569d4e..13dc421fc4f5 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>"); MODULE_DESCRIPTION("SANE connection tracking helper"); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); -static char *sane_buffer; - -static DEFINE_SPINLOCK(nf_sane_lock); - #define MAX_PORTS 8 static u_int16_t ports[MAX_PORTS]; static unsigned int ports_c; @@ -67,14 +63,16 @@ static int help(struct sk_buff *skb, unsigned int dataoff, datalen; const struct tcphdr *th; struct tcphdr _tcph; - void *sb_ptr; int ret = NF_ACCEPT; int dir = CTINFO2DIR(ctinfo); struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; - struct sane_request *req; struct sane_reply_net_start *reply; + union { + struct sane_request req; + struct sane_reply_net_start repl; + } buf; /* Until there's been traffic both ways, don't look in packets. 
*/ if (ctinfo != IP_CT_ESTABLISHED && @@ -92,56 +90,62 @@ static int help(struct sk_buff *skb, return NF_ACCEPT; datalen = skb->len - dataoff; - - spin_lock_bh(&nf_sane_lock); - sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); - BUG_ON(sb_ptr == NULL); - if (dir == IP_CT_DIR_ORIGINAL) { + const struct sane_request *req; + if (datalen != sizeof(struct sane_request)) - goto out; + return NF_ACCEPT; + + req = skb_header_pointer(skb, dataoff, datalen, &buf.req); + if (!req) + return NF_ACCEPT; - req = sb_ptr; if (req->RPC_code != htonl(SANE_NET_START)) { /* Not an interesting command */ - ct_sane_info->state = SANE_STATE_NORMAL; - goto out; + WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL); + return NF_ACCEPT; } /* We're interested in the next reply */ - ct_sane_info->state = SANE_STATE_START_REQUESTED; - goto out; + WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED); + return NF_ACCEPT; } + /* IP_CT_DIR_REPLY */ + /* Is it a reply to an uninteresting command? */ - if (ct_sane_info->state != SANE_STATE_START_REQUESTED) - goto out; + if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED) + return NF_ACCEPT; /* It's a reply to SANE_NET_START. */ - ct_sane_info->state = SANE_STATE_NORMAL; + WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL); if (datalen < sizeof(struct sane_reply_net_start)) { pr_debug("NET_START reply too short\n"); - goto out; + return NF_ACCEPT; } - reply = sb_ptr; + datalen = sizeof(struct sane_reply_net_start); + + reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl); + if (!reply) + return NF_ACCEPT; + if (reply->status != htonl(SANE_STATUS_SUCCESS)) { /* saned refused the command */ pr_debug("unsuccessful SANE_STATUS = %u\n", ntohl(reply->status)); - goto out; + return NF_ACCEPT; } /* Invalid saned reply? Ignore it. 
*/ if (reply->zero != 0) - goto out; + return NF_ACCEPT; exp = nf_ct_expect_alloc(ct); if (exp == NULL) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); - ret = NF_DROP; - goto out; + return NF_DROP; } tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; @@ -159,9 +163,6 @@ static int help(struct sk_buff *skb, } nf_ct_expect_put(exp); - -out: - spin_unlock_bh(&nf_sane_lock); return ret; } @@ -175,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { static void __exit nf_conntrack_sane_fini(void) { nf_conntrack_helpers_unregister(sane, ports_c * 2); - kfree(sane_buffer); } static int __init nf_conntrack_sane_init(void) @@ -184,10 +184,6 @@ static int __init nf_conntrack_sane_init(void) NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master)); - sane_buffer = kmalloc(65536, GFP_KERNEL); - if (!sane_buffer) - return -ENOMEM; - if (ports_c == 0) ports[ports_c++] = SANE_PORT; @@ -207,7 +203,6 @@ static int __init nf_conntrack_sane_init(void) ret = nf_conntrack_helpers_register(sane, ports_c * 2); if (ret < 0) { pr_err("failed to register helpers\n"); - kfree(sane_buffer); return ret; } diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index 3066449f8bd8..7ab2b25b57bc 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct, this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset); - -static const struct nf_ct_ext_type nf_ct_seqadj_extend = { - .len = sizeof(struct nf_conn_seqadj), - .align = __alignof__(struct nf_conn_seqadj), - .id = NF_CT_EXT_SEQADJ, -}; - -int nf_conntrack_seqadj_init(void) -{ - return nf_ct_extend_register(&nf_ct_seqadj_extend); -} - -void nf_conntrack_seqadj_fini(void) -{ - nf_ct_extend_unregister(&nf_ct_seqadj_extend); -} diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index b83dc9bf0a5d..ca748f8dbff1 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -60,7 +60,7 @@ module_param(sip_external_media, int, 0600); MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external " "endpoints (default 0)"); -const struct nf_nat_sip_hooks *nf_nat_sip_hooks; +const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks; EXPORT_SYMBOL_GPL(nf_nat_sip_hooks); static int string_len(const struct nf_conn *ct, const char *dptr, @@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, return ret; if (ret == 0) break; - dataoff += *matchoff; + dataoff = *matchoff; } *in_header = 0; } @@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, break; if (ret == 0) return ret; - dataoff += *matchoff; + dataoff = *matchoff; } if (in_header) @@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, start += strlen(name); *val = simple_strtoul(start, &end, 0); if (start == end) - return 0; + return -1; if (matchoff && matchlen) { *matchoff = start - dptr; *matchlen = end - start; @@ -1229,6 +1229,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, struct nf_conntrack_expect *exp; union nf_inet_addr *saddr, daddr; const struct nf_nat_sip_hooks *hooks; + struct nf_conntrack_helper *helper; __be16 port; u8 proto; unsigned int expires = 0; @@ -1289,10 +1290,14 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (sip_direct_signalling) saddr = 
&ct->tuplehash[!dir].tuple.src.u3; + helper = rcu_dereference(nfct_help(ct)->helper); + if (!helper) + return NF_DROP; + nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - exp->helper = nfct_help(ct)->helper; + exp->helper = helper; exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); @@ -1548,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; @@ -1619,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 6a26299cb064..207b240b14e5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -14,6 +14,7 @@ #include <linux/sysctl.h> #endif +#include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> @@ -60,18 +61,13 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; - case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), ntohs(tuple->dst.u.udp.port)); break; - case IPPROTO_DCCP: - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); - break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), @@ -98,69 +94,87 @@ struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; + unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, + struct ct_iter_state *st) { - struct ct_iter_state *st = seq->private; + struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int i; - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} + for (i = st->bucket; i < st->htable_size; i++) { + unsigned int skip = 0; -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; +restart: + hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct hlist_nulls_node *tmp = n; + + if (!net_eq(net, nf_ct_net(ct))) + continue; + + if (++skip <= st->skip_elems) + continue; - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return NULL; + /* h should be returned, skip to nulls marker. 
*/ + while (!is_a_nulls(tmp)) + tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); + + /* check if h is still linked to hash[i] */ + if (get_nulls_value(tmp) != i) { + skip = 0; + goto restart; + } + + st->skip_elems = skip; + st->bucket = i; + return h; } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); + skip = 0; + if (get_nulls_value(n) != i) + goto restart; - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; + st->skip_elems = 0; + } + + st->bucket = i; + return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); + + if (*pos == 0) { + st->skip_elems = 0; + st->bucket = 0; + } else if (st->skip_elems) { + /* resume from last dumped entry */ + st->skip_elems--; + } + + return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct ct_iter_state *st = s->private; + struct net *net = seq_file_net(s); + (*pos)++; - return ct_get_next(s, v); + return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) @@ -172,17 +186,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) @@ -262,16 +275,16 @@ static const char* l4proto_name(u16 proto) case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; - case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; + case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; } -static unsigned int +static void seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; @@ -279,14 +292,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) acct = nf_conn_acct_find(ct); if (!acct) - return 0; + return; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); - - return 0; } /* return 0 on success, 1 in case of error */ @@ -299,10 +310,16 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; WARN_ON(!ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; + /* load ->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } @@ -335,8 +352,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_has_overflowed(s)) goto release; - if 
(seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL); if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); @@ -345,8 +361,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); - if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_REPLY); if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); @@ -359,14 +374,14 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) - seq_printf(s, "mark=%u ", ct->mark); + seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); + seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; @@ -424,24 +439,26 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); - unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; + unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } + nr_conntracks = nf_conntrack_count(net); + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - 0, + st->clash_resolve, st->found, 0, st->invalid, - st->ignore, 0, 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, @@ -507,22 +524,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ +u32 nf_conntrack_count(const struct net *net) +{ + const struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + return atomic_read(&cnet->count); +} +EXPORT_SYMBOL_GPL(nf_conntrack_count); + /* Sysctl support */ #ifdef CONFIG_SYSCTL -/* Log invalid packets of a given protocol */ -static int log_invalid_proto_min __read_mostly; -static int log_invalid_proto_max __read_mostly = 255; - /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int -nf_conntrack_hash_sysctl(struct ctl_table *table, int write, +nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; + /* module_param hashsize could have changed value */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; @@ -535,6 +559,29 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write, return ret; } +static int +nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, i; + + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + if (*(u8 *)table->data == 0) + return 0; + + /* Load nf_log_syslog only if no logger is currently registered */ + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + if (nf_log_is_registered(i)) + return 0; + } 
+ request_module("%s", "nf_log_syslog"); + + return 0; +} + static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { @@ -545,7 +592,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, - NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif @@ -563,11 +609,18 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, +#endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, + NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, +#endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP @@ -579,39 +632,27 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, - NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, -#endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, - NF_SYSCTL_CT_PROTO_DCCP_LOOSE, #endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", - .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, @@ -626,43 +667,34 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, + .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - 
[NF_SYSCTL_CT_HELPER] = { - .procname = "nf_conntrack_helper", - .data = &init_net.ct.sysctl_auto_assign_helper, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -670,20 +702,20 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_TWO, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -754,27 +786,43 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { + .procname = "nf_flowtable_tcp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { + .procname = "nf_conntrack_tcp_ignore_invalid_rst", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", @@ -788,6 +836,14 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { + .procname = "nf_flowtable_udp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), @@ -849,64 +905,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { - .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { - .procname = "nf_conntrack_dccp_timeout_request", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { - .procname = 
"nf_conntrack_dccp_timeout_respond", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { - .procname = "nf_conntrack_dccp_timeout_partopen", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { - .procname = "nf_conntrack_dccp_timeout_open", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { - .procname = "nf_conntrack_dccp_timeout_closereq", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { - .procname = "nf_conntrack_dccp_timeout_closing", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { - .procname = "nf_conntrack_dccp_timeout_timewait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { - .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { @@ -922,7 +920,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -931,9 +928,10 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, @@ -962,7 +960,13 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); + XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; +#endif + } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, @@ -983,34 +987,10 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); - XASSIGN(HEARTBEAT_ACKED, sn); #undef XASSIGN #endif } -static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, - struct ctl_table *table) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net *dn = nf_dccp_pernet(net); - -#define XASSIGN(XNAME, dn) \ - table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ - &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] - - XASSIGN(REQUEST, dn); - XASSIGN(RESPOND, dn); - XASSIGN(PARTOPEN, dn); - XASSIGN(OPEN, dn); - XASSIGN(CLOSEREQ, dn); - XASSIGN(CLOSING, dn); - XASSIGN(TIMEWAIT, dn); -#undef XASSIGN - - table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; -#endif -} - static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { @@ -1024,6 +1004,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = 
nf_udp_pernet(net); struct ctl_table *table; @@ -1034,11 +1015,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (!table) return -ENOMEM; - table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; + table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif @@ -1050,27 +1030,25 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; +#endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); - nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); - /* Don't allow unprivileged users to alter certain sysctls */ - if (net->user_ns != &init_user_ns) { + /* Don't allow non-init_net ns to alter global sysctls */ + if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; - table[NF_SYSCTL_CT_HELPER].mode = 0444; -#ifdef CONFIG_NF_CONNTRACK_EVENTS - table[NF_SYSCTL_CT_EVENTS].mode = 0444; -#endif - table[NF_SYSCTL_CT_BUCKETS].mode = 0444; - } else if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } - net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); - if (!net->ct.sysctl_header) + cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter", + table, + ARRAY_SIZE(nf_ct_sysctl_table)); + if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; @@ -1082,10 +1060,11 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { - struct ctl_table *table; + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + const struct ctl_table *table; - table = net->ct.sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.sysctl_header); + table = cnet->sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else @@ -1180,11 +1159,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - nf_conntrack_init_end(); return 0; out_pernet: diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 14387e0b8008..0cc584d3dbb1 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -22,19 +22,21 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> -struct nf_ct_timeout * -(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); - -void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); +const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly; 
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook); static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) - RCU_INIT_POINTER(timeout_ext->timeout, NULL); + if (timeout_ext) { + const struct nf_ct_timeout *t; + + t = rcu_access_pointer(timeout_ext->timeout); + + if (!timeout || t == timeout) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + } /* We are not intended to delete this conntrack. */ return 0; @@ -42,37 +44,41 @@ static int untimeout(struct nf_conn *ct, void *timeout) void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) { - nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); + struct nf_ct_iter_data iter_data = { + .net = net, + .data = timeout, + }; + + nf_ct_iterate_cleanup_net(untimeout, &iter_data); } EXPORT_SYMBOL_GPL(nf_ct_untimeout); static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout) { - typeof(nf_ct_timeout_put_hook) timeout_put; + const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook); - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); - if (timeout_put) - timeout_put(timeout); + if (h) + h->timeout_put(timeout); } int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { - typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + const struct nf_ct_timeout_hooks *h; struct nf_ct_timeout *timeout; struct nf_conn_timeout *timeout_ext; const char *errmsg = NULL; int ret = 0; rcu_read_lock(); - timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); - if (!timeout_find_get) { + h = rcu_dereference(nf_ct_timeout_hook); + if (!h) { ret = -ENOENT; errmsg = "Timeout policy base is empty"; goto out; } - timeout = timeout_find_get(net, timeout_name); + timeout = h->timeout_find_get(net, timeout_name); if (!timeout) { ret = -ENOENT; pr_info_ratelimited("No such timeout policy \"%s\"\n", @@ -119,37 +125,22 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout); void nf_ct_destroy_timeout(struct nf_conn *ct) { struct nf_conn_timeout *timeout_ext; - typeof(nf_ct_timeout_put_hook) timeout_put; + const struct nf_ct_timeout_hooks *h; rcu_read_lock(); - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + h = rcu_dereference(nf_ct_timeout_hook); - if (timeout_put) { + if (h) { timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) { - timeout_put(timeout_ext->timeout); + struct nf_ct_timeout *t; + + t = rcu_dereference(timeout_ext->timeout); + if (t) + h->timeout_put(t); RCU_INIT_POINTER(timeout_ext->timeout, NULL); } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout); - -static const struct nf_ct_ext_type timeout_extend = { - .len = sizeof(struct nf_conn_timeout), - .align = __alignof__(struct nf_conn_timeout), - .id = NF_CT_EXT_TIMEOUT, -}; - -int nf_conntrack_timeout_init(void) -{ - int ret = nf_ct_extend_register(&timeout_extend); - if (ret < 0) - pr_err("nf_ct_timeout: Unable to register timeout extension.\n"); - return ret; -} - -void nf_conntrack_timeout_fini(void) -{ - nf_ct_extend_unregister(&timeout_extend); -} diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index f656d393fa92..9e43a0a59e73 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); -static const struct 
nf_ct_ext_type tstamp_extend = { - .len = sizeof(struct nf_conn_tstamp), - .align = __alignof__(struct nf_conn_tstamp), - .id = NF_CT_EXT_TSTAMP, -}; - void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; } - -int nf_conntrack_tstamp_init(void) -{ - int ret; - ret = nf_ct_extend_register(&tstamp_extend); - if (ret < 0) - pr_err("Unable to register extension\n"); - return ret; -} - -void nf_conntrack_tstamp_fini(void) -{ - nf_ct_extend_unregister(&tstamp_extend); -} diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index f108a76925dd..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,13 +13,45 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> -static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) +#define NF_RECURSION_LIMIT 2 + +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return ¤t->net_xmit.nf_dup_skb_recursion; +} + +#endif + +static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, + enum nf_dev_hooks hook) { - if (skb_mac_header_was_set(skb)) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) + goto err; + + if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { + if (skb_cow_head(skb, skb->mac_len)) + goto err; + skb_push(skb, skb->mac_len); + } skb->dev = dev; + skb_clear_tstamp(skb); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); + (*nf_dup_skb_recursion)--; + return; +err: + kfree_skb(skb); } void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) @@ -32,7 +64,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) return; } - nf_do_netdev_egress(pkt->skb, dev); + nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress); @@ -47,7 +79,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) skb = skb_clone(pkt->skb, GFP_ATOMIC); if (skb) - nf_do_netdev_egress(skb, dev); + nf_do_netdev_egress(skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); @@ -73,3 +105,4 @@ EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter packet duplication support"); diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c new file mode 100644 index 000000000000..4a5f5195f2d2 --- /dev/null +++ b/net/netfilter/nf_flow_table_bpf.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable Flow Table Helpers for XDP hook + * + * These are called from the XDP programs. + * Note that it is allowed to break compatibility for these functions since + * the interface they are exposed through to BPF programs is explicitly + * unstable. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <net/netfilter/nf_flow_table.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/xdp.h> + +/* bpf_flowtable_opts - options for bpf flowtable helpers + * @error: out parameter, set for any encountered error + */ +struct bpf_flowtable_opts { + s32 error; +}; + +enum { + NF_BPF_FLOWTABLE_OPTS_SZ = 4, +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in nf_flow_table BTF"); + +__bpf_kfunc_start_defs(); + +static struct flow_offload_tuple_rhash * +bpf_xdp_flow_tuple_lookup(struct net_device *dev, + struct flow_offload_tuple *tuple, __be16 proto) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *nf_flow_table; + struct flow_offload *nf_flow; + + nf_flow_table = nf_flowtable_by_dev(dev); + if (!nf_flow_table) + return ERR_PTR(-ENOENT); + + tuplehash = flow_offload_lookup(nf_flow_table, tuple); + if (!tuplehash) + return ERR_PTR(-ENOENT); + + nf_flow = container_of(tuplehash, struct flow_offload, + tuplehash[tuplehash->tuple.dir]); + flow_offload_refresh(nf_flow_table, nf_flow, false); + + return tuplehash; +} + +__bpf_kfunc struct flow_offload_tuple_rhash * +bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple, + struct bpf_flowtable_opts *opts, u32 opts_len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + struct flow_offload_tuple tuple = { + .iifidx = fib_tuple->ifindex, + .l3proto = fib_tuple->family, + .l4proto = fib_tuple->l4_protocol, + .src_port = fib_tuple->sport, + .dst_port = fib_tuple->dport, + }; + struct flow_offload_tuple_rhash *tuplehash; + __be16 proto; + + if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) { + opts->error = -EINVAL; + return NULL; + } + + switch (fib_tuple->family) { + case AF_INET: + tuple.src_v4.s_addr = fib_tuple->ipv4_src; + tuple.dst_v4.s_addr = fib_tuple->ipv4_dst; + proto = htons(ETH_P_IP); + break; + case AF_INET6: + tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src; + tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst; + proto = htons(ETH_P_IPV6); + break; + default: + opts->error = -EAFNOSUPPORT; + return NULL; + } + + tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto); + if (IS_ERR(tuplehash)) { + opts->error = PTR_ERR(tuplehash); + return NULL; + } + + return tuplehash; +} + +__diag_pop() + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(nf_ft_kfunc_set) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_KFUNCS_END(nf_ft_kfunc_set) + +static const struct btf_kfunc_id_set nf_flow_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_ft_kfunc_set, +}; + +int nf_flow_register_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_flow_kfunc_set); +} +EXPORT_SYMBOL_GPL(nf_flow_register_bpf); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 6a3034f84ab6..06e8251a6644 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -39,22 +39,28 @@ flow_offload_fill_dir(struct flow_offload *flow, ft->l3proto = ctt->src.l3num; ft->l4proto = ctt->dst.protonum; - ft->src_port = ctt->src.u.tcp.port; - ft->dst_port = ctt->dst.u.tcp.port; + + switch (ctt->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + ft->src_port = ctt->src.u.tcp.port; + ft->dst_port = ctt->dst.u.tcp.port; + break; + } } struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { struct flow_offload *flow; - if (unlikely(nf_ct_is_dying(ct) 
|| - !atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(nf_ct_is_dying(ct))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); if (!flow) - goto err_ct_refcnt; + return NULL; + refcount_inc(&ct->ct_general.use); flow->ct = ct; flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); @@ -66,111 +72,184 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; +} +EXPORT_SYMBOL_GPL(flow_offload_alloc); -err_ct_refcnt: - nf_ct_put(ct); +static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) +{ + if (flow_tuple->l3proto == NFPROTO_IPV6) + return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); + + return 0; +} + +static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct dst_entry *dst = route->tuple[dir].dst; + + route->tuple[dir].dst = NULL; - return NULL; + return dst; } -EXPORT_SYMBOL_GPL(flow_offload_alloc); static int flow_offload_fill_route(struct flow_offload *flow, - const struct nf_flow_route *route, + struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; - struct dst_entry *dst = route->tuple[dir].dst; - - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; + struct dst_entry *dst = nft_route_dst_fetch(route, dir); + int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: - flow_tuple->mtu = ip6_dst_mtu_forward(dst); + flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } - flow_tuple->iifidx = other_dst->dev->ifindex; - flow_tuple->dst_cache = dst; + flow_tuple->iifidx = route->tuple[dir].in.ifindex; + for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { + flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; + flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; + if (route->tuple[dir].in.ingress_vlans & BIT(i)) + flow_tuple->in_vlan_ingress |= BIT(j); + j++; + } + + flow_tuple->tun = route->tuple[dir].in.tun; + flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; + + switch (route->tuple[dir].xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, + ETH_ALEN); + memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, + ETH_ALEN); + flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; + dst_release(dst); + break; + case FLOW_OFFLOAD_XMIT_XFRM: + case FLOW_OFFLOAD_XMIT_NEIGH: + flow_tuple->ifidx = route->tuple[dir].out.ifindex; + flow_tuple->dst_cache = dst; + flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); + break; + default: + WARN_ON_ONCE(1); + break; + } + flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } -int flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route) +static void nft_flow_dst_release(struct flow_offload *flow, + enum flow_offload_tuple_dir dir) { - int err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); - if (err < 0) - return err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); - if (err < 0) - goto err_route_reply; + if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) + dst_release(flow->tuplehash[dir].tuple.dst_cache); +} +void 
flow_offload_route_init(struct flow_offload *flow, + struct nf_flow_route *route) +{ + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; - - return 0; - -err_route_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); - - return err; } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +static inline bool nf_flow_has_expired(const struct flow_offload *flow) { - tcp->state = TCP_CONNTRACK_ESTABLISHED; + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) +{ + struct ip_ct_tcp *tcp = &ct->proto.tcp; + + spin_lock_bh(&ct->lock); + if (tcp->state != tcp_state) + tcp->state = tcp_state; + + /* syn packet triggers the TCP reopen case from conntrack. */ + if (tcp->state == TCP_CONNTRACK_CLOSE) + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + /* Conntrack state is outdated due to offload bypass. + * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks + * TCP reset validation will fail. + */ tcp->seen[0].td_maxwin = 0; + tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; + tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; + spin_unlock_bh(&ct->lock); } -#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) -#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) - -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct flow_offload *flow) { - const struct nf_conntrack_l4proto *l4proto; + struct nf_conn *ct = flow->ct; + struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); - unsigned int timeout; + bool expired, closing = false; + u32 offload_timeout = 0; + s32 timeout; + + if (l4num == IPPROTO_TCP) { + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + u8 tcp_state; + + /* Enter CLOSE state if fin/rst packet has been seen, this + * allows TCP reopen from conntrack. Otherwise, pick up from + * the last seen TCP state. + */ + closing = test_bit(NF_FLOW_CLOSING, &flow->flags); + if (closing) { + flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); + timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); + expired = false; + } else { + tcp_state = READ_ONCE(ct->proto.tcp.state); + flow_offload_fixup_tcp(ct, tcp_state); + timeout = READ_ONCE(tn->timeouts[tcp_state]); + expired = nf_flow_has_expired(flow); + } + offload_timeout = READ_ONCE(tn->offload_timeout); - l4proto = nf_ct_l4proto_find(l4num); - if (!l4proto) - return; + } else if (l4num == IPPROTO_UDP) { + const struct nf_udp_net *tn = nf_udp_pernet(net); + enum udp_conntrack state = + test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED; - if (l4num == IPPROTO_TCP) - timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; - else if (l4num == IPPROTO_UDP) - timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; - else + timeout = READ_ONCE(tn->timeouts[state]); + expired = nf_flow_has_expired(flow); + offload_timeout = READ_ONCE(tn->offload_timeout); + } else { return; + } - if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) - ct->timeout = nfct_time_stamp + timeout; -} + if (expired) + timeout -= offload_timeout; -static void flow_offload_fixup_ct_state(struct nf_conn *ct) -{ - if (nf_ct_protonum(ct) == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); -} + if (timeout < 0) + timeout = 0; -static void flow_offload_fixup_ct(struct nf_conn *ct) -{ - flow_offload_fixup_ct_state(ct); - flow_offload_fixup_ct_timeout(ct); + if (closing || + nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) { - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) @@ -191,14 +270,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, @@ -207,7 +286,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; @@ -221,11 +300,30 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { .automatic_shrinking = true, }; +unsigned long flow_offload_get_timeout(struct flow_offload *flow) +{ + unsigned long timeout = NF_FLOW_TIMEOUT; + struct net *net = nf_ct_net(flow->ct); + int l4num = nf_ct_protonum(flow->ct); + + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_timeout; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_timeout; + } + + return timeout; +} + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, @@ -243,6 +341,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } + nf_ct_refresh(flow->ct, NF_CT_DAY); + if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); @@ -253,23 +353,24 @@ int flow_offload_add(struct nf_flowtable 
*flow_table, struct flow_offload *flow) EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, - struct flow_offload *flow) + struct flow_offload *flow, bool force) { - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + u32 timeout; - if (likely(!nf_flowtable_hw_offload(flow_table) || - !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags))) + timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); + if (force || timeout - READ_ONCE(flow->timeout) > HZ) + WRITE_ONCE(flow->timeout, timeout); + else + return; + + if (likely(!nf_flowtable_hw_offload(flow_table)) || + test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return nf_flow_timeout_delta(flow->timeout) <= 0; -} - static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -279,22 +380,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table, rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - - clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - - if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(flow->ct); - else - flow_offload_fixup_ct_timeout(flow->ct); - flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { - set_bit(NF_FLOW_TEARDOWN, &flow->flags); - - flow_offload_fixup_ct_state(flow->ct); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); + if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -325,7 +418,8 @@ EXPORT_SYMBOL_GPL(flow_offload_lookup); static int nf_flow_table_iterate(struct nf_flowtable *flow_table, - void (*iter)(struct flow_offload *flow, void *data), + void (*iter)(struct nf_flowtable *flowtable, + struct flow_offload *flow, void *data), void *data) { struct flow_offload_tuple_rhash *tuplehash; @@ -349,7 +443,7 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); - iter(flow, data); + iter(flow_table, flow, data); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); @@ -357,14 +451,124 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) +static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, + const struct flow_offload *flow) { - struct nf_flowtable *flow_table = data; + return flow_table->type->gc && flow_table->type->gc(flow); +} - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) - set_bit(NF_FLOW_TEARDOWN, &flow->flags); +/** + * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry + * @ct: Flowtable offloaded tcp ct + * + * Return: number of seconds when ct entry should expire. 
+ */ +static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) +{ + u8 state = READ_ONCE(ct->proto.tcp.state); - if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + switch (state) { + case TCP_CONNTRACK_SYN_SENT: + case TCP_CONNTRACK_SYN_RECV: + return 0; + case TCP_CONNTRACK_ESTABLISHED: + return NF_CT_DAY; + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_CLOSE_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + return 5 * 60 * HZ; + case TCP_CONNTRACK_CLOSE: + return 0; + } + + return 0; +} + +/** + * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry + * @ct: Flowtable offloaded ct + * + * Datapath lookups in the conntrack table will evict nf_conn entries + * if they have expired. + * + * Once nf_conn entries have been offloaded, nf_conntrack might not see any + * packets anymore. Thus ct->timeout is no longer refreshed and ct can + * be evicted. + * + * To avoid the need for an additional check on the offload bit for every + * packet processed via nf_conntrack_in(), set an arbitrary timeout large + * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT + * from the packet path via nf_ct_is_expired(). + */ +static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) +{ + static const u32 min_timeout = 5 * 60 * HZ; + u32 expires = nf_ct_expires(ct); + + /* normal case: large enough timeout, nothing to do. */ + if (likely(expires >= min_timeout)) + return; + + /* must check offload bit after this, we do not hold any locks. + * flowtable and ct entries could have been removed on another CPU. + */ + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return; + + /* load ct->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + + if (nf_ct_is_confirmed(ct) && + test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + u8 l4proto = nf_ct_protonum(ct); + u32 new_timeout = true; + + switch (l4proto) { + case IPPROTO_UDP: + new_timeout = NF_CT_DAY; + break; + case IPPROTO_TCP: + new_timeout = nf_flow_table_tcp_timeout(ct); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* Update to ct->timeout from nf_conntrack happens + * without holding ct->lock. + * + * Use cmpxchg to ensure timeout extension doesn't + * happen when we race with conntrack datapath. + * + * The inverse -- datapath updating ->timeout right + * after this -- is fine, datapath is authoritative. 
+ */ + if (new_timeout) { + new_timeout += nfct_time_stamp; + cmpxchg(&ct->timeout, expires, new_timeout); + } + } + + nf_ct_put(ct); +} + +static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, + struct flow_offload *flow, void *data) +{ + bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); + + if (nf_flow_has_expired(flow) || + nf_ct_is_dying(flow->ct) || + nf_flow_custom_gc(flow_table, flow)) { + flow_offload_teardown(flow); + teardown = true; + } else if (!teardown) { + nf_flow_table_extend_ct_timeout(flow->ct); + } + + if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); @@ -373,129 +577,72 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && + test_bit(NF_FLOW_HW, &flow->flags) && + !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { + nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } +void nf_flow_table_gc_run(struct nf_flowtable *flow_table) +{ + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); +} + static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); + nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, - flow_setup_cb_t *cb, void *cb_priv) -{ - struct flow_block *block = &flow_table->flow_block; - struct flow_block_cb *block_cb; - int err = 0; - - down_write(&flow_table->flow_block_lock); - block_cb = flow_block_cb_lookup(block, cb, cb_priv); - if (block_cb) { - err = -EEXIST; - goto unlock; - } - - block_cb = flow_block_cb_alloc(cb, cb_priv, cb_priv, NULL); - if (IS_ERR(block_cb)) { - err = PTR_ERR(block_cb); - goto unlock; - } - - list_add_tail(&block_cb->list, &block->cb_list); - -unlock: - up_write(&flow_table->flow_block_lock); - return err; -} -EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); - -void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, - flow_setup_cb_t *cb, void *cb_priv) -{ - struct flow_block *block = &flow_table->flow_block; - struct flow_block_cb *block_cb; - - down_write(&flow_table->flow_block_lock); - block_cb = flow_block_cb_lookup(block, cb, cb_priv); - if (block_cb) { - list_del(&block_cb->list); - flow_block_cb_free(block_cb); - } else { - WARN_ON(true); - } - up_write(&flow_table->flow_block_lock); -} -EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); - -static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) { struct tcphdr *tcph; - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); - - return 0; + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); } -static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 
new_port) { struct udphdr *udph; - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, - new_port, true); + new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, - u8 protocol, __be16 port, __be16 new_port) +static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: - if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: - if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } - - return 0; } -int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; - if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || - skb_try_make_writable(skb, thoff + sizeof(*hdr))) - return -1; - hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { @@ -509,25 +656,19 @@ int nf_flow_snat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; - default: - return -1; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); -int nf_flow_dnat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, u8 protocol, + enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; - if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || - skb_try_make_writable(skb, thoff + sizeof(*hdr))) - return -1; - hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { @@ -541,11 +682,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; - default: - return -1; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); @@ -553,7 +692,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; - INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); @@ -573,7 +712,8 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) } EXPORT_SYMBOL_GPL(nf_flow_table_init); -static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) +static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table, + struct flow_offload *flow, void *data) { struct net_device *dev = data; @@ -614,24 +754,89 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) mutex_unlock(&flowtable_lock); 
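The port-rewriting helpers above lean on inet_proto_csum_replace2() to patch the TCP/UDP checksum incrementally rather than recomputing it over the whole segment. A minimal user-space sketch of that arithmetic (RFC 1624, HC' = ~(~HC + ~m + m')); the function name, the host-byte-order simplification and the sample values are assumptions for illustration, not the kernel helper:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: incremental one's-complement checksum update when a
 * single 16-bit field (e.g. a port) changes from old_val to new_val. The
 * kernel helper additionally deals with __wsum folding, pseudo-header
 * updates and CHECKSUM_PARTIAL skbs.
 */
static uint16_t csum16_replace(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	while (sum >> 16)			/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}

int main(void)
{
	/* hypothetical values: rewrite destination port 80 -> 8080 */
	printf("0x%04x\n", csum16_replace(0x1c46, 80, 8080));
	return 0;
}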
cancel_delayed_work_sync(&flow_table->gc_work); - nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); nf_flow_table_offload_flush(flow_table); - if (nf_flowtable_hw_offload(flow_table)) - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, - flow_table); + /* ... no more pending work after this stage ... */ + nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); + nf_flow_table_gc_run(flow_table); + nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int nf_flow_table_init_net(struct net *net) +{ + net->ft.stat = alloc_percpu(struct nf_flow_table_stat); + return net->ft.stat ? 0 : -ENOMEM; +} + +static void nf_flow_table_fini_net(struct net *net) +{ + free_percpu(net->ft.stat); +} + +static int nf_flow_table_pernet_init(struct net *net) +{ + int ret; + + ret = nf_flow_table_init_net(net); + if (ret < 0) + return ret; + + ret = nf_flow_table_init_proc(net); + if (ret < 0) + goto out_proc; + + return 0; + +out_proc: + nf_flow_table_fini_net(net); + return ret; +} + +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + nf_flow_table_fini_proc(net); + nf_flow_table_fini_net(net); + } +} + +static struct pernet_operations nf_flow_table_net_ops = { + .init = nf_flow_table_pernet_init, + .exit_batch = nf_flow_table_pernet_exit, +}; + static int __init nf_flow_table_module_init(void) { - return nf_flow_table_offload_init(); + int ret; + + ret = register_pernet_subsys(&nf_flow_table_net_ops); + if (ret < 0) + return ret; + + ret = nf_flow_table_offload_init(); + if (ret) + goto out_offload; + + ret = nf_flow_register_bpf(); + if (ret) + goto out_bpf; + + return 0; + +out_bpf: + nf_flow_table_offload_exit(); +out_offload: + unregister_pernet_subsys(&nf_flow_table_net_ops); + return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); + unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); @@ -639,3 +844,4 @@ module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter flow table module"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 88bedf1ff1ae..b0f199171932 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -6,12 +6,33 @@ #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> +#include <linux/if_vlan.h> static unsigned int nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct vlan_ethhdr *veth; + __be16 proto; + switch (skb->protocol) { + case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return NF_ACCEPT; + + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + proto = veth->h_vlan_encapsulated_proto; + break; + case htons(ETH_P_PPP_SES): + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; + break; + default: + proto = skb->protocol; + break; + } + + switch (proto) { case htons(ETH_P_IP): return nf_flow_offload_ip_hook(priv, skb, state); case htons(ETH_P_IPV6): @@ -22,7 +43,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, } static int nf_flow_rule_route_inet(struct net *net, - const struct 
flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -54,8 +75,30 @@ static struct nf_flowtable_type flowtable_inet = { .owner = THIS_MODULE, }; +static struct nf_flowtable_type flowtable_ipv4 = { + .family = NFPROTO_IPV4, + .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv4, + .free = nf_flow_table_free, + .hook = nf_flow_offload_ip_hook, + .owner = THIS_MODULE, +}; + +static struct nf_flowtable_type flowtable_ipv6 = { + .family = NFPROTO_IPV6, + .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_ipv6, + .free = nf_flow_table_free, + .hook = nf_flow_offload_ipv6_hook, + .owner = THIS_MODULE, +}; + static int __init nf_flow_inet_module_init(void) { + nft_register_flowtable_type(&flowtable_ipv4); + nft_register_flowtable_type(&flowtable_ipv6); nft_register_flowtable_type(&flowtable_inet); return 0; @@ -64,6 +107,8 @@ static int __init nf_flow_inet_module_init(void) static void __exit nf_flow_inet_module_exit(void) { nft_unregister_flowtable_type(&flowtable_inet); + nft_unregister_flowtable_type(&flowtable_ipv6); + nft_unregister_flowtable_type(&flowtable_ipv4); } module_init(nf_flow_inet_module_init); @@ -71,4 +116,7 @@ module_exit(nf_flow_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */ +MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module"); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index a3bca758b849..78883343e5d6 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -7,6 +7,8 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> +#include <linux/if_ether.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -25,42 +27,33 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, if (proto != IPPROTO_TCP) return 0; - if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); - if (unlikely(tcph->fin || tcph->rst)) { + if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } + if ((tcph->fin || tcph->rst) && + !test_bit(NF_FLOW_CLOSING, &flow->flags)) + set_bit(NF_FLOW_CLOSING, &flow->flags); + return 0; } -static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct tcphdr *tcph; - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); - - return 0; } -static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct udphdr *udph; - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, @@ -68,31 
+61,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, - unsigned int thoff, __be32 addr, - __be32 new_addr) +static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) { switch (iph->protocol) { case IPPROTO_TCP: - if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr); break; case IPPROTO_UDP: - if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_udp(skb, thoff, addr, new_addr); break; } - - return 0; } -static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_snat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -107,17 +94,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; iph->daddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -132,31 +117,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; iph->saddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, - unsigned int thoff, enum flow_offload_tuple_dir dir) +static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, enum flow_offload_tuple_dir dir, + struct iphdr *iph) { - struct iphdr *iph = ip_hdr(skb); - - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) - return -1; - - iph = ip_hdr(skb); - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_snat_ip(flow, skb, iph, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_dnat_ip(flow, skb, iph, thoff, dir); + } } static bool ip_has_options(unsigned int thoff) @@ -164,44 +142,122 @@ static bool ip_has_options(unsigned int thoff) return thoff != sizeof(struct iphdr); } -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, +static void 
nf_flow_tuple_encap(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + __be16 inner_proto = skb->protocol; + struct vlan_ethhdr *veth; + struct pppoe_hdr *phdr; + struct iphdr *iph; + u16 offset = 0; + int i = 0; + + if (skb_vlan_tag_present(skb)) { + tuple->encap[i].id = skb_vlan_tag_get(skb); + tuple->encap[i].proto = skb->vlan_proto; + i++; + } + switch (skb->protocol) { + case htons(ETH_P_8021Q): + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + tuple->encap[i].id = ntohs(veth->h_vlan_TCI); + tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; + break; + case htons(ETH_P_PPP_SES): + phdr = (struct pppoe_hdr *)skb_network_header(skb); + tuple->encap[i].id = ntohs(phdr->sid); + tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; + break; + } + + if (inner_proto == htons(ETH_P_IP)) { + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (iph->protocol == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + tuple->tun.l3_proto = IPPROTO_IPIP; + } + } +} + +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; +}; + +static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; + u8 ipproto; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return -1; - iph = ip_hdr(skb); - thoff = iph->ihl * 4; + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; - if (iph->protocol != IPPROTO_TCP && - iph->protocol != IPPROTO_UDP) + thoff += ctx->offset; + + ipproto = iph->protocol; + switch (ipproto) { + case IPPROTO_TCP: + ctx->hdrsize = sizeof(struct tcphdr); + break; + case IPPROTO_UDP: + ctx->hdrsize = sizeof(struct udphdr); + break; +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: + ctx->hdrsize = sizeof(struct gre_base_hdr); + break; +#endif + default: return -1; + } if (iph->ttl <= 1) return -1; - thoff = iph->ihl * 4; - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; - iph = ip_hdr(skb); - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + switch (ipproto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + break; + case IPPROTO_GRE: { + struct gre_base_hdr *greh; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); + if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) + return -1; + break; + } + } + + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr; tuple->dst_v4.s_addr = iph->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; tuple->l3proto = AF_INET; - tuple->l4proto = iph->protocol; - tuple->iifidx = dev->ifindex; + tuple->l4proto = ipproto; + tuple->iifidx = ctx->in->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } @@ -218,12 +274,13 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static int nf_flow_offload_dst_check(struct dst_entry *dst) +static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple) { - if (unlikely(dst_xfrm(dst))) - return dst_check(dst, 0) 
? 0 : -1; + if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH && + tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM) + return true; - return 0; + return dst_check(tuple->dst_cache, tuple->dst_cookie); } static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, @@ -236,106 +293,402 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -unsigned int -nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - struct rtable *rt; - unsigned int thoff; struct iphdr *iph; - __be32 nexthop; + u16 size; - if (skb->protocol != htons(ETH_P_IP)) - return NF_ACCEPT; + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + return false; - if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) - return NF_ACCEPT; + iph = (struct iphdr *)(skb_network_header(skb) + *psize); + size = iph->ihl << 2; - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) + *psize += size; + + return true; +} + +static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + + if (iph->protocol != IPPROTO_IPIP) + return; + + skb_pull(skb, iph->ihl << 2); + skb_reset_network_header(skb); +} + +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, + u32 *offset) +{ + __be16 inner_proto = skb->protocol; + struct vlan_ethhdr *veth; + bool ret = false; + + switch (skb->protocol) { + case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return false; + + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + if (veth->h_vlan_encapsulated_proto == proto) { + *offset += VLAN_HLEN; + inner_proto = proto; + ret = true; + } + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { + *offset += PPPOE_SES_HLEN; + ret = true; + } + break; + } + + if (inner_proto == htons(ETH_P_IP)) + ret = nf_flow_ip4_tunnel_proto(skb, offset); + + return ret; +} + +static void nf_flow_encap_pop(struct sk_buff *skb, + struct flow_offload_tuple_rhash *tuplehash) +{ + struct vlan_hdr *vlan_hdr; + int i; + + for (i = 0; i < tuplehash->tuple.encap_num; i++) { + if (skb_vlan_tag_present(skb)) { + __vlan_hwaccel_clear_tag(skb); + continue; + } + switch (skb->protocol) { + case htons(ETH_P_8021Q): + vlan_hdr = (struct vlan_hdr *)skb->data; + __skb_pull(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vlan_hdr); + skb_reset_network_header(skb); + break; + case htons(ETH_P_PPP_SES): + skb->protocol = __nf_flow_pppoe_proto(skb); + skb_pull(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + break; + } + } + + if (skb->protocol == htons(ETH_P_IP)) + nf_flow_ip4_tunnel_pop(skb); +} + +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + skb->dev = xmit->outdev; + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); + dev_queue_xmit(skb); + + return NF_STOLEN; +} + +static struct 
flow_offload_tuple_rhash * +nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, struct sk_buff *skb) +{ + struct flow_offload_tuple tuple = {}; + + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) +{ + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + unsigned int thoff, mtu; + struct iphdr *iph; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; - - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) - return NF_ACCEPT; - if (skb_try_make_writable(skb, sizeof(*iph))) - return NF_DROP; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); - thoff = ip_hdr(skb)->ihl * 4; - if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) - return NF_ACCEPT; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) + return 0; - flow_offload_refresh(flow_table, flow); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + thoff = (iph->ihl * 4) + ctx->offset; + if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) + return 0; - if (nf_flow_offload_dst_check(&rt->dst)) { + if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; + + flow_offload_refresh(flow_table, flow, false); + + nf_flow_encap_pop(skb, tuplehash); + thoff -= ctx->offset; iph = ip_hdr(skb); + nf_flow_nat_ip(flow, skb, thoff, dir, iph); + ip_decrease_ttl(iph); - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + return 1; +} + +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + __be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable *rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = 
skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct nf_flow_xmit xmit = {}; + struct flow_offload *flow; + struct neighbour *neigh; + struct rtable *rt; + __be32 ip_daddr; + int ret; + + tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); + if (!tuplehash) + return NF_ACCEPT; + + ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; - return NF_STOLEN; + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; + + switch (tuplehash->tuple.xmit_type) { + case FLOW_OFFLOAD_XMIT_NEIGH: + rt = dst_rtable(tuplehash->tuple.dst_cache); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; + skb_dst_set_noref(skb, &rt->dst); + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); 
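For the FLOW_OFFLOAD_XMIT_DIRECT case, the destination and source MACs were resolved when the flow was offloaded and are simply replayed by dev_hard_header() before dev_queue_xmit(). A rough user-space sketch of the 14-byte Ethernet header that path writes in front of the payload; the struct and helper names here are illustrative assumptions, not kernel definitions:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Illustrative only: the layout filled with the cached
 * tuple->out.h_dest / h_source addresses on direct transmit.
 */
struct eth_hdr {
	uint8_t  h_dest[6];
	uint8_t  h_source[6];
	uint16_t h_proto;		/* network byte order */
} __attribute__((packed));

static size_t push_eth_header(uint8_t *frame, const uint8_t dest[6],
			      const uint8_t source[6], uint16_t proto)
{
	struct eth_hdr h;

	memcpy(h.h_dest, dest, 6);
	memcpy(h.h_source, source, 6);
	h.h_proto = (uint16_t)((proto >> 8) | (proto << 8));	/* byte swap, i.e. htons() on little-endian hosts */
	memcpy(frame, &h, sizeof(h));
	return sizeof(h);		/* payload starts here */
}

int main(void)
{
	static const uint8_t dst[6] = { 0x02, 0, 0, 0, 0, 0x01 };
	static const uint8_t src[6] = { 0x02, 0, 0, 0, 0, 0x02 };
	uint8_t frame[1514] = { 0 };

	printf("header bytes: %zu\n", push_eth_header(frame, dst, src, 0x0800));
	return 0;
}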
+ if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; + break; + default: + WARN_ON_ONCE(1); + return NF_DROP; + } + + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); -static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr, + struct ipv6hdr *ip6h) { struct tcphdr *tcph; - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, new_addr->s6_addr32, true); - - return 0; } -static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) { struct udphdr *udph; - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, @@ -343,32 +696,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) { switch (ip6h->nexthdr) { case IPPROTO_TCP: - if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h); break; case IPPROTO_UDP: - if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr); break; } - - return 0; } -static int nf_flow_snat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -383,17 +730,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; ip6h->daddr = new_addr; break; - default: - return -1; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_dnat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -408,139 +753,231 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; ip6h->saddr = 
new_addr; break; - default: - return -1; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_nat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, - enum flow_offload_tuple_dir dir) +static void nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir, + struct ipv6hdr *ip6h) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) - return -1; - - ip6h = ipv6_hdr(skb); - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir); + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir); + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir); + } } -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, +static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; + u8 nexthdr; - if (!pskb_may_pull(skb, sizeof(*ip6h))) + thoff = sizeof(*ip6h) + ctx->offset; + if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = ipv6_hdr(skb); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); - if (ip6h->nexthdr != IPPROTO_TCP && - ip6h->nexthdr != IPPROTO_UDP) + nexthdr = ip6h->nexthdr; + switch (nexthdr) { + case IPPROTO_TCP: + ctx->hdrsize = sizeof(struct tcphdr); + break; + case IPPROTO_UDP: + ctx->hdrsize = sizeof(struct udphdr); + break; +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: + ctx->hdrsize = sizeof(struct gre_base_hdr); + break; +#endif + default: return -1; + } if (ip6h->hop_limit <= 1) return -1; - thoff = sizeof(*ip6h); - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; - ip6h = ipv6_hdr(skb); - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + switch (nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + break; + case IPPROTO_GRE: { + struct gre_base_hdr *greh; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); + if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) + return -1; + break; + } + } + + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v6 = ip6h->saddr; tuple->dst_v6 = ip6h->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; tuple->l3proto = AF_INET6; - tuple->l4proto = ip6h->nexthdr; - tuple->iifidx = dev->ifindex; + tuple->l4proto = nexthdr; + tuple->iifidx = ctx->in->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) { - struct flow_offload_tuple_rhash 
*tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; enum flow_offload_tuple_dir dir; - const struct in6_addr *nexthop; struct flow_offload *flow; - struct net_device *outdev; + unsigned int thoff, mtu; struct ipv6hdr *ip6h; - struct rt6_info *rt; - - if (skb->protocol != htons(ETH_P_IPV6)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) - return NF_ACCEPT; - - if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb, - sizeof(*ip6h))) - return NF_ACCEPT; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) + return 0; - flow_offload_refresh(flow_table, flow); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); + thoff = sizeof(*ip6h) + ctx->offset; + if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) + return 0; - if (nf_flow_offload_dst_check(&rt->dst)) { + if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, sizeof(*ip6h))) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; - if (nf_flow_nat_ipv6(flow, skb, dir) < 0) - return NF_DROP; + flow_offload_refresh(flow_table, flow, false); + + nf_flow_encap_pop(skb, tuplehash); ip6h = ipv6_hdr(skb); + nf_flow_nat_ipv6(flow, skb, dir, ip6h); + ip6h->hop_limit--; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + return 1; +} + +static struct flow_offload_tuple_rhash * +nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct sk_buff *skb) +{ + struct flow_offload_tuple tuple = {}; + + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; + struct flow_offload *flow; + struct neighbour *neigh; + struct rt6_info *rt; + int ret; + + tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb); + if (tuplehash == NULL) + return NF_ACCEPT; + + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - skb->dev = outdev; - nexthop = rt6_nexthop(rt, 
&flow->tuplehash[!dir].tuple.src_v6); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; - return NF_STOLEN; + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; + + switch (tuplehash->tuple.xmit_type) { + case FLOW_OFFLOAD_XMIT_NEIGH: + rt = dst_rt6_info(tuplehash->tuple.dst_cache); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; + skb_dst_set_noref(skb, &rt->dst); + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; + break; + default: + WARN_ON_ONCE(1); + return NF_DROP; + } + + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 62651e6683f6..d8f7bfd60ac6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -13,12 +13,13 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> -static struct workqueue_struct *nf_flow_offload_wq; +static struct workqueue_struct *nf_flow_offload_add_wq; +static struct workqueue_struct *nf_flow_offload_del_wq; +static struct workqueue_struct *nf_flow_offload_stats_wq; struct flow_offload_work { struct list_head list; enum flow_cls_command cmd; - int priority; struct nf_flowtable *flowtable; struct flow_offload *flow; struct work_struct work; @@ -33,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; - unsigned int enc_keys; + unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; @@ -42,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; - enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -54,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, @@ -63,19 +64,29 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, sizeof(struct in6_addr)); if (memcmp(&key->enc_ipv6.src, &in6addr_any, sizeof(struct in6_addr))) - memset(&key->enc_ipv6.src, 0xff, + memset(&mask->enc_ipv6.src, 0xff, sizeof(struct 
in6_addr)); if (memcmp(&key->enc_ipv6.dst, &in6addr_any, sizeof(struct in6_addr))) - memset(&key->enc_ipv6.dst, 0xff, + memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } match->dissector.used_keys |= enc_keys; } +static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key, + struct flow_dissector_key_vlan *mask, + u16 vlan_id, __be16 proto) +{ + key->vlan_id = vlan_id; + mask->vlan_id = VLAN_VID_MASK; + key->vlan_tpid = proto; + mask->vlan_tpid = 0xffff; +} + static int nf_flow_rule_match(struct nf_flow_match *match, const struct flow_offload_tuple *tuple, struct dst_entry *other_dst) @@ -83,6 +94,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; struct ip_tunnel_info *tun_info; + bool vlan_encap = false; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); @@ -97,9 +109,39 @@ static int nf_flow_rule_match(struct nf_flow_match *match, nf_flow_rule_lwt_match(match, tun_info); } - key->meta.ingress_ifindex = tuple->iifidx; + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC) + key->meta.ingress_ifindex = tuple->tc.iifidx; + else + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && + tuple->encap[0].proto == htons(ETH_P_8021Q)) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[0].id, + tuple->encap[0].proto); + vlan_encap = true; + } + + if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) && + tuple->encap[1].proto == htons(ETH_P_8021Q)) { + if (vlan_encap) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN, + cvlan); + nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } else { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, + vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } + } + switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -121,16 +163,17 @@ static int nf_flow_rule_match(struct nf_flow_match *match, return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(key->control.addr_type); + match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: + case IPPROTO_GRE: break; default: return -EOPNOTSUPP; @@ -139,15 +182,22 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; - key->tp.src = tuple->src_port; - mask->tp.src = 0xffff; - key->tp.dst = tuple->dst_port; - mask->tp.dst = 0xffff; + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); + + switch (tuple->l4proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + 
key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); + break; + } - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_PORTS); return 0; } @@ -175,27 +225,43 @@ static int flow_offload_eth_src(struct net *net, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); - struct net_device *dev; + const struct flow_offload_tuple *other_tuple, *this_tuple; + struct net_device *dev = NULL; + const unsigned char *addr; u32 mask, val; u16 val16; - dev = dev_get_by_index(net, tuple->iifidx); - if (!dev) - return -ENOENT; + this_tuple = &flow->tuplehash[dir].tuple; + + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + addr = this_tuple->out.h_source; + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + dev = dev_get_by_index(net, other_tuple->iifidx); + if (!dev) + return -ENOENT; + + addr = dev->dev_addr; + break; + default: + return -EOPNOTSUPP; + } mask = ~0xffff0000; - memcpy(&val16, dev->dev_addr, 2); + memcpy(&val16, addr, 2); val = val16 << 16; flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); mask = ~0xffffffff; - memcpy(&val, dev->dev_addr + 2, 4); + memcpy(&val, addr + 2, 4); flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, &val, &mask); + dev_put(dev); return 0; @@ -208,27 +274,40 @@ static int flow_offload_eth_dst(struct net *net, { struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); - const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct flow_offload_tuple *other_tuple, *this_tuple; const struct dst_entry *dst_cache; unsigned char ha[ETH_ALEN]; struct neighbour *n; + const void *daddr; u32 mask, val; u8 nud_state; u16 val16; - dst_cache = flow->tuplehash[dir].tuple.dst_cache; - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -ENOENT; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); + this_tuple = &flow->tuplehash[dir].tuple; - if (!(nud_state & NUD_VALID)) { + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + ether_addr_copy(ha, this_tuple->out.h_dest); + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + daddr = &other_tuple->src_v4; + dst_cache = this_tuple->dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -ENOENT; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); neigh_release(n); - return -ENOENT; + + if (!(nud_state & NUD_VALID)) + return -ENOENT; + break; + default: + return -EOPNOTSUPP; } mask = ~0xffffffff; @@ -241,7 +320,6 @@ static int flow_offload_eth_dst(struct net *net, val = val16; flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); - neigh_release(n); return 0; } @@ -307,10 +385,10 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, struct flow_action_entry *entry; int i; - for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = 
flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, - offset + i, &addr[i], mask); + offset + i * sizeof(u32), &addr[i], mask); } } @@ -463,27 +541,52 @@ static void flow_offload_ipv4_checksum(struct net *net, } } -static void flow_offload_redirect(const struct flow_offload *flow, +static void flow_offload_redirect(struct net *net, + const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - struct flow_action_entry *entry = flow_action_entry_next(flow_rule); - struct rtable *rt; + const struct flow_offload_tuple *this_tuple, *other_tuple; + struct flow_action_entry *entry; + struct net_device *dev; + int ifindex; + + this_tuple = &flow->tuplehash[dir].tuple; + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + this_tuple = &flow->tuplehash[dir].tuple; + ifindex = this_tuple->out.ifidx; + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + ifindex = other_tuple->iifidx; + break; + default: + return; + } - rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + dev = dev_get_by_index(net, ifindex); + if (!dev) + return; + + entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_REDIRECT; - entry->dev = rt->dst.dev; - dev_hold(rt->dst.dev); + entry->dev = dev; } static void flow_offload_encap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *this_tuple; struct flow_action_entry *entry; struct dst_entry *dst; - dst = flow->tuplehash[dir].tuple.dst_cache; + this_tuple = &flow->tuplehash[dir].tuple; + if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) + return; + + dst = this_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; @@ -500,10 +603,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *other_tuple; struct flow_action_entry *entry; struct dst_entry *dst; - dst = flow->tuplehash[!dir].tuple.dst_cache; + other_tuple = &flow->tuplehash[!dir].tuple; + if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) + return; + + dst = other_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; @@ -515,10 +623,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, } } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int +nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *other_tuple; + const struct flow_offload_tuple *tuple; + int i; + flow_offload_decap_tunnel(flow, dir, flow_rule); flow_offload_encap_tunnel(flow, dir, flow_rule); @@ -526,6 +639,53 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; + tuple = &flow->tuplehash[dir].tuple; + + for (i = 0; i < tuple->encap_num; i++) { + struct flow_action_entry *entry; + + if (tuple->in_vlan_ingress & BIT(i)) + continue; + + if (tuple->encap[i].proto == htons(ETH_P_8021Q)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_VLAN_POP; + } + } + + other_tuple = &flow->tuplehash[!dir].tuple; + + for (i = 0; i < other_tuple->encap_num; i++) { + struct flow_action_entry 
*entry; + + if (other_tuple->in_vlan_ingress & BIT(i)) + continue; + + entry = flow_action_entry_next(flow_rule); + + switch (other_tuple->encap[i].proto) { + case htons(ETH_P_PPP_SES): + entry->id = FLOW_ACTION_PPPOE_PUSH; + entry->pppoe.sid = other_tuple->encap[i].id; + break; + case htons(ETH_P_8021Q): + entry->id = FLOW_ACTION_VLAN_PUSH; + entry->vlan.vid = other_tuple->encap[i].id; + entry->vlan.proto = other_tuple->encap[i].proto; + break; + } + } + + return 0; +} + +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) + return -1; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv4_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); @@ -538,21 +698,17 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, test_bit(NF_FLOW_DNAT, &flow->flags)) flow_offload_ipv4_checksum(net, flow, flow_rule); - flow_offload_redirect(flow, dir, flow_rule); + flow_offload_redirect(net, flow, dir, flow_rule); return 0; } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - flow_offload_decap_tunnel(flow, dir, flow_rule); - flow_offload_encap_tunnel(flow, dir, flow_rule); - - if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || - flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { @@ -564,7 +720,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, flow_offload_port_dnat(net, flow, dir, flow_rule); } - flow_offload_redirect(flow, dir, flow_rule); + flow_offload_redirect(net, flow, dir, flow_rule); return 0; } @@ -578,10 +734,10 @@ nf_flow_offload_rule_alloc(struct net *net, enum flow_offload_tuple_dir dir) { const struct nf_flowtable *flowtable = offload->flowtable; - const struct flow_offload *flow = offload->flow; - const struct flow_offload_tuple *tuple; + const struct flow_offload_tuple *tuple, *other_tuple; + struct flow_offload *flow = offload->flow; + struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; - struct dst_entry *other_dst; int err = -ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); @@ -597,7 +753,10 @@ nf_flow_offload_rule_alloc(struct net *net, flow_rule->rule->match.key = &flow_rule->match.key; tuple = &flow->tuplehash[dir].tuple; - other_dst = flow->tuplehash[!dir].tuple.dst_cache; + other_tuple = &flow->tuplehash[!dir].tuple; + if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + other_dst = other_tuple->dst_cache; + err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); if (err < 0) goto err_flow_match; @@ -682,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; @@ -714,7 +873,8 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { return nf_flow_offload_tuple(offload->flowtable, offload->flow, - flow_rule, dir, offload->priority, + flow_rule, dir, + 
offload->flowtable->priority, FLOW_CLS_REPLACE, NULL, &offload->flowtable->flow_block.cb_list); } @@ -723,7 +883,8 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, - offload->priority, FLOW_CLS_DESTROY, NULL, + offload->flowtable->priority, + FLOW_CLS_DESTROY, NULL, &offload->flowtable->flow_block.cb_list); } @@ -734,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); - ok_count += flow_offload_tuple_add(offload, flow_rule[1], - FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; @@ -753,10 +915,11 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) - set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); - else - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + goto out; + + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); +out: nf_flow_offload_destroy(flow_rule); } @@ -764,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } @@ -773,7 +937,8 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload, struct flow_stats *stats) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, - offload->priority, FLOW_CLS_STATS, stats, + offload->flowtable->priority, + FLOW_CLS_STATS, stats, &offload->flowtable->flow_block.cb_list); } @@ -783,11 +948,13 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); - flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, + &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, - lastused + NF_FLOW_TIMEOUT); + lastused + flow_offload_get_timeout(offload->flow)); if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { if (stats[0].pkts) @@ -804,17 +971,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) static void flow_offload_work_handler(struct work_struct *work) { struct flow_offload_work *offload; + struct net *net; offload = container_of(work, struct flow_offload_work, work); + net = read_pnet(&offload->flowtable->net); switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); + NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add); break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); + NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del); break; case FLOW_CLS_STATS: flow_offload_work_stats(offload); + NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats); break; default: WARN_ON_ONCE(1); @@ -826,7 +998,18 @@ static void flow_offload_work_handler(struct work_struct *work) static void 
flow_offload_queue_work(struct flow_offload_work *offload) { - queue_work(nf_flow_offload_wq, &offload->work); + struct net *net = read_pnet(&offload->flowtable->net); + + if (offload->cmd == FLOW_CLS_REPLACE) { + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add); + queue_work(nf_flow_offload_add_wq, &offload->work); + } else if (offload->cmd == FLOW_CLS_DESTROY) { + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del); + queue_work(nf_flow_offload_del_wq, &offload->work); + } else { + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats); + queue_work(nf_flow_offload_stats_wq, &offload->work); + } } static struct flow_offload_work * @@ -846,7 +1029,6 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, offload->cmd = cmd; offload->flow = flow; - offload->priority = flowtable->priority; offload->flowtable = flowtable; INIT_WORK(&offload->work, flow_offload_work_handler); @@ -886,7 +1068,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10)) return; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); @@ -896,10 +1078,21 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, flow_offload_queue_work(offload); } +void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable) +{ + if (nf_flowtable_hw_offload(flowtable)) { + flush_workqueue(nf_flow_offload_del_wq); + nf_flow_table_gc_run(flowtable); + } +} + void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { - if (nf_flowtable_hw_offload(flowtable)) - flush_workqueue(nf_flow_offload_wq); + if (nf_flowtable_hw_offload(flowtable)) { + flush_workqueue(nf_flow_offload_add_wq); + flush_workqueue(nf_flow_offload_del_wq); + flush_workqueue(nf_flow_offload_stats_wq); + } } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, @@ -909,6 +1102,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, struct flow_block_cb *block_cb, *next; int err = 0; + down_write(&flowtable->flow_block_lock); switch (cmd) { case FLOW_BLOCK_BIND: list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); @@ -923,6 +1117,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, WARN_ON_ONCE(1); err = -EOPNOTSUPP; } + up_write(&flowtable->flow_block_lock); return err; } @@ -939,6 +1134,7 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &flowtable->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } @@ -950,6 +1146,7 @@ static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) nf_flow_table_gc_cleanup(flowtable, dev); down_write(&flowtable->flow_block_lock); list_del(&block_cb->list); + list_del(&block_cb->driver_list); flow_block_cb_free(block_cb); up_write(&flowtable->flow_block_lock); } @@ -963,7 +1160,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo, nf_flow_table_indr_cleanup); } @@ -977,7 +1174,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); + down_write(&flowtable->flow_block_lock); err = 
dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + up_write(&flowtable->flow_block_lock); if (err < 0) return err; @@ -993,7 +1192,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, int err; if (!nf_flowtable_hw_offload(flowtable)) - return 0; + return nf_flow_offload_xdp_setup(flowtable, dev, cmd); if (dev->netdev_ops->ndo_setup_tc) err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, @@ -1010,15 +1209,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); int nf_flow_table_offload_init(void) { - nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", - WQ_UNBOUND, 0); - if (!nf_flow_offload_wq) + nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_add_wq) return -ENOMEM; + nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_del_wq) + goto err_del_wq; + + nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_stats_wq) + goto err_stats_wq; + return 0; + +err_stats_wq: + destroy_workqueue(nf_flow_offload_del_wq); +err_del_wq: + destroy_workqueue(nf_flow_offload_add_wq); + return -ENOMEM; } void nf_flow_table_offload_exit(void) { - destroy_workqueue(nf_flow_offload_wq); + destroy_workqueue(nf_flow_offload_add_wq); + destroy_workqueue(nf_flow_offload_del_wq); + destroy_workqueue(nf_flow_offload_stats_wq); } diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..f0984cf69a09 --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> +#include <net/inet_dscp.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_flow_table.h> + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if 
(!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + struct flow_offload_tunnel tun; + u8 num_tuns; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + case DEV_PATH_TUN: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + if (path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; + } + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { + info->indev = NULL; + break; + } + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + info->outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = 
ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + route->tuple[dir].out.ifindex = info.outdev->ifindex; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, 
route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, !dir, ft); + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c new file mode 100644 index 000000000000..159b033a43e6 --- /dev/null +++ b/net/netfilter/nf_flow_table_procfs.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <net/netfilter/nf_flow_table.h> + +static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ft.stat, cpu); + } + + return NULL; +} + +static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ft.stat, cpu); + } + (*pos)++; + return NULL; +} + +static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_flow_table_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "wq_add wq_del wq_stats\n"); + return 0; + } + + seq_printf(seq, "%8d %8d %8d\n", + st->count_wq_add, + st->count_wq_del, + st->count_wq_stats + ); + return 0; +} + +static const struct seq_operations nf_flow_table_cpu_seq_ops = { + .start = nf_flow_table_cpu_seq_start, + .next = nf_flow_table_cpu_seq_next, + .stop = nf_flow_table_cpu_seq_stop, + .show = nf_flow_table_cpu_seq_show, +}; + +int nf_flow_table_init_proc(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_create_net("nf_flowtable", 0444, net->proc_net_stat, + &nf_flow_table_cpu_seq_ops, + sizeof(struct seq_net_private)); + return pde ? 0 : -ENOMEM; +} + +void nf_flow_table_fini_proc(struct net *net) +{ + remove_proc_entry("nf_flowtable", net->proc_net_stat); +} diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c new file mode 100644 index 000000000000..e1252d042699 --- /dev/null +++ b/net/netfilter/nf_flow_table_xdp.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> + +struct flow_offload_xdp_ft { + struct list_head head; + struct nf_flowtable *ft; + struct rcu_head rcuhead; +}; + +struct flow_offload_xdp { + struct hlist_node hnode; + unsigned long net_device_addr; + struct list_head head; +}; + +#define NF_XDP_HT_BITS 4 +static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS); +static DEFINE_MUTEX(nf_xdp_hashtable_lock); + +/* caller must hold rcu read lock */ +struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev) +{ + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp *iter; + + hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + struct flow_offload_xdp_ft *ft_elem; + + /* The user is supposed to insert a given net_device + * just into a single nf_flowtable so we always return + * the first element here. 
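The nf_flowtable_by_dev() lookup just above must run under the RCU read lock, as its comment notes, and it returns NULL until a device has been bound via FLOW_BLOCK_BIND. A minimal caller sketch; the wrapper function example_dev_has_flowtable() is illustrative only and not part of this patch:

#include <linux/rcupdate.h>
#include <net/netfilter/nf_flow_table.h>

/* Sketch only: check whether an ingress device was bound to a flowtable. */
static bool example_dev_has_flowtable(const struct net_device *dev)
{
	bool bound;

	rcu_read_lock();
	bound = nf_flowtable_by_dev(dev) != NULL;
	rcu_read_unlock();

	return bound;
}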
+ */ + ft_elem = list_first_or_null_rcu(&iter->head, + struct flow_offload_xdp_ft, + head); + return ft_elem ? ft_elem->ft : NULL; + } + } + + return NULL; +} + +static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp_ft *ft_elem; + + ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT); + if (!ft_elem) + return -ENOMEM; + + ft_elem->ft = ft; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (!elem) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT); + if (!elem) + goto err_unlock; + + elem->net_device_addr = key; + INIT_LIST_HEAD(&elem->head); + hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key); + } + list_add_tail_rcu(&ft_elem->head, &elem->head); + + mutex_unlock(&nf_xdp_hashtable_lock); + + return 0; + +err_unlock: + mutex_unlock(&nf_xdp_hashtable_lock); + kfree(ft_elem); + + return -ENOMEM; +} + +static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (elem) { + struct flow_offload_xdp_ft *ft_elem, *ft_next; + + list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) { + if (ft_elem->ft == ft) { + list_del_rcu(&ft_elem->head); + kfree_rcu(ft_elem, rcuhead); + } + } + + if (list_empty(&elem->head)) + hash_del_rcu(&elem->hnode); + else + elem = NULL; + } + + mutex_unlock(&nf_xdp_hashtable_lock); + + if (elem) { + synchronize_rcu(); + kfree(elem); + } +} + +int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + switch (cmd) { + case FLOW_BLOCK_BIND: + return nf_flowtable_by_dev_insert(flowtable, dev); + case FLOW_BLOCK_UNBIND: + nf_flowtable_by_dev_remove(flowtable, dev); + return 0; + } + + WARN_ON_ONCE(1); + return 0; +} diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c new file mode 100644 index 000000000000..2d890dd04ff8 --- /dev/null +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sysctl.h> +#include <net/lwtunnel.h> +#include <net/netfilter/nf_hooks_lwtunnel.h> +#include <linux/netfilter.h> + +#include "nf_internals.h" + +static inline int nf_hooks_lwtunnel_get(void) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return 1; + else + return 0; +} + +static inline int nf_hooks_lwtunnel_set(int enable) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) { + if (!enable) + return -EBUSY; + } else if (enable) { + static_branch_enable(&nf_hooks_lwtunnel_enabled); + } + + return 0; +} + +#ifdef CONFIG_SYSCTL +int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int proc_nf_hooks_lwtunnel_enabled = 0; + struct ctl_table tmp = { + .procname = table->procname, + .data = &proc_nf_hooks_lwtunnel_enabled, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + int ret; + + if (!write) + proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get(); + + ret = proc_dointvec_minmax(&tmp, 
write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} +#else +int __init netfilter_lwtunnel_init(void) { return 0; } +void netfilter_lwtunnel_fini(void) {} +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179f0..25403023060b 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 6cb9f9474b05..74cef8bf554c 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -31,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) int i; for (i = 0; i < NF_LOG_TYPE_MAX; i++) { - if (loggers[pf][i] == NULL) + log = nft_log_dereference(loggers[pf][i]); + if (!log) continue; - log = nft_log_dereference(loggers[pf][i]); if (!strncasecmp(str_logger, log->name, strlen(log->name))) return log; } @@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); +/** + * nf_log_is_registered - Check if any logger is registered for a given + * protocol family. + * + * @pf: Protocol family + * + * Returns: true if at least one logger is active for @pf, false otherwise. 
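nf_log_is_registered(), documented just above, lets a caller skip building a log message entirely when no backend logger is bound for the address family. A minimal usage sketch; the wrapper function and its prefix string are illustrative only, not part of this patch:

#include <linux/netfilter.h>
#include <net/netfilter/nf_log.h>

/* Sketch only: log a dropped IPv4 packet, but only if a logger exists. */
static void example_log_drop(struct net *net, const struct sk_buff *skb,
			     const struct nf_hook_state *state)
{
	if (!nf_log_is_registered(NFPROTO_IPV4))
		return;

	nf_log_packet(net, NFPROTO_IPV4, state->hook, skb,
		      state->in, state->out, NULL, "example drop: ");
}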
+ */ +bool nf_log_is_registered(u_int8_t pf) +{ + int i; + + if (pf >= NFPROTO_NUMPROTO) { + WARN_ON_ONCE(1); + return false; + } + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (rcu_access_pointer(loggers[pf][i])) + return true; + } + + return false; +} +EXPORT_SYMBOL(nf_log_is_registered); + int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger) { @@ -151,18 +177,16 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf) } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_logger_request_module(int pf, enum nf_log_type type) -{ - if (loggers[pf][type] == NULL) - request_module("nf-logger-%u-%u", pf, type); -} -EXPORT_SYMBOL_GPL(nf_logger_request_module); - int nf_logger_find_get(int pf, enum nf_log_type type) { struct nf_logger *logger; int ret = -ENOENT; + if (pf >= ARRAY_SIZE(loggers)) + return -EINVAL; + if (type >= NF_LOG_TYPE_MAX) + return -EINVAL; + if (pf == NFPROTO_INET) { ret = nf_logger_find_get(NFPROTO_IPV4, type); if (ret < 0) @@ -177,9 +201,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type) return 0; } - if (rcu_access_pointer(loggers[pf][type]) == NULL) - request_module("nf-logger-%u-%u", pf, type); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); if (logger == NULL) @@ -203,11 +224,12 @@ void nf_logger_put(int pf, enum nf_log_type type) return; } - BUG_ON(loggers[pf][type] == NULL); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); - module_put(logger->me); + if (!logger) + WARN_ON_ONCE(1); + else + module_put(logger->me); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_logger_put); @@ -399,7 +421,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -410,10 +432,9 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; -static int nf_log_proc_dostring(struct ctl_table *table, int write, +static int nf_log_proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -453,9 +474,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, mutex_lock(&nf_log_mutex); logger = nft_log_dereference(net->nf.nf_loggers[tindex]); if (!logger) - strlcpy(buf, "NONE", sizeof(buf)); + strscpy(buf, "NONE", sizeof(buf)); else - strlcpy(buf, logger->name, sizeof(buf)); + strscpy(buf, logger->name, sizeof(buf)); mutex_unlock(&nf_log_mutex); r = proc_dostring(&tmp, write, buffer, lenp, ppos); } @@ -497,9 +518,10 @@ static int netfilter_log_sysctl_init(struct net *net) for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) table[i].extra2 = net; - net->nf.nf_log_dir_header = register_net_sysctl(net, - "net/netfilter/nf_log", - table); + net->nf.nf_log_dir_header = register_net_sysctl_sz(net, + "net/netfilter/nf_log", + table, + ARRAY_SIZE(nf_log_sysctl_table)); if (!net->nf.nf_log_dir_header) goto err_reg; @@ -517,7 +539,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c deleted file mode 100644 index 
ae5628ddbe6d..000000000000 --- a/net/netfilter/nf_log_common.c +++ /dev/null @@ -1,212 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_bridge.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset) -{ - struct udphdr _udph; - const struct udphdr *uh; - - if (proto == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - nf_log_buf_add(m, "PROTO=UDP "); - else /* Max length: 14 "PROTO=UDPLITE " */ - nf_log_buf_add(m, "PROTO=UDPLITE "); - - if (fragment) - goto out; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); - if (uh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", - ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); - -out: - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_udp_header); - -int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags) -{ - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - nf_log_buf_add(m, "PROTO=TCP "); - - if (fragment) - return 0; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (th == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u ", - ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & NF_LOG_TCPSEQ) { - nf_log_buf_add(m, "SEQ=%u ACK=%u ", - ntohl(th->seq), ntohl(th->ack_seq)); - } - - /* Max length: 13 "WINDOW=65535 " */ - nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & - TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - nf_log_buf_add(m, "CWR "); - if (th->ece) - nf_log_buf_add(m, "ECE "); - if (th->urg) - nf_log_buf_add(m, "URG "); - if (th->ack) - nf_log_buf_add(m, "ACK "); - if (th->psh) - nf_log_buf_add(m, "PSH "); - if (th->rst) - nf_log_buf_add(m, "RST "); - if (th->syn) - nf_log_buf_add(m, "SYN "); - if (th->fin) - nf_log_buf_add(m, "FIN "); - /* Max length: 11 "URGP=65535 " */ - nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - nf_log_buf_add(m, "OPT (TRUNCATED)"); - return 1; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - nf_log_buf_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - nf_log_buf_add(m, "%02X", op[i]); - - nf_log_buf_add(m, ") "); - } 
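The dump helpers being removed here from nf_log_common.c reappear as static functions in the new nf_log_syslog.c further down in this patch. All of them are built on the same nf_log_buf pattern, roughly as in this sketch (example_log_line() is illustrative only):

#include <linux/skbuff.h>
#include <net/netfilter/nf_log.h>

/* Sketch only: accumulate fields into one line and emit it to the kernel log. */
static void example_log_line(const struct sk_buff *skb)
{
	struct nf_log_buf *m = nf_log_buf_open();

	nf_log_buf_add(m, "LEN=%u MARK=0x%x ", skb->len, skb->mark);
	nf_log_buf_close(m);	/* flushes the accumulated line */
}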
- - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); - -void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, - struct sock *sk) -{ - if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) - return; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; - nf_log_buf_add(m, "UID=%u GID=%u ", - from_kuid_munged(&init_user_ns, cred->fsuid), - from_kgid_munged(&init_user_ns, cred->fsgid)); - } - read_unlock_bh(&sk->sk_callback_lock); -} -EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid); - -void -nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) -{ - const struct net_device *physoutdev __maybe_unused; - const struct net_device *physindev __maybe_unused; - - nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", - '0' + loginfo->u.log.level, prefix, - in ? in->name : "", - out ? out->name : ""); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); - if (physindev && in != physindev) - nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = nf_bridge_get_physoutdev(skb); - if (physoutdev && out != physoutdev) - nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); -#endif -} -EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); - -/* bridge and netdev logging families share this code. */ -void nf_log_l2packet(struct net *net, u_int8_t pf, - __be16 protocol, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - switch (protocol) { - case htons(ETH_P_IP): - nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_IPV6): - nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_ARP): - case htons(ETH_P_RARP): - nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - } -} -EXPORT_SYMBOL_GPL(nf_log_l2packet); - -static int __init nf_log_common_init(void) -{ - return 0; -} - -static void __exit nf_log_common_exit(void) {} - -module_init(nf_log_common_init); -module_exit(nf_log_common_exit); - -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c deleted file mode 100644 index 968dafa684c9..000000000000 --- a/net/netfilter/nf_log_netdev.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org> - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_log.h> - -static void nf_log_netdev_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out, - loginfo, prefix); -} - -static struct nf_logger nf_netdev_logger __read_mostly = { - .name = "nf_log_netdev", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_netdev_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_netdev_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_NETDEV, 
&nf_netdev_logger); -} - -static void __net_exit nf_log_netdev_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_netdev_logger); -} - -static struct pernet_operations nf_log_netdev_net_ops = { - .init = nf_log_netdev_net_init, - .exit = nf_log_netdev_net_exit, -}; - -static int __init nf_log_netdev_init(void) -{ - int ret; - - /* Request to load the real packet loggers. */ - nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); - - ret = register_pernet_subsys(&nf_log_netdev_net_ops); - if (ret < 0) - return ret; - - nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); - return 0; -} - -static void __exit nf_log_netdev_exit(void) -{ - unregister_pernet_subsys(&nf_log_netdev_net_ops); - nf_log_unregister(&nf_netdev_logger); -} - -module_init(nf_log_netdev_init); -module_exit(nf_log_netdev_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter netdev packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c new file mode 100644 index 000000000000..86d5fc5d28e3 --- /dev/null +++ b/net/netfilter/nf_log_syslog.c @@ -0,0 +1,1085 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static const struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = LOGLEVEL_NOTICE, + .logflags = NF_LOG_DEFAULT_MASK, + }, + }, +}; + +struct arppayload { + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +/* Guard against containers flooding syslog. 
*/ +static bool nf_log_allowed(const struct net *net) +{ + return net_eq(net, &init_net) || sysctl_nf_log_all_netns; +} + +static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +static void noinline_for_stack +dump_arp_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int nhoff) +{ + const struct arppayload *ap; + struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; + + ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph); + if (!ah) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); + /* If it's for Ethernet and the lengths are OK, then log the ARP + * payload. + */ + if (ah->ar_hrd != htons(ARPHRD_ETHER) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != sizeof(__be32)) + return; + + ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp); + if (!ap) { + nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", + skb->len - sizeof(_arph)); + return; + } + nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); +} + +static void +nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix, + struct net *net) +{ + const struct net_device *physoutdev __maybe_unused; + const struct net_device *physindev __maybe_unused; + + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? 
out->name : ""); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + physindev = nf_bridge_get_physindev(skb, net); + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); +#endif +} + +static void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + if (!nf_log_allowed(net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix, net); + dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_arp_logger __read_mostly = { + .name = "nf_log_arp", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_arp_packet, + .me = THIS_MODULE, +}; + +static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) +{ + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + + nf_log_buf_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static noinline_for_stack int +nf_log_dump_tcp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + nf_log_buf_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!th) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & NF_LOG_TCPSEQ) { + nf_log_buf_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + } + + /* Max length: 13 "WINDOW=65535 " */ + nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->ae) + nf_log_buf_add(m, "AE "); + if (th->cwr) + nf_log_buf_add(m, "CWR "); + if (th->ece) + nf_log_buf_add(m, "ECE "); + if (th->urg) + nf_log_buf_add(m, "URG "); + if (th->ack) + nf_log_buf_add(m, "ACK "); + if (th->psh) + nf_log_buf_add(m, "PSH "); + if (th->rst) + nf_log_buf_add(m, "RST "); + if (th->syn) + nf_log_buf_add(m, "SYN "); + if (th->fin) + nf_log_buf_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); + u8 _opt[60 - sizeof(struct tcphdr)]; + unsigned int i; + const u8 *op; + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (!op) { + nf_log_buf_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* 
Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + + nf_log_buf_add(m, ") "); + } + + return 0; +} + +static noinline_for_stack int +nf_log_dump_udp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + nf_log_buf_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + nf_log_buf_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!uh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); + +out: + return 0; +} + +/* One level of recursion won't kill us */ +static noinline_for_stack void +dump_ipv4_packet(struct net *net, struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int iphoff) +{ + const struct iphdr *ih; + unsigned int logflags; + struct iphdr _iph; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (!ih) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. + * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " + */ + nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + nf_log_buf_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + nf_log_buf_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + nf_log_buf_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & NF_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + const unsigned char *op; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff + sizeof(_iph), + optsize, _opt); + if (!op) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + nf_log_buf_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff + ih->ihl * 4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff + ih->ihl * 4)) + return; + break; + case IPPROTO_ICMP: { + static const size_t required_len[NR_ICMP_TYPES + 1] = { + [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + 
[ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + const struct icmphdr *ich; + struct icmphdr _icmph; + + /* Max length: 11 "PROTO=ICMP " */ + nf_log_buf_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (!ich) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + nf_log_buf_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + fallthrough; + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + nf_log_buf_add(m, "["); + dump_ipv4_packet(net, m, info, skb, + iphoff + ih->ihl * 4 + sizeof(_icmph)); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) { + nf_log_buf_add(m, "MTU=%u ", + ntohs(ich->un.frag.mtu)); + } + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + const struct ip_auth_hdr *ah; + struct ip_auth_hdr _ahdr; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + nf_log_buf_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_ahdr), &_ahdr); + if (!ah) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + const struct ip_esp_hdr *eh; + struct ip_esp_hdr _esph; + + /* Max length: 10 "PROTO=ESP " */ + nf_log_buf_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_esph), &_esph); + if (!eh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & NF_LOG_UID) && !iphoff) + nf_log_dump_sk_uid_gid(net, m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 
10+max(25,20+30+13+9+35+11+127) = 255 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 255 = 806 */ +} + +static noinline_for_stack void +dump_ipv6_packet(struct net *net, struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + const struct ipv6hdr *ih; + unsigned int hdrlen = 0; + unsigned int logflags; + struct ipv6hdr _ip6h; + unsigned int ptr; + u8 currenthdr; + int fragment; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (!ih) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (!hp) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) 
" */ + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + nf_log_buf_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (!fh) { + nf_log_buf_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + nf_log_buf_add(m, "INCOMPLETE "); + + nf_log_buf_add(m, "ID:%08x ", + ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & NF_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + nf_log_buf_add(m, "AH "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (!ah) { + /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + } + + hdrlen = ipv6_authlen(hp); + break; + case IPPROTO_ESP: + if (logflags & NF_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + nf_log_buf_add(m, "ESP "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (!eh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + nf_log_buf_add(m, "SPI=0x%x )", + ntohl(eh->spi)); + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, + ptr, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + nf_log_buf_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (!ic) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", + ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + nf_log_buf_add(m, "POINTER=%08x ", + ntohl(ic->icmp6_pointer)); + fallthrough; + case ICMPV6_DEST_UNREACH: + case 
ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + nf_log_buf_add(m, "["); + dump_ipv6_packet(net, m, info, skb, + ptr + sizeof(_icmp6h), 0); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { + nf_log_buf_add(m, "MTU=%u ", + ntohl(ic->icmp6_mtu)); + } + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & NF_LOG_UID) && recurse) + nf_log_dump_sk_uid_gid(net, m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & NF_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p) { + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++) + nf_log_buf_add(m, ":%02x", *p++); + } + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + + nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr, + &iph->daddr); + } + } + nf_log_buf_add(m, " "); +} + +static void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + if (!nf_log_allowed(net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, + out, loginfo, prefix, net); + + if (in) + dump_mac_header(m, loginfo, skb); + + dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb)); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip_logger __read_mostly = { + .name = "nf_log_ipv4", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip_packet, + .me = THIS_MODULE, +}; + +static void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + if (!nf_log_allowed(net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, + loginfo, prefix, net); + + if (in) + dump_mac_header(m, loginfo, skb); + + dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip6_logger __read_mostly = { + .name = "nf_log_ipv6", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip6_packet, + .me = THIS_MODULE, +}; + +static void nf_log_unknown_packet(struct net *net, u_int8_t pf, + unsigned 
int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + if (!nf_log_allowed(net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix, net); + + dump_mac_header(m, loginfo, skb); + + nf_log_buf_close(m); +} + +static void nf_log_netdev_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_IPV6): + nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + default: + nf_log_unknown_packet(net, pf, hooknum, skb, + in, out, loginfo, prefix); + break; + } +} + +static struct nf_logger nf_netdev_logger __read_mostly = { + .name = "nf_log_netdev", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_netdev_packet, + .me = THIS_MODULE, +}; + +static struct nf_logger nf_bridge_logger __read_mostly = { + .name = "nf_log_bridge", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_netdev_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_syslog_net_init(struct net *net) +{ + int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); + + if (ret) + return ret; + + ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); + if (ret) + goto err1; + + ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); + if (ret) + goto err2; + + ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); + if (ret) + goto err3; + + ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret) + goto err4; + return 0; +err4: + nf_log_unset(net, &nf_netdev_logger); +err3: + nf_log_unset(net, &nf_ip6_logger); +err2: + nf_log_unset(net, &nf_arp_logger); +err1: + nf_log_unset(net, &nf_ip_logger); + return ret; +} + +static void __net_exit nf_log_syslog_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip_logger); + nf_log_unset(net, &nf_arp_logger); + nf_log_unset(net, &nf_ip6_logger); + nf_log_unset(net, &nf_netdev_logger); + nf_log_unset(net, &nf_bridge_logger); +} + +static struct pernet_operations nf_log_syslog_net_ops = { + .init = nf_log_syslog_net_init, + .exit = nf_log_syslog_net_exit, +}; + +static int __init nf_log_syslog_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_syslog_net_ops); + if (ret < 0) + return ret; + + ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + if (ret < 0) + goto err1; + + ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); + if (ret < 0) + goto err2; + + ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + if (ret < 0) + goto err3; + + ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); + if (ret < 0) + goto err4; + + ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret < 0) + goto err5; + + return 0; +err5: + nf_log_unregister(&nf_netdev_logger); +err4: + nf_log_unregister(&nf_ip6_logger); +err3: + nf_log_unregister(&nf_arp_logger); +err2: + nf_log_unregister(&nf_ip_logger); +err1: + pr_err("failed to register logger\n"); + unregister_pernet_subsys(&nf_log_syslog_net_ops); + return ret; +} + +static void __exit nf_log_syslog_exit(void) +{ + 
unregister_pernet_subsys(&nf_log_syslog_net_ops); + nf_log_unregister(&nf_ip_logger); + nf_log_unregister(&nf_arp_logger); + nf_log_unregister(&nf_ip6_logger); + nf_log_unregister(&nf_netdev_logger); + nf_log_unregister(&nf_bridge_logger); +} + +module_init(nf_log_syslog_init); +module_exit(nf_log_syslog_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter syslog packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf_log_arp"); +MODULE_ALIAS("nf_log_bridge"); +MODULE_ALIAS("nf_log_ipv4"); +MODULE_ALIAS("nf_log_ipv6"); +MODULE_ALIAS("nf_log_netdev"); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); +MODULE_ALIAS_NF_LOGGER(AF_INET, 0); +MODULE_ALIAS_NF_LOGGER(3, 0); +MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ +MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index 3bc7e0854efe..98deef6cde69 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -44,19 +44,7 @@ static unsigned int help(struct sk_buff *skb, exp->expectfn = nf_nat_follow_master; /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int res; - - exp->tuple.dst.u.tcp.port = htons(port); - res = nf_ct_expect_related(exp, 0); - if (res == 0) - break; - else if (res != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port)); if (port == 0) { nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c new file mode 100644 index 000000000000..481be15609b1 --- /dev/null +++ b/net/netfilter/nf_nat_bpf.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable NAT Helpers for XDP and TC-BPF hook + * + * These are called from the XDP and SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_nat.h> + +__bpf_kfunc_start_defs(); + +/* bpf_ct_set_nat_info - Set source or destination nat address + * + * Set source or destination nat address of the newly allocated + * nf_conn before insertion. This must be invoked for referenced + * PTR_TO_BTF_ID to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @addr - Nat source/destination address + * @port - Nat source/destination port. Non-positive values are + * interpreted as select a random port. + * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST + */ +__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + u16 proto = nf_ct_l3num(ct); + struct nf_nat_range2 range; + + if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6) + return -EINVAL; + + memset(&range, 0, sizeof(struct nf_nat_range2)); + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = *addr; + range.max_addr = range.min_addr; + if (port > 0) { + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = cpu_to_be16(port); + range.max_proto.all = range.min_proto.all; + } + + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? 
-ENOMEM : 0; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(nf_nat_kfunc_set) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(nf_nat_kfunc_set) + +static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_nat_kfunc_set, +}; + +int register_nf_nat_bpf(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_bpf_nat_kfunc_set); + if (ret) + return ret; + + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &nf_bpf_nat_kfunc_set); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index bfc555fcbc72..78a61dac4ade 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -13,10 +13,10 @@ #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/rtnetlink.h> -#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> @@ -27,6 +27,9 @@ #include "nf_internals.h" +#define NF_NAT_MAX_ATTEMPTS 128 +#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) + static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); @@ -34,7 +37,7 @@ static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; @@ -66,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } @@ -78,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } @@ -99,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } @@ -111,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } @@ -146,61 +145,69 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) return; } } - -int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) -{ - struct flowi fl; - unsigned int hh_len; - struct dst_entry *dst; - struct sock *sk = skb->sk; - int err; - - err = xfrm_decode_session(skb, &fl, family); - if (err < 0) - return err; - - dst = skb_dst(skb); - if (dst->xfrm) - dst = ((struct xfrm_dst *)dst)->route; - if (!dst_hold_safe(dst)) - return -EHOSTUNREACH; - - if (sk && !net_eq(net, sock_net(sk))) - sk = NULL; - - dst = xfrm_lookup(net, dst, &fl, sk, 0); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - skb_dst_drop(skb); - skb_dst_set(skb, 
dst); - - /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len && - pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; + struct { + struct nf_conntrack_man src; + u32 net_mix; + u32 protonum; + u32 zone; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + memset(&combined, 0, sizeof(combined)); + /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); + combined.src = tuple->src; + combined.net_mix = net_hash_mix(net); + combined.protonum = tuple->dst.protonum; + + /* Zone ID can be used provided its valid for both directions */ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) + combined.zone = zone->id; + + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } -/* Is this tuple already taken? (not by us) */ +/** + * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry + * @tuple: proposed NAT binding + * @ignored_conntrack: our (unconfirmed) conntrack entry + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple in either direction. + * + * Example: + * INITIATOR -> NAT/PAT -> RESPONDER + * + * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). + * Then, later, NAT/PAT itself also connects to RESPONDER. + * + * This will not work if the SNAT done earlier has same IP:PORT source pair. + * + * Conntrack table has: + * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * and new locally originating connection wants: + * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * ... which would mean incoming packets cannot be distinguished between + * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). + * + * @return: true if the proposed NAT mapping collides with an existing entry. + */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) @@ -217,6 +224,182 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_allow_clash(const struct nf_conn *ct) +{ + return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; +} + +/** + * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry + * @tuple: proposed NAT binding + * @ignored_ct: our (unconfirmed) conntrack entry + * + * Same as nf_nat_used_tuple, but also check for rare clash in reverse + * direction. Should be called only when @tuple has not been altered, i.e. + * @ignored_conntrack will not be subject to NAT. + * + * @return: true if the proposed NAT mapping collides with existing entry. 
+ */ +static noinline bool +nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_ct) +{ + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; + const struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + if (!nf_nat_used_tuple(tuple, ignored_ct)) + return false; + + if (!nf_nat_allow_clash(ignored_ct)) + return true; + + /* Initial choice clashes with existing conntrack. + * Check for (rare) reverse collision. + * + * This can happen when new packets are received in both directions + * at the exact same time on different CPUs. + * + * Without SMP, first packet creates new conntrack entry and second + * packet is resolved as established reply packet. + * + * With parallel processing, both packets could be picked up as + * new and both get their own ct entry allocated. + * + * If ignored_conntrack and colliding ct are not subject to NAT then + * pretend the tuple is available and let later clash resolution + * handle this at insertion time. + * + * Without it, the 'reply' packet has its source port rewritten + * by nat engine. + */ + if (READ_ONCE(ignored_ct->status) & uses_nat) + return true; + + net = nf_ct_net(ignored_ct); + zone = nf_ct_zone(ignored_ct); + + thash = nf_conntrack_find_get(net, zone, tuple); + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } + + ct = nf_ct_tuplehash_to_ctrack(thash); + + /* NB: IP_CT_DIR_ORIGINAL should be impossible because + * nf_nat_used_tuple() handles origin collisions. + * + * Handle remote chance other CPU confirmed its ct right after. + */ + if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) + goto out; + + /* clashing connection subject to NAT? Retry with new tuple. */ + if (READ_ONCE(ct->status) & uses_nat) + goto out; + + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && + nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { + taken = false; + goto out; + } +out: + nf_ct_put(ct); + return taken; +} + +static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) +{ + static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | + IPS_DYING; + static const unsigned long flags_needed = IPS_SRC_NAT; + enum tcp_conntrack old_state; + + old_state = READ_ONCE(ct->proto.tcp.state); + if (old_state < TCP_CONNTRACK_TIME_WAIT) + return false; + + if (flags & flags_refuse) + return false; + + return (flags & flags_needed) == flags_needed; +} + +/* reverse direction will send packets to new source, so + * make sure such packets are invalid. 
+ */ +static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) +{ + return (__s32)(new->proto.tcp.seen[0].td_end - + old->proto.tcp.seen[0].td_end) > 0; +} + +static int +nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack, + unsigned int attempts_left) +{ + static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; + struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple reply; + unsigned long flags; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + nf_ct_invert_tuple(&reply, tuple); + + if (attempts_left > NF_NAT_HARDER_THRESH || + tuple->dst.protonum != IPPROTO_TCP || + ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); + + /* Last few attempts to find a free tcp port. Destructive + * action: evict colliding entry if it's in timewait state and the + * tcp sequence number has advanced past the one used by the + * old entry. + */ + net = nf_ct_net(ignored_conntrack); + zone = nf_ct_zone(ignored_conntrack); + + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(thash); + + if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) + goto out; + + if (WARN_ON_ONCE(ct == ignored_conntrack)) + goto out; + + flags = READ_ONCE(ct->status); + if (!nf_nat_may_kill(ct, flags)) + goto out; + + if (!nf_seq_has_advanced(ct, ignored_conntrack)) + goto out; + + /* Even if we can evict do not reuse if entry is offloaded. */ + if (nf_ct_kill(ct)) + taken = flags & flags_offload; +out: + nf_ct_put(ct); + return taken; +} + static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { @@ -245,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: - case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; @@ -262,7 +444,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ -static int in_range(const struct nf_conntrack_tuple *tuple, +static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the @@ -299,7 +481,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { - unsigned int h = hash_by_src(net, tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { @@ -311,7 +493,7 @@ find_appropriate_src(struct net *net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; - if (in_range(result, range)) + if (nf_in_range(result, range)) return 1; } } @@ -405,10 +587,9 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; - static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { - case IPPROTO_ICMP: /* fallthrough */ + case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... 
*/ keyptr = &tuple->src.u.icmp.id; @@ -442,11 +623,10 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, } goto find_free_id; #endif - case IPPROTO_UDP: /* fallthrough */ - case IPPROTO_UDPLITE: /* fallthrough */ - case IPPROTO_TCP: /* fallthrough */ - case IPPROTO_SCTP: /* fallthrough */ - case IPPROTO_DCCP: /* fallthrough */ + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else @@ -487,12 +667,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); + else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || + maniptype != NF_NAT_MANIP_DST) + off = get_random_u16(); else - off = prandom_u32(); + off = 0; attempts = range_size; - if (attempts > max_attempts) - attempts = max_attempts; + if (attempts > NF_NAT_MAX_ATTEMPTS) + attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. @@ -503,14 +686,14 @@ find_free_id: another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); - if (!nf_nat_used_tuple(tuple, ct)) + if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; - off = prandom_u32(); + off = get_random_u16(); goto another_round; } @@ -543,8 +726,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ - if (in_range(orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { + if (nf_in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } @@ -569,8 +752,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; @@ -646,7 +829,7 @@ nf_nat_setup_info(struct nf_conn *ct, unsigned int srchash; spinlock_t *lock; - srchash = hash_by_src(net, + srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); @@ -719,6 +902,16 @@ unsigned int nf_nat_packet(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_nat_packet); +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -735,7 +928,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb, * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ - if (!ct) + if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); @@ -811,11 +1004,11 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 
1 : 0; } -static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); @@ -833,7 +1026,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) - __nf_nat_cleanup_conntrack(ct); + nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -841,20 +1034,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) return 0; } -/* No one using conntrack by the time this called. */ -static void nf_nat_cleanup_conntrack(struct nf_conn *ct) -{ - if (ct->status & IPS_SRC_NAT_DONE) - __nf_nat_cleanup_conntrack(ct); -} - -static struct nf_ct_ext_type nat_extend __read_mostly = { - .len = sizeof(struct nf_conn_nat), - .align = __alignof__(struct nf_conn_nat), - .destroy = nf_nat_cleanup_conntrack, - .id = NF_CT_EXT_NAT, -}; - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> @@ -911,10 +1090,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], range->flags |= NF_NAT_RANGE_MAP_IPS; } - if (tb[CTA_NAT_V4_MAXIP]) - range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); - else - range->max_addr.ip = range->min_addr.ip; + range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], + range->min_addr.ip); return 0; } @@ -1041,7 +1218,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); - nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); + nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; @@ -1140,12 +1317,12 @@ static struct pernet_operations nat_net_ops = { .size = sizeof(struct nat_net), }; -static struct nf_nat_hook nat_hook = { +static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif - .manip_pkt = nf_nat_manip_pkt, + .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) @@ -1161,19 +1338,12 @@ static int __init nf_nat_init(void) if (!nf_nat_bysource) return -ENOMEM; - ret = nf_ct_extend_register(&nat_extend); - if (ret < 0) { - kvfree(nf_nat_bysource); - pr_err("Unable to register extension\n"); - return ret; - } - for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { - nf_ct_extend_unregister(&nat_extend); + kvfree(nf_nat_bysource); return ret; } @@ -1182,7 +1352,16 @@ static int __init nf_nat_init(void) WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); - return 0; + ret = register_nf_nat_bpf(); + if (ret < 0) { + RCU_INIT_POINTER(nf_nat_hook, NULL); + nf_ct_helper_expectfn_unregister(&follow_master_nat); + synchronize_net(); + unregister_pernet_subsys(&nat_net_ops); + kvfree(nf_nat_bysource); + } + + return ret; } static void __exit nf_nat_cleanup(void) @@ -1191,7 +1370,6 @@ static void __exit 
nf_nat_cleanup(void) nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); - nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); @@ -1201,6 +1379,7 @@ static void __exit nf_nat_cleanup(void) } MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index aace6768a64e..c92a436d9c48 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -86,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, * this one. */ exp->expectfn = nf_nat_follow_master; - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port)); if (port == 0) { - nf_ct_helper_log(skb, ct, "all ports in use"); + nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; } diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index a263505455fc..bf591e6af005 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -198,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct, nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } EXPORT_SYMBOL(nf_nat_follow_master); + +u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port) +{ + static const unsigned int max_attempts = 128; + int range, attempts_left; + u16 min = port; + + range = USHRT_MAX - port; + attempts_left = range; + + if (attempts_left > max_attempts) + attempts_left = max_attempts; + + /* Try to get same port: if not, try to change it. */ + for (;;) { + int res; + + exp->tuple.dst.u.tcp.port = htons(port); + res = nf_ct_expect_related(exp, 0); + if (res == 0) + return port; + + if (res != -EBUSY || (--attempts_left < 0)) + break; + + port = min + get_random_u32_below(range); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_exp_find_port); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index c691ab8d234c..19c4fcc60c50 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -48,20 +48,8 @@ static unsigned int help(struct sk_buff *skb, exp->dir = IP_CT_DIR_REPLY; exp->expectfn = nf_nat_follow_master; - /* Try to get same port: if not, try to change it. 
*/ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, + ntohs(exp->saved_proto.tcp.port)); if (port == 0) { nf_ct_helper_log(skb, ct, "all ports in use"); return NF_DROP; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..1a506b0c6511 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,20 @@ #include <net/netfilter/nf_nat_masquerade.h> +struct masq_dev_work { + struct work_struct work; + struct net *net; + netns_tracker ns_tracker; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,13 +75,75 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); -static int device_cmp(struct nf_conn *i, void *ifindex) +static void iterate_cleanup_work(struct work_struct *work) +{ + struct nf_ct_iter_data iter_data = {}; + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + iter_data.net = w->net; + iter_data.data = (void *)w; + nf_ct_iterate_cleanup_net(w->iter, &iter_data); + + put_net_track(w->net, &w->ns_tracker); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. + */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + netns_tracker_alloc(net, &w->ns_tracker, gfp_flags); + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -85,8 +159,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. 
*/ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -94,35 +168,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -136,8 +220,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,40 +269,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. 
@@ -233,36 +281,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - - if (!try_module_get(THIS_MODULE)) - goto err_module; - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); + memset(&addr, 0, sizeof(addr)); - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); + addr.in6 = ifa->addr; - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c new file mode 100644 index 000000000000..0f9a559f6207 --- /dev/null +++ b/net/netfilter/nf_nat_ovs.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support nat functions for openvswitch and used by OVS and TC conntrack. */ + +#include <net/netfilter/nf_nat.h> + +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + __be16 proto = skb_protocol(skb, true); + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (proto == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + fallthrough; + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? 
nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +out: + if (err == NF_ACCEPT) + *action |= BIT(maniptype); + + return err; +} + +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit) +{ + enum nf_nat_manip_type maniptype; + int err, ct_action = *action; + + *action = 0; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. */ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & BIT(NF_NAT_MANIP_DST)) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL, + NF_NAT_MANIP_SRC); + } + } + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_nat); diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 59151dc07fdc..b14a434b9561 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -180,46 +180,6 @@ tcp_manip_pkt(struct sk_buff *skb, } static bool -dccp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct dccp_hdr *hdr; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (skb_ensure_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - false); -#endif - return true; -} - -static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, @@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb, case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); 
- case IPPROTO_DCCP: - return dccp_manip_pkt(skb, iphdroff, hdroff, - tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); @@ -646,8 +603,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, } static unsigned int -nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; @@ -659,6 +616,98 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb, return ret; } +#ifdef CONFIG_XFRM +static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) +{ + struct sock *sk = skb->sk; + struct dst_entry *dst; + unsigned int hh_len; + struct flowi fl; + int err; + + err = xfrm_decode_session(net, skb, &fl, family); + if (err < 0) + return err; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + if (!dst_hold_safe(dst)) + return -EHOSTUNREACH; + + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; +} +#endif + +static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) +{ + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + return false; + } + + dir = CTINFO2DIR(ctinfo); + if (dir != IP_CT_DIR_ORIGINAL) + return false; + + return ct->tuplehash[!dir].tuple.dst.u.all != sport; +} + +static unsigned int +nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + __be32 saddr = ip_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv4_fn(priv, skb, state); + + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* skb has a socket assigned via tcp edemux. We need to check + * if nf_nat_ipv4_fn() has mangled the packet in a way that + * edemux would not have found this socket. + * + * This includes both changes to the source address and changes + * to the source port, which are both handled by the + * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux + * might have found a socket for the old (pre-snat) address. 
+ */ + if (saddr != ip_hdr(skb)->saddr || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) + skb_orphan(skb); /* TCP edemux obtained wrong socket */ + + return ret; +} + static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -715,7 +764,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -736,7 +785,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -757,7 +806,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, @@ -883,14 +932,36 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, } static unsigned int +nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct in6_addr saddr = ipv6_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv6_fn(priv, skb, state); + + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* see nf_nat_ipv4_local_in */ + if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) + skb_orphan(skb); + + return ret; +} + +static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - unsigned int ret; + unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); - if (ret != NF_DROP && ret != NF_STOLEN && + verdict = ret & NF_VERDICT_MASK; + if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -953,7 +1024,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -996,7 +1067,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index f91579c821e9..5b37487d9d11 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -10,6 +10,7 @@ #include <linux/if.h> #include <linux/inetdevice.h> +#include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -24,81 +25,104 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> +static unsigned int +nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range, + const union nf_inet_addr *newdst) +{ + struct nf_nat_range2 newrange; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + memset(&newrange, 0, sizeof(newrange)); + + 
newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr = *newdst; + newrange.max_addr = *newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, +nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - struct nf_nat_range2 newrange; + union nf_inet_addr newdst = {}; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); - ct = nf_ct_get(skb, &ctinfo); - WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); - /* Local packets: make them go to loopback */ if (hooknum == NF_INET_LOCAL_OUT) { - newdst = htonl(0x7F000001); + newdst.ip = htonl(INADDR_LOOPBACK); } else { const struct in_device *indev; - newdst = 0; - indev = __in_dev_get_rcu(skb->dev); if (indev) { const struct in_ifaddr *ifa; ifa = rcu_dereference(indev->ifa_list); if (ifa) - newdst = ifa->ifa_local; + newdst.ip = ifa->ifa_local; } - if (!newdst) + if (!newdst.ip) return NF_DROP; } - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newdst; - newrange.max_addr.ip = newdst; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; +static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope) +{ + unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr); + + if (ifa_addr_type & IPV6_ADDR_MAPPED) + return false; + + if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC))) + return false; + + if (scope) { + unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK; + + if (!(scope & ifa_scope)) + return false; + } + + return true; +} + unsigned int nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range2 newrange; - struct in6_addr newdst; - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; + union nf_inet_addr newdst = {}; - ct = nf_ct_get(skb, &ctinfo); if (hooknum == NF_INET_LOCAL_OUT) { - newdst = loopback_addr; + newdst.in6 = loopback_addr; } else { + unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; bool addr = false; idev = __in6_dev_get(skb->dev); if (idev != NULL) { + const struct inet6_ifaddr *ifa; + read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - newdst = ifa->addr; + if (!nf_nat_redirect_ipv6_usable(ifa, scope)) + continue; + + newdst.in6 = ifa->addr; addr = true; break; } @@ -109,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, return NF_DROP; } - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = newdst; - newrange.max_addr.in6 = newdst; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return 
nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index f0a735e86851..cf4aeb299bde 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -410,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, exp->dir = !dir; exp->expectfn = nf_nat_sip_expected; - for (; port != 0; port++) { - int ret; - - exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, port); if (port == 0) { nf_ct_helper_log(skb, ct, "all ports in use for SIP"); return NF_DROP; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index bbd1209694b8..7f12e56e6e52 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -21,6 +21,8 @@ #include "nf_internals.h" +static const struct nf_queue_handler __rcu *nf_queue_handler; + /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. @@ -29,40 +31,43 @@ * receives, no matter what. */ -/* return EBUSY when somebody else is registered, return EEXIST if the - * same handler is registered, return 0 in case of success. */ -void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ - WARN_ON(rcu_access_pointer(net->nf.queue_handler)); - rcu_assign_pointer(net->nf.queue_handler, qh); + WARN_ON(rcu_access_pointer(nf_queue_handler)); + rcu_assign_pointer(nf_queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -void nf_unregister_queue_handler(struct net *net) +void nf_unregister_queue_handler(void) { - RCU_INIT_POINTER(net->nf.queue_handler, NULL); + RCU_INIT_POINTER(nf_queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. 
*/ - if (state->in) - dev_put(state->in); - if (state->out) - dev_put(state->out); + dev_put(state->in); + dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_put(entry->physin); - if (entry->physout) - dev_put(entry->physout); + dev_put(entry->physin); + dev_put(entry->physout); #endif } @@ -77,11 +82,9 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; - struct nf_bridge_info *nf_bridge; - nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge) { - entry->physin = nf_bridge_get_physindev(skb); + if (nf_bridge_info_exists(skb)) { + entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; @@ -91,23 +94,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; - if (state->in) - dev_hold(state->in); - if (state->out) - dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + + dev_hold(state->in); + dev_hold(state->out); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_hold(entry->physin); - if (entry->physout) - dev_hold(entry->physout); + dev_hold(entry->physin); + dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -116,7 +117,7 @@ void nf_queue_nf_hook_drop(struct net *net) const struct nf_queue_handler *qh; rcu_read_lock(); - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (qh) qh->nf_hook_drop(net); rcu_read_unlock(); @@ -157,12 +158,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, { struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; - struct net *net = state->net; unsigned int route_key_size; int status; /* QUEUE == DROP if no one is waiting, to be safe. 
*/ - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (!qh) return -ESRCH; @@ -178,6 +178,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; @@ -196,7 +208,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: @@ -233,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, return 0; } EXPORT_SYMBOL_GPL(nf_queue); - -static unsigned int nf_iterate(struct sk_buff *skb, - struct nf_hook_state *state, - const struct nf_hook_entries *hooks, - unsigned int *index) -{ - const struct nf_hook_entry *hook; - unsigned int verdict, i = *index; - - while (i < hooks->num_hook_entries) { - hook = &hooks->hooks[i]; -repeat: - verdict = nf_hook_entry_hookfn(hook, skb, state); - if (verdict != NF_ACCEPT) { - *index = i; - if (verdict != NF_REPEAT) - return verdict; - goto repeat; - } - i++; - } - - *index = i; - return NF_ACCEPT; -} - -static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) -{ - switch (pf) { -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); -#endif - case NFPROTO_IPV4: - return rcu_dereference(net->nf.hooks_ipv4[hooknum]); - case NFPROTO_IPV6: - return rcu_dereference(net->nf.hooks_ipv6[hooknum]); - default: - WARN_ON_ONCE(1); - return NULL; - } - - return NULL; -} - -/* Caller must hold rcu read-side lock */ -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) -{ - const struct nf_hook_entry *hook_entry; - const struct nf_hook_entries *hooks; - struct sk_buff *skb = entry->skb; - const struct net *net; - unsigned int i; - int err; - u8 pf; - - net = entry->state.net; - pf = entry->state.pf; - - hooks = nf_hook_entries_head(net, pf, entry->state.hook); - - i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { - kfree_skb(skb); - nf_queue_entry_free(entry); - return; - } - - hook_entry = &hooks->hooks[i]; - - /* Continue traversal iff userspace said ok... 
*/ - if (verdict == NF_REPEAT) - verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); - - if (verdict == NF_ACCEPT) { - if (nf_reroute(skb, entry) < 0) - verdict = NF_DROP; - } - - if (verdict == NF_ACCEPT) { -next_hook: - ++i; - verdict = nf_iterate(skb, &entry->state, hooks, &i); - } - - switch (verdict & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_STOP: - local_bh_disable(); - entry->state.okfn(entry->state.net, entry->state.sk, skb); - local_bh_enable(); - break; - case NF_QUEUE: - err = nf_queue(skb, &entry->state, i, verdict); - if (err == 1) - goto next_hook; - break; - case NF_STOLEN: - break; - default: - kfree_skb(skb); - } - - nf_queue_entry_free(entry); -} -EXPORT_SYMBOL(nf_reinject); diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 46cb3786e0ec..34afcd03b6f6 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -89,78 +89,32 @@ out: return ops; } -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt, + unsigned int len) { struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 0); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) - ret = ops->get(sk, val, opt, len); - else - ret = ops->set(sk, val, opt, *len); - + ret = ops->set(sk, val, opt, len); module_put(ops->owner); return ret; } - -int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, - unsigned int len) -{ - return nf_sockopt(sk, pf, val, opt, &len, 0); -} EXPORT_SYMBOL(nf_setsockopt); int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { - return nf_sockopt(sk, pf, val, opt, len, 1); -} -EXPORT_SYMBOL(nf_getsockopt); - -#ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) -{ struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 1); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) { - if (ops->compat_get) - ret = ops->compat_get(sk, val, opt, len); - else - ret = ops->get(sk, val, opt, len); - } else { - if (ops->compat_set) - ret = ops->compat_set(sk, val, opt, *len); - else - ret = ops->set(sk, val, opt, *len); - } - + ret = ops->get(sk, val, opt, len); module_put(ops->owner); return ret; } - -int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, unsigned int len) -{ - return compat_nf_sockopt(sk, pf, val, opt, &len, 0); -} -EXPORT_SYMBOL(compat_nf_setsockopt); - -int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, int *len) -{ - return compat_nf_sockopt(sk, pf, val, opt, len, 1); -} -EXPORT_SYMBOL(compat_nf_getsockopt); -#endif +EXPORT_SYMBOL(nf_getsockopt); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b9cbe1e2453e..3fa3f5dfb264 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> #include <linux/proc_fs.h> @@ -31,6 +31,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; + if (unlikely(length < 0)) + return false; + ptr = skb_header_pointer(skb, doff + sizeof(*th), 
length, buf); if (ptr == NULL) return false; @@ -47,6 +50,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length--; continue; default: + if (length < 2) + return true; opsize = *ptr++; if (opsize < 2) return true; @@ -148,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp_raw() & ~0x3f; + opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; @@ -231,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, return 1; } -static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { - .len = sizeof(struct nf_conn_synproxy), - .align = __alignof__(struct nf_conn_synproxy), - .id = NF_CT_EXT_SYNPROXY, -}; - #ifdef CONFIG_PROC_FS static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) { @@ -344,7 +343,6 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -383,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = { static int __init synproxy_core_init(void) { - int err; - - err = nf_ct_extend_register(&nf_ct_synproxy_extend); - if (err < 0) - goto err1; - - err = register_pernet_subsys(&synproxy_net_ops); - if (err < 0) - goto err2; - - return 0; - -err2: - nf_ct_extend_unregister(&nf_ct_synproxy_extend); -err1: - return err; + return register_pernet_subsys(&synproxy_net_ops); } static void __exit synproxy_core_exit(void) { unregister_pernet_subsys(&synproxy_net_ops); - nf_ct_extend_unregister(&nf_ct_synproxy_extend); } module_init(synproxy_core_init); @@ -423,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; @@ -446,7 +428,7 @@ synproxy_send_tcp(struct net *net, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { @@ -635,7 +617,7 @@ synproxy_recv_client_ack(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = __cookie_v4_check(ip_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; @@ -704,8 +686,7 @@ ipv4_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; @@ -819,7 +800,7 @@ synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit); iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; @@ -850,7 +831,7 @@ synproxy_send_tcp_ipv6(struct net *net, fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, - flowi6_to_flowi(&fl6)); + 
flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; @@ -1053,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = nf_cookie_v6_check(ipv6_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; @@ -1128,8 +1109,7 @@ ipv6_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; @@ -1237,3 +1217,4 @@ EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("nftables SYNPROXY expression support"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 073aa1051d43..f3de2f9bbebf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -12,6 +12,7 @@ #include <linux/netlink.h> #include <linux/vmalloc.h> #include <linux/rhashtable.h> +#include <linux/audit.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> @@ -23,13 +24,19 @@ #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +#define NFT_SET_MAX_ANONLEN 16 + +/* limit compaction to avoid huge kmalloc/krealloc sizes. */ +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem)) + +unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); -static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); -static u64 table_handle; +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -65,9 +72,45 @@ static const struct rhashtable_params nft_objname_ht_params = { .automatic_shrinking = true, }; -static void nft_validate_state_update(struct net *net, u8 new_validate_state) +struct nft_audit_data { + struct nft_table *table; + int entries; + int op; + struct list_head list; +}; + +static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types + [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER, + [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER, + [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER, + [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER, + [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER, + [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER, + [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER, + [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER, + [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER, + [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER, + [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER, + [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER, + [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER, + [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET, + 
[NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER, + [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, + [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET, +}; + +static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state) { - switch (net->nft.validate_state) { + switch (table->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; @@ -78,10 +121,12 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) return; } - net->nft.validate_state = new_validate_state; + table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); -static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); + +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, @@ -102,108 +147,224 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; + + bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } -static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, - int msg_type, u32 size, gfp_t gfp) +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) { struct nft_trans *trans; - trans = kzalloc(sizeof(struct nft_trans) + size, gfp); + trans = kzalloc(size, GFP_KERNEL); if (trans == NULL) return NULL; + INIT_LIST_HEAD(&trans->list); trans->msg_type = msg_type; - trans->ctx = *ctx; + + trans->net = ctx->net; + trans->table = ctx->table; + trans->seq = ctx->seq; + trans->flags = ctx->flags; + trans->report = ctx->report; return trans; } -static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, - int msg_type, u32 size) +static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { - return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + case NFT_MSG_NEWSET: + return container_of(trans, struct nft_trans_binding, nft_trans); + } + + return NULL; } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { + struct nft_trans_binding *trans_binding; + list_del(&trans->list); + + trans_binding = nft_trans_get_binding(trans); + if (trans_binding) + list_del(&trans_binding->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; - list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + 
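[Editor's note — illustrative sketch, not part of the patch above.] The __nft_set_trans_bind() helper in the hunk above walks the per-netns commit list in reverse and flips the "bound" flag on every pending NEWSET/NEWSETELEM transaction that refers to the same anonymous set, so a later abort path knows whether the set was already attached to a rule. The following is a minimal, self-contained userspace C sketch of that reverse-walk-and-mark pattern; the struct layout, names, and the singly linked "newest to oldest" list are simplifications invented for the example and do not mirror the real struct nft_trans or kernel list API.

/* Toy model: transactions chained newest -> oldest, marked bound/unbound. */
#include <stdbool.h>
#include <stdio.h>

enum msg_type { MSG_NEWSET, MSG_NEWSETELEM, MSG_OTHER };

struct trans {
	enum msg_type type;
	const void *set;	/* which set this transaction touches */
	bool bound;
	struct trans *prev;	/* walk direction: newest -> oldest */
};

/* Mark every pending transaction that refers to @set as bound/unbound,
 * mirroring the "walk commit list in reverse and set the flag" idea.
 */
static void set_trans_bind(struct trans *newest, const void *set, bool bind)
{
	for (struct trans *t = newest; t; t = t->prev) {
		if ((t->type == MSG_NEWSET || t->type == MSG_NEWSETELEM) &&
		    t->set == set)
			t->bound = bind;
	}
}

int main(void)
{
	int set_a;	/* stand-in for an anonymous set object */
	struct trans t1 = { MSG_NEWSET,     &set_a, false, NULL };
	struct trans t2 = { MSG_NEWSETELEM, &set_a, false, &t1 };

	set_trans_bind(&t2, &set_a, true);	/* a rule now references the set */
	printf("t1 bound=%d t2 bound=%d\n", t1.bound, t2.bound);
	return 0;
}

The same helper called with bind=false is what the unbind path uses, which is why the patch routes both nft_set_trans_bind() and nft_set_trans_unbind() through one function.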
+static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = bind; + break; + case NFT_MSG_NEWRULE: + if (nft_trans_rule_chain(trans) == chain) + nft_trans_rule_bound(trans) = bind; + break; + } + } +} + +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + if (!nft_use_inc(&chain->use)) + return -EMFILE; + + chain->bound = true; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } -static void nft_netdev_unregister_hooks(struct net *net, - struct list_head *hook_list) +static void nft_netdev_hook_free_ops(struct nft_hook *hook) { - struct nft_hook *hook; + struct nf_hook_ops *ops, *next; - list_for_each_entry(hook, hook_list, list) - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry_safe(ops, next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } } -static int nft_register_basechain_hooks(struct net *net, int family, - struct nft_base_chain *basechain) +static void nft_netdev_hook_free(struct nft_hook *hook) { - if (family == NFPROTO_NETDEV) - return nft_netdev_register_hooks(net, &basechain->hook_list); + nft_netdev_hook_free_ops(hook); + kfree(hook); +} - return nf_register_net_hook(net, &basechain->ops); +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); } -static void nft_unregister_basechain_hooks(struct net *net, int family, - struct nft_base_chain *basechain) +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) { - if (family == NFPROTO_NETDEV) - nft_netdev_unregister_hooks(net, &basechain->hook_list); - else - nf_unregister_net_hook(net, &basechain->ops); + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list, + bool release_netdev) +{ + struct nft_hook *hook, *next; + struct 
nf_hook_ops *ops; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + if (release_netdev) { + list_del(&hook->list); + nft_netdev_hook_free_rcu(hook); + } + } } static int nf_tables_register_hook(struct net *net, @@ -223,12 +384,16 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nft_register_basechain_hooks(net, table->family, basechain); + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); } -static void nf_tables_unregister_hook(struct net *net, - const struct nft_table *table, - struct nft_chain *chain) +static void __nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain, + bool release_netdev) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; @@ -242,7 +407,144 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nft_unregister_basechain_hooks(net, table->family, basechain); + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + nft_netdev_unregister_hooks(net, &basechain->hook_list, + release_netdev); + else + nf_unregister_net_hook(net, &basechain->ops); +} + +static void nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain) +{ + return __nf_tables_unregister_hook(net, table, chain, false); +} + +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b) +{ + /* NB: the ->bound equality check is defensive, at this time we only merge + * a new nft_trans_elem transaction request with the transaction tail + * element, but a->bound != b->bound would imply a NEWRULE transaction + * is queued in-between. + * + * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents + * huge krealloc() requests. + */ + return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS; +} + +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, + struct nft_trans_elem *tail, + struct nft_trans_elem *trans) +{ + unsigned int nelems, old_nelems = tail->nelems; + struct nft_trans_elem *new_trans; + + if (!nft_trans_collapse_set_elem_allowed(tail, trans)) + return false; + + /* "cannot happen", at this time userspace element add + * requests always allocate a new transaction element. + * + * This serves as a reminder to adjust the list_add_tail + * logic below in case this ever changes. + */ + if (WARN_ON_ONCE(trans->nelems != 1)) + return false; + + if (check_add_overflow(old_nelems, trans->nelems, &nelems)) + return false; + + /* krealloc might free tail which invalidates list pointers */ + list_del_init(&tail->nft_trans.list); + + new_trans = krealloc(tail, struct_size(tail, elems, nelems), + GFP_KERNEL); + if (!new_trans) { + list_add_tail(&tail->nft_trans.list, + &nft_net->commit_list); + return false; + } + + /* + * new_trans->nft_trans.list contains garbage, but + * list_add_tail() doesn't care. 
+ */ + new_trans->nelems = nelems; + new_trans->elems[old_nelems] = trans->elems[0]; + list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list); + + return true; +} + +static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, + struct nft_trans *trans) +{ + struct nft_trans *tail; + + if (list_empty(&nft_net->commit_list)) + return false; + + tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list); + + if (tail->msg_type != trans->msg_type) + return false; + + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_collapse_set_elem(nft_net, + nft_trans_container_elem(tail), + nft_trans_container_elem(trans)); + } + + return false; +} + +static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + struct nft_trans_binding *binding; + struct nft_trans_set *trans_set; + + list_add_tail(&trans->list, &nft_net->commit_list); + + binding = nft_trans_get_binding(trans); + if (!binding) + return; + + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + trans_set = nft_trans_container_set(trans); + + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&binding->binding_list, &nft_net->binding_list); + + list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list); + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&binding->binding_list, &nft_net->binding_list); + break; + } +} + +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + + WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && + trans->msg_type != NFT_MSG_DELSETELEM); + + if (nft_trans_try_collapse(nft_net, trans)) { + kfree(trans); + return; + } + + nft_trans_commit_list_add_tail(net, trans); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -256,7 +558,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -272,18 +574,41 @@ static int nft_deltable(struct nft_ctx *ctx) return err; } -static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +static struct nft_trans * +nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type) { + struct nft_trans_chain *trans_chain; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (!trans) + return NULL; + + trans_chain = nft_trans_container_chain(trans); + INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list); + trans_chain->chain = ctx->chain; + + return trans; +} + +static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc_chain(ctx, msg_type); if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + nft_trans_commit_list_add_tail(ctx->net, trans); + return trans; } @@ -295,19 +620,18 @@ static int nft_delchain(struct nft_ctx *ctx) if 
(IS_ERR(trans)) return PTR_ERR(trans); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; } -static void nft_rule_expr_activate(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); @@ -315,14 +639,13 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } } -static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); @@ -336,7 +659,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; @@ -356,7 +679,8 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_rule_chain(trans) = ctx->chain; + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -407,26 +731,98 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, - struct nft_set *set) +static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set, + const struct nft_set_desc *desc) { + struct nft_trans_set *trans_set; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); if (trans == NULL) return -ENOMEM; - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + trans_set = nft_trans_container_set(trans); + INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); + INIT_LIST_HEAD(&trans_set->list_trans_newset); + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + if (desc) { + nft_trans_set_update(trans) = true; + nft_trans_set_gc_int(trans) = desc->gc_int; + nft_trans_set_timeout(trans) = desc->timeout; + nft_trans_set_size(trans) = desc->size; + } + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + return __nft_trans_set_add(ctx, msg_type, set, NULL); +} + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct 
rcu_head rcu; + struct nft_elem_priv *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + nft_set_elem_change_active(ctx->net, set, ext); + nft_setelem_data_deactivate(ctx->net, set, catchall->elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -435,8 +831,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -454,7 +853,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -468,64 +867,125 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; nft_deactivate_next(ctx->net, obj); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } -static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, - struct nft_flowtable *flowtable) +static struct nft_trans * +nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_trans_flowtable(trans) = flowtable; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); - return 0; + return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { - int err; + struct nft_trans *trans; - err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); - if (err < 0) - return err; + trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (IS_ERR(trans)) + return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); - ctx->table->use--; + nft_use_dec(&ctx->table->use); - return err; + return 0; } +static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) +{ + int i; + + for (i = track->regs[dreg].num_reg; i > 0; i--) + __nft_reg_track_cancel(track, dreg - i); +} + +static void __nft_reg_track_update(struct nft_regs_track *track, + const struct nft_expr *expr, + u8 dreg, u8 num_reg) +{ + track->regs[dreg].selector = expr; + track->regs[dreg].bitwise = NULL; + track->regs[dreg].num_reg = num_reg; +} + +void nft_reg_track_update(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg, u8 len) +{ + unsigned int regcount; + int i; + + __nft_reg_track_clobber(track, dreg); + + regcount = 
DIV_ROUND_UP(len, NFT_REG32_SIZE); + for (i = 0; i < regcount; i++, dreg++) + __nft_reg_track_update(track, expr, dreg, i); +} +EXPORT_SYMBOL_GPL(nft_reg_track_update); + +void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len) +{ + unsigned int regcount; + int i; + + __nft_reg_track_clobber(track, dreg); + + regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); + for (i = 0; i < regcount; i++, dreg++) + __nft_reg_track_cancel(track, dreg); +} +EXPORT_SYMBOL_GPL(nft_reg_track_cancel); + +void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg) +{ + track->regs[dreg].selector = NULL; + track->regs[dreg].bitwise = NULL; + track->regs[dreg].num_reg = 0; +} +EXPORT_SYMBOL_GPL(__nft_reg_track_cancel); + /* * Tables */ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, - u8 family, u8 genmask) + u8 family, u8 genmask, u32 nlpid) { + struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list, - lockdep_is_held(&net->nft.commit_mutex)) { + nft_net = nft_pernet(net); + list_for_each_entry_rcu(table, &nft_net->tables, list, + lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + nlpid && table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -533,14 +993,22 @@ static struct nft_table *nft_table_lookup(const struct net *net, static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, - u8 genmask) + int family, u8 genmask, u32 nlpid) { + struct nftables_pernet *nft_net; struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = nft_pernet(net); + list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && - nft_active_genmask(table, genmask)) + table->family == family && + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + nlpid && table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -586,9 +1054,11 @@ struct nft_module_request { }; #ifdef CONFIG_MODULES -static int nft_request_module(struct net *net, const char *fmt, ...) +__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, + ...) { char module_name[MODULE_NAME_LEN]; + struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; @@ -599,7 +1069,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...) if (ret >= MODULE_NAME_LEN) return 0; - list_for_each_entry(req, &net->nft.module_list, list) { + nft_net = nft_pernet(net); + list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; @@ -614,17 +1085,19 @@ static int nft_request_module(struct net *net, const char *fmt, ...) 
return -ENOMEM; req->done = false; - strlcpy(req->module, module_name, MODULE_NAME_LEN); - list_add_tail(&req->list, &net->nft.module_list); + strscpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } +EXPORT_SYMBOL_GPL(nft_request_module); #endif static void lockdep_nfnl_nft_mutex_not_held(void) { #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + if (debug_locks) + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); #endif } @@ -650,11 +1123,23 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } +static unsigned int nft_base_seq(const struct net *net) +{ + return READ_ONCE(net->nft.base_seq); +} + +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); +} + static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN } }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -662,25 +1147,38 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || - nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; + if (event == NFT_MSG_DELTABLE || + event == NFT_MSG_DESTROYTABLE) { + nlmsg_end(skb, nlh); + return 0; + } + + if (nla_put_be32(skb, NFTA_TABLE_FLAGS, + htonl(table->flags & NFT_TABLE_F_MASK))) + goto nla_put_failure; + + if (nft_table_has_owner(table) && + nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) + goto nla_put_failure; + + if (table->udata) { + if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) + goto nla_put_failure; + } + nlmsg_end(skb, nlh); return 0; @@ -689,9 +1187,23 @@ nla_put_failure: return -1; } +struct nftnl_skb_parms { + bool report; +}; +#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb)) + +static void nft_notify_enqueue(struct sk_buff *skb, bool report, + struct list_head *notify_list) +{ + NFT_CB(skb).report = report; + list_add_tail(&skb->list, notify_list); +} + static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { + struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -702,15 +1214,18 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table); + 
event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_net = nft_pernet(ctx->net); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -720,15 +1235,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -774,28 +1291,27 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, } /* called with rcu_read_lock held */ -static int nf_tables_gettable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); @@ -806,14 +1322,14 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, - family, table); + info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, + 0, family, table); if (err < 0) - goto err; + goto err_fill_table_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_table_info: kfree_skb(skb2); return err; } @@ -832,8 +1348,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nft_unregister_basechain_hooks(net, table->family, - nft_base_chain(chain)); + nf_tables_unregister_hook(net, table, chain); } } @@ -848,8 +1363,7 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nft_register_basechain_hooks(net, table->family, - nft_base_chain(chain)); + err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err_register_hooks; @@ -865,25 +1379,68 @@ err_register_hooks: static void nf_tables_table_disable(struct net *net, struct nft_table *table) { + 
table->flags &= ~NFT_TABLE_F_DORMANT; nft_table_disable(net, table, 0); + table->flags |= NFT_TABLE_F_DORMANT; +} + +#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) +#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) +#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) +#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) +#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ + __NFT_TABLE_F_WAS_AWAKEN | \ + __NFT_TABLE_F_WAS_ORPHAN) + +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(nft_trans_chain(trans))))) + return true; + } + + return false; } static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; u32 flags; - int ret = 0; + int ret; if (!ctx->nla[NFTA_TABLE_FLAGS]) return 0; flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; - if (flags == ctx->table->flags) + if (flags == (ctx->table->flags & NFT_TABLE_F_MASK)) return 0; + if ((nft_table_has_owner(ctx->table) && + !(flags & NFT_TABLE_F_OWNER)) || + (flags & NFT_TABLE_F_OWNER && + !nft_table_is_orphan(ctx->table))) + return -EOPNOTSUPP; + + if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) + return -EOPNOTSUPP; + + /* No dormant off/on/off/on games in single transaction */ + if (nft_table_pending_update(ctx)) + return -EINVAL; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -891,22 +1448,35 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { - nft_trans_table_enable(trans) = false; + ctx->table->flags |= NFT_TABLE_F_DORMANT; + if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) + ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->net, ctx->table); - if (ret >= 0) { - ctx->table->flags &= ~NFT_TABLE_F_DORMANT; - nft_trans_table_enable(trans) = true; + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) { + ret = nf_tables_table_enable(ctx->net, ctx->table); + if (ret < 0) + goto err_register_hooks; + + ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT; } } - if (ret < 0) - goto err; + + if ((flags & NFT_TABLE_F_OWNER) && + !nft_table_has_owner(ctx->table)) { + ctx->table->nlpid = ctx->portid; + ctx->table->flags |= NFT_TABLE_F_OWNER | + __NFT_TABLE_F_WAS_ORPHAN; + } nft_trans_table_update(trans) = true; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); + return 0; -err: + +err_register_hooks: + ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } @@ -962,53 +1532,91 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(obj->key.name, k->name); } -static int nf_tables_newtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static 
bool nft_supported_family(u8 family) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + return false +#ifdef CONFIG_NF_TABLES_INET + || family == NFPROTO_INET +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + || family == NFPROTO_IPV4 +#endif +#ifdef CONFIG_NF_TABLES_ARP + || family == NFPROTO_ARP +#endif +#ifdef CONFIG_NF_TABLES_NETDEV + || family == NFPROTO_NETDEV +#endif +#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) + || family == NFPROTO_BRIDGE +#endif +#ifdef CONFIG_NF_TABLES_IPV6 + || family == NFPROTO_IPV6 +#endif + ; +} + +static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; - u32 flags = 0; struct nft_ctx ctx; + u32 flags = 0; int err; - lockdep_assert_held(&net->nft.commit_mutex); + if (!nft_supported_family(family)) + return -EOPNOTSUPP; + + lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nf_tables_updtable(&ctx); } if (nla[NFTA_TABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; } err = -ENOMEM; - table = kzalloc(sizeof(*table), GFP_KERNEL); + table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT); if (table == NULL) goto err_kzalloc; - table->name = nla_strdup(attr, GFP_KERNEL); + table->validate_state = nft_net->validate_state; + table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (table->name == NULL) goto err_strdup; + if (nla[NFTA_TABLE_USERDATA]) { + table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT); + if (table->udata == NULL) + goto err_table_udata; + + table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]); + } + err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; @@ -1019,18 +1627,22 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&table->flowtables); table->family = family; table->flags = flags; - table->handle = ++table_handle; + table->handle = ++nft_net->table_handle; + if (table->flags & NFT_TABLE_F_OWNER) + table->nlpid = NETLINK_CB(skb).portid; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err_trans; - list_add_tail_rcu(&table->list, &net->nft.tables); + list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: + kfree(table->udata); +err_table_udata: kfree(table->name); err_strdup: kfree(table); @@ -1050,6 +1662,9 @@ 
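The table hunks above thread the caller's netlink port ID through the table lookups and record it in table->nlpid when NFT_TABLE_F_OWNER is set, so an owned table is only addressable by the socket that created it (the dump path also exports it as NFTA_TABLE_OWNER). A minimal sketch of that ownership test, assuming only the nft_table_has_owner() helper already used in this diff; the function name below is illustrative, not the kernel's:

	static bool nft_table_accessible(const struct nft_table *table, u32 portid)
	{
		/* unowned tables are shared; owned tables only answer to the
		 * netlink port ID that created them
		 */
		if (!nft_table_has_owner(table))
			return true;

		return table->nlpid == portid;
	}

The read-only gettable path above passes a port ID of 0, which suggests plain dumps are not subject to the ownership restriction.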
static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_binding(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1061,8 +1676,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - !list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -1092,6 +1706,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_binding(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1106,11 +1723,12 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { - struct nft_table *table, *nt; + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nlattr * const *nla = ctx->nla; + struct nft_table *table, *nt; int err = 0; - list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; @@ -1119,6 +1737,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) if (!nft_is_active_next(ctx->net, table)) continue; + if (nft_table_has_owner(table) && table->nlpid != ctx->portid) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -1133,37 +1754,42 @@ out: return err; } -static int nf_tables_deltable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; - nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla); if (family == AF_UNSPEC || (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; - table = nft_table_lookup_byhandle(net, attr, genmask); + table = nft_table_lookup_byhandle(net, attr, family, genmask, + NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (info->nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) return -EBUSY; @@ -1173,14 +1799,15 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, return nft_flush_table(&ctx); } -static void nf_tables_table_destroy(struct nft_ctx *ctx) +static void nf_tables_table_destroy(struct nft_table *table) { - if (WARN_ON(ctx->table->use > 0)) + if (WARN_ON(table->use > 0)) return; - rhltable_destroy(&ctx->table->chains_ht); - kfree(ctx->table->name); - kfree(ctx->table); + rhltable_destroy(&table->chains_ht); + kfree(table->name); + 
kfree(table->udata); + kfree(table); } void nft_register_chain_type(const struct nft_chain_type *ctype) @@ -1224,7 +1851,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - return lockdep_is_held(&net->nft.commit_mutex); + struct nftables_pernet *nft_net = nft_pernet(net); + + return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif @@ -1241,7 +1870,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); @@ -1274,6 +1903,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, + [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1298,10 +1930,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { - seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + seq = u64_stats_fetch_begin(&cpu_stats->syncp); pkts = cpu_stats->pkts; bytes = cpu_stats->bytes; - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq)); total.pkts += pkts; total.bytes += bytes; } @@ -1322,8 +1954,22 @@ nla_put_failure: return -ENOSPC; } -static int nft_dump_basechain_hook(struct sk_buff *skb, int family, - const struct nft_base_chain *basechain) +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? 
NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + +static int nft_dump_basechain_hook(struct sk_buff *skb, + const struct net *net, int family, + const struct nft_base_chain *basechain, + const struct list_head *hook_list) { const struct nf_hook_ops *ops = &basechain->ops; struct nft_hook *hook, *first = NULL; @@ -1338,21 +1984,28 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; - if (family == NFPROTO_NETDEV) { + if (nft_base_chain_netdev(family, ops->hooknum)) { nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); - list_for_each_entry(hook, &basechain->hook_list, list) { + if (!nest_devs) + goto nla_put_failure; + + if (!hook_list) + hook_list = &basechain->hook_list; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (!first) first = hook; - if (nla_put_string(skb, NFTA_DEVICE_NAME, - hook->ops.dev->name)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1365,34 +2018,35 @@ nla_put_failure: static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + const struct list_head *hook_list) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - - if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), + if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) || + nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) || + nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), NFTA_CHAIN_PAD)) goto nla_put_failure; - if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) - goto nla_put_failure; + + if (!hook_list && + (event == NFT_MSG_DELCHAIN || + event == NFT_MSG_DESTROYCHAIN)) { + nlmsg_end(skb, nlh); + return 0; + } if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, family, basechain)) + if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1406,16 +2060,19 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) 
goto nla_put_failure; + if (chain->udata && + nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1424,9 +2081,12 @@ nla_put_failure: return -1; } -static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, + const struct list_head *hook_list) { + struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -1437,16 +2097,19 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain); + event, flags, ctx->family, ctx->table, + ctx->chain, hook_list); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_net = nft_pernet(ctx->net); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -1456,16 +2119,18 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_table *table; - const struct nft_chain *chain; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; + const struct nft_table *table; + const struct nft_chain *chain; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -1483,7 +2148,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, - chain) < 0) + chain, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1498,29 +2163,28 @@ done: } /* called with rcu_read_lock held */ -static int nf_tables_getchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -1537,14 +2201,14 @@ static int nf_tables_getchain(struct net 
*net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, - family, table, chain); + info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, + 0, family, table, chain, NULL); if (err < 0) - goto err; + goto err_fill_chain_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_chain_info: kfree_skb(skb2); return err; } @@ -1564,14 +2228,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return ERR_PTR(-EINVAL); + return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -1585,38 +2249,39 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct nft_trans *trans) +static void nft_chain_stats_replace(struct nft_trans_chain *trans) { - struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); + const struct nft_trans *t = &trans->nft_trans_binding.nft_trans; + struct nft_base_chain *chain = nft_base_chain(trans->chain); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) return; - nft_trans_chain_stats(trans) = - rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + trans->stats = + rcu_replace_pointer(chain->stats, trans->stats, + lockdep_commit_lock_is_held(t->net)); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) static_branch_inc(&nft_counters_enabled); } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) { - struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0); - struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1); + struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0); + struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1); if (g0 != g1) kvfree(g1); kvfree(g0); /* should be NULL either via abort or via successful commit */ - WARN_ON_ONCE(chain->rules_next); - kvfree(chain->rules_next); + WARN_ON_ONCE(chain->blob_next); + kvfree(chain->blob_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = ctx->chain; + const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) @@ -1628,11 +2293,11 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); - if (ctx->family == NFPROTO_NETDEV) { + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -1641,41 +2306,57 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) free_percpu(rcu_dereference_raw(basechain->stats)); } kfree(chain->name); + kfree(chain->udata); kfree(basechain); } else { kfree(chain->name); + kfree(chain->udata); kfree(chain); } } static struct nft_hook 
*nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { + struct nf_hook_ops *ops; struct net_device *dev; - char ifname[IFNAMSIZ]; struct nft_hook *hook; int err; - hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); + if (!hook) + return ERR_PTR(-ENOMEM); - nla_strlcpy(ifname, attr, IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; - hook->inactive = false; + INIT_LIST_HEAD(&hook->ops_list); + + err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); + if (err < 0) + goto err_hook_free; + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; + + /* nf_tables_netdev_event() is called under rtnl_mutex, this is + * indirectly serializing all the other holders of the commit_mutex with + * the rtnl_mutex. + */ + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -1685,7 +2366,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (this->ops.dev == hook->ops.dev) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -1694,25 +2376,36 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, static int nf_tables_parse_netdev_hooks(struct net *net, const struct nlattr *attr, - struct list_head *hook_list) + struct list_head *hook_list, + struct netlink_ext_ack *extack) { struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { + NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); goto err_hook; } if (nft_hook_list_find(hook_list, hook)) { - kfree(hook); + NL_SET_BAD_ATTR(extack, tmp); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -1730,7 +2423,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -1742,44 +2435,48 @@ struct nft_chain_hook { struct list_head list; }; -static int nft_chain_parse_netdev(struct net *net, - struct nlattr *tb[], - struct list_head *hook_list) +static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], + struct list_head *hook_list, + struct netlink_ext_ack *extack, u32 flags) { struct nft_hook *hook; int err; if (tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); - if (IS_ERR(hook)) + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); + if (IS_ERR(hook)) { + NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return 
PTR_ERR(hook); + } list_add_tail(&hook->list, hook_list); } else if (tb[NFTA_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], - hook_list); + hook_list, extack); if (err < 0) return err; - if (list_empty(hook_list)) - return -EINVAL; - } else { - return -EINVAL; } + if (flags & NFT_CHAIN_HW_OFFLOAD && + list_empty(hook_list)) + return -EINVAL; + return 0; } static int nft_chain_parse_hook(struct net *net, + struct nft_base_chain *basechain, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, - bool autoload) + u32 flags, struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, @@ -1788,38 +2485,69 @@ static int nft_chain_parse_hook(struct net *net, if (err < 0) return err; - if (ha[NFTA_HOOK_HOOKNUM] == NULL || - ha[NFTA_HOOK_PRIORITY] == NULL) - return -EINVAL; + if (!basechain) { + if (!ha[NFTA_HOOK_HOOKNUM] || + !ha[NFTA_HOOK_PRIORITY]) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); + return -ENOENT; + } - hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); - if (!type) - return -EOPNOTSUPP; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; - if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], - family, autoload); - if (IS_ERR(type)) - return PTR_ERR(type); - } - if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) - return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { + type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], + family, true); + if (IS_ERR(type)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); + return PTR_ERR(type); + } + } + if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) + return -EOPNOTSUPP; - if (type->type == NFT_CHAIN_T_NAT && - hook->priority <= NF_IP_PRI_CONNTRACK) - return -EOPNOTSUPP; + if (type->type == NFT_CHAIN_T_NAT && + hook->priority <= NF_IP_PRI_CONNTRACK) + return -EOPNOTSUPP; + } else { + if (ha[NFTA_HOOK_HOOKNUM]) { + hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + if (hook->num != basechain->ops.hooknum) + return -EOPNOTSUPP; + } + if (ha[NFTA_HOOK_PRIORITY]) { + hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + if (hook->priority != basechain->ops.priority) + return -EOPNOTSUPP; + } - if (!try_module_get(type->owner)) + if (nla[NFTA_CHAIN_TYPE]) { + type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + family); + if (!type) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); + return -ENOENT; + } + } else { + type = basechain->type; + } + } + + if (!try_module_get(type->owner)) { + if (nla[NFTA_CHAIN_TYPE]) + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return -ENOENT; + } hook->type = type; INIT_LIST_HEAD(&hook->list); - if (family == NFPROTO_NETDEV) { - err = nft_chain_parse_netdev(net, ha, &hook->list); + if (nft_base_chain_netdev(family, hook->num)) { + err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags); if (err < 0) { module_put(type->owner); return err; @@ -1838,99 +2566,133 @@ 
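The netdev-hook hunks above stop resolving a device at parse time and instead store the interface name together with a compare length: for NFTA_DEVICE_NAME the terminating NUL is counted in ifnamelen, so only an exact name matches, while NFTA_DEVICE_PREFIX leaves it out and nft_netdev_hook_alloc() attaches one nf_hook_ops to every device whose name starts with the prefix. A small sketch of that comparison under those assumptions (the helper name is made up for illustration):

	/* "eth" stored as a prefix matches eth0, eth1, ...; "eth0" stored as a
	 * plain name matches only eth0 because the trailing NUL is compared too.
	 */
	static bool nft_hook_matches_dev(const struct nft_hook *hook,
					 const struct net_device *dev)
	{
		return strncmp(dev->name, hook->ifname, hook->ifnamelen) == 0;
	}

A similar length-aware strncmp() is what nft_hook_list_find() above uses to detect duplicate hook entries.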
static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } -struct nft_rules_old { - struct rcu_head h; - struct nft_rule **start; -}; +static void nft_last_rule(const struct nft_chain *chain, const void *ptr) +{ + struct nft_rule_dp_last *lrule; + + BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0); + + lrule = (struct nft_rule_dp_last *)ptr; + lrule->end.is_last = 1; + lrule->chain = chain; + /* blob size does not include the trailer rule */ +} -static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain, - unsigned int alloc) +static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain, + unsigned int size) { - if (alloc > INT_MAX) + struct nft_rule_blob *blob; + + if (size > INT_MAX) return NULL; - alloc += 1; /* NULL, ends rules */ - if (sizeof(struct nft_rule *) > INT_MAX / alloc) + size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last); + + blob = kvmalloc(size, GFP_KERNEL_ACCOUNT); + if (!blob) return NULL; - alloc *= sizeof(struct nft_rule *); - alloc += sizeof(struct nft_rules_old); + blob->size = 0; + nft_last_rule(chain, blob->data); - return kvmalloc(alloc, GFP_KERNEL); + return blob; } static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { - ops->pf = family; - ops->hooknum = hook->num; - ops->priority = hook->priority; - ops->priv = chain; - ops->hook = hook->type->hooks[ops->hooknum]; + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; + ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; INIT_LIST_HEAD(&basechain->hook_list); chain = &basechain->chain; - if (family == NFPROTO_NETDEV) { + if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); - - basechain->ops.hooknum = hook->num; - basechain->ops.priority = hook->priority; - } else { - nft_basechain_hook_init(&basechain->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } + nft_basechain_hook_init(&basechain->ops, family, hook, chain); - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) + !nft_chain_offload_support(basechain)) { + list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; + } flow_block_init(&basechain->flow_block); return 0; } -static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy, u32 flags) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +{ + int err; + + err = rhltable_insert_key(&table->chains_ht, chain->name, + &chain->rhlhead, nft_chain_ht_params); + if (err) + return err; + + list_add_tail_rcu(&chain->list, &table->chains); + + return 0; +} + +static u64 chain_id; + +static int 
nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, + u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; - struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; + struct nft_rule_blob *blob; struct nft_trans *trans; struct nft_chain *chain; - struct nft_rule **rules; int err; - if (table->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_CHAIN_HOOK]) { - struct nft_chain_hook hook; + struct nft_stats __percpu *stats = NULL; + struct nft_chain_hook hook = {}; - err = nft_chain_parse_hook(net, nla, &hook, family, true); + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + + err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags, + extack); if (err < 0) return err; - basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT); if (basechain == NULL) { nft_chain_release_hook(&hook); return -ENOMEM; @@ -1939,145 +2701,192 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { + if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); - return PTR_ERR(stats); + return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); - static_branch_inc(&nft_counters_enabled); } err = nft_basechain_init(basechain, family, &hook, flags); if (err < 0) { nft_chain_release_hook(&hook); kfree(basechain); + free_percpu(stats); return err; } + if (stats) + static_branch_inc(&nft_counters_enabled); } else { - chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + + chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); + } else { + if (!(flags & NFT_CHAIN_BINDING)) { + err = -EINVAL; + goto err_destroy_chain; + } + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT); + } + if (!chain->name) { err = -ENOMEM; - goto err1; + goto err_destroy_chain; } - rules = nf_tables_chain_alloc_rules(chain, 0); - if (!rules) { - err = -ENOMEM; - goto err1; + if (nla[NFTA_CHAIN_USERDATA]) { + chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT); + if (chain->udata == NULL) { + err = -ENOMEM; + goto err_destroy_chain; + } + chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } - *rules = NULL; - rcu_assign_pointer(chain->rules_gen_0, rules); - rcu_assign_pointer(chain->rules_gen_1, rules); + blob = nf_tables_chain_alloc_rules(chain, 0); + if (!blob) { + err = -ENOMEM; + goto err_destroy_chain; + } - err = nf_tables_register_hook(net, table, chain); - if (err < 0) - goto err1; + RCU_INIT_POINTER(chain->blob_gen_0, blob); + RCU_INIT_POINTER(chain->blob_gen_1, blob); - err = rhltable_insert_key(&table->chains_ht, chain->name, - &chain->rhlhead, nft_chain_ht_params); - if (err) - goto err2; + if (!nft_use_inc(&table->use)) { + err = -EMFILE; + goto err_destroy_chain; + } trans = 
nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - rhltable_remove(&table->chains_ht, &chain->rhlhead, - nft_chain_ht_params); - goto err2; + goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; - table->use++; - list_add_tail_rcu(&chain->list, &table->chains); - - return 0; -err2: - nf_tables_unregister_hook(net, table, chain); -err1: - nf_tables_chain_destroy(ctx); - - return err; -} + err = nft_chain_add(table, chain); + if (err < 0) + goto err_chain_add; -static bool nft_hook_list_equal(struct list_head *hook_list1, - struct list_head *hook_list2) -{ - struct nft_hook *hook; - int n = 0, m = 0; + /* This must be LAST to ensure no packets are walking over this chain. */ + err = nf_tables_register_hook(net, table, chain); + if (err < 0) + goto err_register_hook; - n = 0; - list_for_each_entry(hook, hook_list2, list) { - if (!nft_hook_list_find(hook_list1, hook)) - return false; + return 0; - n++; - } - list_for_each_entry(hook, hook_list1, list) - m++; +err_register_hook: + nft_chain_del(chain); +err_chain_add: + nft_trans_destroy(trans); +err_trans: + nft_use_dec_restore(&table->use); +err_destroy_chain: + nf_tables_chain_destroy(chain); - return n == m; + return err; } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, - u32 flags) + u32 flags, const struct nlattr *attr, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; + struct nft_base_chain *basechain = NULL; struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; - struct nft_base_chain *basechain; - struct nft_stats *stats = NULL; - struct nft_chain_hook hook; + struct nft_chain_hook hook = {}; + struct nft_stats __percpu *stats = NULL; + struct nftables_pernet *nft_net; + struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; + bool unregister = false; int err; if (chain->flags ^ flags) return -EOPNOTSUPP; + INIT_LIST_HEAD(&hook.list); + if (nla[NFTA_CHAIN_HOOK]) { - if (!nft_is_base_chain(chain)) - return -EBUSY; + if (!nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, attr); + return -EEXIST; + } - err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, - false); + basechain = nft_base_chain(chain); + err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook, + ctx->family, flags, extack); if (err < 0) return err; - basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); - return -EBUSY; + NL_SET_BAD_ATTR(extack, attr); + return -EEXIST; } - if (ctx->family == NFPROTO_NETDEV) { - if (!nft_hook_list_equal(&basechain->hook_list, - &hook.list)) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { + list_for_each_entry_safe(h, next, &hook.list, list) { + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } + + if (nft_hook_list_find(&basechain->hook_list, h)) { + list_del(&h->list); + nft_netdev_hook_free(h); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWCHAIN || + trans->table != ctx->table || + !nft_trans_chain_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) { + 
nft_chain_release_hook(&hook); + return -EEXIST; + } + } } } else { ops = &basechain->ops; if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); - return -EBUSY; + NL_SET_BAD_ATTR(extack, attr); + return -EEXIST; } } - nft_chain_release_hook(&hook); } if (nla[NFTA_CHAIN_HANDLE] && @@ -2086,24 +2895,52 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); - if (!IS_ERR(chain2)) - return -EEXIST; + if (!IS_ERR(chain2)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); + err = -EEXIST; + goto err_hooks; + } + } + + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nft_is_base_chain(chain) && + !list_empty(&hook.list)) { + basechain = nft_base_chain(chain); + ops = &basechain->ops; + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { + err = nft_netdev_register_hooks(ctx->net, &hook.list); + if (err < 0) + goto err_hooks; + + unregister = true; + } } if (nla[NFTA_CHAIN_COUNTERS]) { - if (!nft_is_base_chain(chain)) - return -EOPNOTSUPP; + if (!nft_is_base_chain(chain)) { + err = -EOPNOTSUPP; + goto err_hooks; + } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) - return PTR_ERR(stats); + if (IS_ERR_PCPU(stats)) { + err = PTR_ERR_PCPU(stats); + goto err_hooks; + } } err = -ENOMEM; - trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN, - sizeof(struct nft_trans_chain)); + trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN); if (trans == NULL) - goto err; + goto err_trans; nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; @@ -2115,56 +2952,98 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *tmp; char *name; err = -ENOMEM; - name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); if (!name) - goto err; + goto err_trans; err = -EEXIST; - list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) { + list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && - tmp->ctx.table == table && + tmp->table == table && nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); - goto err; + goto err_trans; } } nft_trans_chain_name(trans) = name; } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + nft_trans_basechain(trans) = basechain; + INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); + list_splice(&hook.list, &nft_trans_chain_hooks(trans)); + if (nla[NFTA_CHAIN_HOOK]) + module_put(hook.type->owner); + + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; -err: + +err_trans: free_percpu(stats); kfree(trans); +err_hooks: + if (nla[NFTA_CHAIN_HOOK]) { + list_for_each_entry_safe(h, next, &hook.list, list) { + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } + list_del(&h->list); + nft_netdev_hook_free_rcu(h); + } + module_put(hook.type->owner); + } + return err; } -static int nf_tables_newchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct 
netlink_ext_ack *extack) +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net = nft_pernet(net); + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain(trans)->table == table && + id == nft_trans_chain_id(trans) && + nft_active_genmask(nft_trans_chain(trans), genmask)) + return nft_trans_chain(trans); + } + return ERR_PTR(-ENOENT); +} + +static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct nft_chain *chain = NULL; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; u32 flags = 0; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -2181,7 +3060,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { @@ -2190,6 +3069,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { @@ -2220,31 +3101,89 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (chain->flags & NFT_CHAIN_BINDING) + return -EINVAL; + + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; - return nf_tables_updchain(&ctx, genmask, policy, flags); + flags |= chain->flags & NFT_CHAIN_BASE; + return nf_tables_updchain(&ctx, genmask, policy, flags, attr, + extack); } - return nf_tables_addchain(&ctx, family, genmask, policy, flags); + return nf_tables_addchain(&ctx, family, policy, flags, extack); } -static int nf_tables_delchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nft_delchain_hook(struct nft_ctx *ctx, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + const struct 
nft_chain *chain = &basechain->chain; + const struct nlattr * const *nla = ctx->nla; + struct nft_chain_hook chain_hook = {}; + struct nft_hook *this, *hook; + LIST_HEAD(chain_del_list); + struct nft_trans *trans; + int err; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, + ctx->family, chain->flags, extack); + if (err < 0) + return err; + + list_for_each_entry(this, &chain_hook.list, list) { + hook = nft_hook_list_find(&basechain->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_chain_del_hook; + } + list_move(&hook->list, &chain_del_list); + } + + trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); + if (!trans) { + err = -ENOMEM; + goto err_chain_del_hook; + } + + nft_trans_basechain(trans) = basechain; + nft_trans_chain_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); + list_splice(&chain_del_list, &nft_trans_chain_hooks(trans)); + nft_chain_release_hook(&chain_hook); + + nft_trans_commit_list_add_tail(ctx->net, trans); + + return 0; + +err_chain_del_hook: + list_splice(&chain_del_list, &basechain->hook_list); + nft_chain_release_hook(&chain_hook); + + return err; +} + +static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; @@ -2254,7 +3193,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, u32 use; int err; - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -2269,16 +3209,36 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); + + if (nla[NFTA_CHAIN_HOOK]) { + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + + if (nft_is_base_chain(chain)) { + struct nft_base_chain *basechain = nft_base_chain(chain); + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_delchain_hook(&ctx, basechain, extack); + } + } + + if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); - use = chain->use; list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(net, rule)) @@ -2307,13 +3267,16 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, /** * nft_register_expr - register nf_tables expr type - * @ops: expr type + * @type: expr type * * Registers the expr type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. 
*/ int nft_register_expr(struct nft_expr_type *type) { + if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) + return -ENOMEM; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -2326,7 +3289,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr); /** * nft_unregister_expr - unregister nf_tables expr type - * @ops: expr type + * @type: expr type * * Unregisters the expr typefor use with nf_tables. */ @@ -2343,7 +3306,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -2375,9 +3338,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -2401,7 +3368,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { }; static int nf_tables_fill_expr_info(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; @@ -2411,7 +3378,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr) < 0) + if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } @@ -2423,14 +3390,14 @@ nla_put_failure: }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) + if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -2441,6 +3408,7 @@ nla_put_failure: struct nft_expr_info { const struct nft_expr_ops *ops; + const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; @@ -2488,7 +3456,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, } else ops = type->ops; + info->attr = nla; info->ops = ops; + return 0; err1: @@ -2496,16 +3466,65 @@ err1: return err; } +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info) +{ + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + const struct nft_expr_type *type; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) + return -EINVAL; + + rcu_read_lock(); + + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + if (!type) { + err = -ENOENT; + goto out_unlock; + } + + if (!type->inner_ops) { + err = -EOPNOTSUPP; + goto out_unlock; + } + + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); + if (err < 0) + goto out_unlock; + + info->attr = nla; + info->ops = type->inner_ops; + + /* No module reference will be taken on type->owner. 
+ * Presence of type->inner_ops implies that the expression + * is builtin, so it cannot go away. + */ + rcu_read_unlock(); + return 0; + +out_unlock: + rcu_read_unlock(); + return err; +} + static int nf_tables_newexpr(const struct nft_ctx *ctx, - const struct nft_expr_info *info, + const struct nft_expr_info *expr_info, struct nft_expr *expr) { - const struct nft_expr_ops *ops = info->ops; + const struct nft_expr_ops *ops = expr_info->ops; int err; expr->ops = ops; if (ops->init) { - err = ops->init(ctx, expr, (const struct nlattr **)info->tb); + err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb); if (err < 0) goto err1; } @@ -2529,49 +3548,52 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, const struct nlattr *nla) { - struct nft_expr_info info; + struct nft_expr_info expr_info; struct nft_expr *expr; struct module *owner; int err; - err = nf_tables_expr_parse(ctx, nla, &info); + err = nf_tables_expr_parse(ctx, nla, &expr_info); if (err < 0) - goto err1; + goto err_expr_parse; + + err = -EOPNOTSUPP; + if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL)) + goto err_expr_stateful; err = -ENOMEM; - expr = kzalloc(info.ops->size, GFP_KERNEL); + expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT); if (expr == NULL) - goto err2; + goto err_expr_stateful; - err = nf_tables_newexpr(ctx, &info, expr); + err = nf_tables_newexpr(ctx, &expr_info, expr); if (err < 0) - goto err3; + goto err_expr_new; return expr; -err3: +err_expr_new: kfree(expr); -err2: - owner = info.ops->type->owner; - if (info.ops->type->release_ops) - info.ops->type->release_ops(info.ops); +err_expr_stateful: + owner = expr_info.ops->type->owner; + if (expr_info.ops->type->release_ops) + expr_info.ops->type->release_ops(expr_info.ops); module_put(owner); -err1: +err_expr_parse: return ERR_PTR(err); } -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } + if (WARN_ON_ONCE(!src->ops->clone)) + return -EINVAL; + + dst->ops = src->ops; + err = src->ops->clone(dst, src, gfp); + if (err < 0) + return err; __module_get(src->ops->type->owner); @@ -2588,13 +3610,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *__nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry_rcu(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list, + lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } @@ -2602,13 +3626,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -2617,13 +3642,14 @@ static const struct nla_policy 
nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, - [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -2631,24 +3657,19 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, - const struct nft_rule *prule) + const struct nft_rule *rule, u64 handle, + bool reset) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, + nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) @@ -2657,18 +3678,20 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if (event != NFT_MSG_DELRULE && prule) { - if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle), + if (event != NFT_MSG_DELRULE && handle) { + if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_stats(chain, rule); + list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); @@ -2691,7 +3714,11 @@ nla_put_failure: static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + const struct nft_rule *prule; struct sk_buff *skb; + u64 handle = 0; + u16 flags = 0; int err; if (!ctx->report && @@ -2702,24 +3729,48 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (event == NFT_MSG_NEWRULE && + !list_is_first(&rule->list, &ctx->chain->rules) && + !list_is_last(&rule->list, &ctx->chain->rules)) { + prule = list_prev_entry(rule, list); + handle = prule->handle; + } + if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) + flags |= NLM_F_APPEND; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain, rule, NULL); + event, flags, ctx->family, ctx->table, + ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - 
ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } +static void audit_log_rule_reset(const struct nft_table *table, + unsigned int base_seq, + unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", + table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); +} + struct nft_rule_dump_ctx { + unsigned int s_idx; char *table; char *chain; + bool reset; }; static int __nf_tables_dump_rules(struct sk_buff *skb, @@ -2728,59 +3779,71 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, const struct nft_table *table, const struct nft_chain *chain) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; - unsigned int s_idx = cb->args[0]; + unsigned int entries = 0; + int ret = 0; + u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) goto cont_skip; - if (*idx < s_idx) + if (*idx < ctx->s_idx) goto cont; - if (*idx > s_idx) { - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - } + if (prule) + handle = prule->handle; + else + handle = 0; + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, prule) < 0) - return 1; - + table, chain, rule, handle, ctx->reset) < 0) { + ret = 1; + break; + } + entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: prule = rule; cont_skip: (*idx)++; } - return 0; + + if (ctx->reset && entries) + audit_log_rule_reset(table, cb->seq, entries); + + return ret; } static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; - if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) + if (ctx->table && strcmp(ctx->table, table->name) != 0) continue; - if (ctx && ctx->table && ctx->chain) { + if (ctx->table && ctx->chain) { struct rhlist_head *list, *tmp; list = rhltable_lookup(&table->chains_ht, ctx->chain, @@ -2799,129 +3862,198 @@ static int nf_tables_dump_rules(struct sk_buff *skb, } list_for_each_entry_rcu(chain, &table->chains, list) { - if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + if (__nf_tables_dump_rules(skb, &idx, + cb, table, chain)) goto done; } - if (ctx && ctx->table) + if (ctx->table) break; } done: rcu_read_unlock(); - cb->args[0] = idx; + ctx->s_idx = idx; return skb->len; } +static int nf_tables_dumpreset_rules(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + /* Mutex is held is to prevent that two concurrent dump-and-reset calls + * do not underrun counters and quotas. 
The commit_mutex is used for + * the lack a better lock, this is not transaction path. + */ + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_rules(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_rules_start(struct netlink_callback *cb) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; - struct nft_rule_dump_ctx *ctx = NULL; - if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); - if (!ctx) - return -ENOMEM; + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - if (nla[NFTA_RULE_TABLE]) { - ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_ATOMIC); - if (!ctx->table) { - kfree(ctx); - return -ENOMEM; - } - } - if (nla[NFTA_RULE_CHAIN]) { - ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_ATOMIC); - if (!ctx->chain) { - kfree(ctx->table); - kfree(ctx); - return -ENOMEM; - } + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC); + if (!ctx->table) + return -ENOMEM; + } + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC); + if (!ctx->chain) { + kfree(ctx->table); + return -ENOMEM; } } - - cb->data = ctx; return 0; } +static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) +{ + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_rules_start(cb); +} + static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; - if (ctx) { - kfree(ctx->table); - kfree(ctx->chain); - kfree(ctx); - } + kfree(ctx->table); + kfree(ctx->chain); return 0; } -/* called with rcu_read_lock held */ -static int nf_tables_getrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +/* Caller must hold rcu read lock or transaction mutex */ +static struct sk_buff * +nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start= nf_tables_dump_rules_start, - .dump = nf_tables_dump_rules, - .done = nf_tables_dump_rules_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); - } - - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + return ERR_CAST(chain); } - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); - return PTR_ERR(rule); + return 
ERR_CAST(rule); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, NULL); - if (err < 0) - goto err; + err = nf_tables_fill_rule_info(skb2, net, portid, + info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, + family, table, chain, rule, 0, reset); + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return skb2; +} -err: - kfree_skb(skb2); - return err; +static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dump_rules_start, + .dump = nf_tables_dump_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + skb2 = nf_tables_getrule_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + return nfnetlink_unicast(skb2, net, portid); } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +static int nf_tables_getrule_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dumpreset_rules_start, + .dump = nf_tables_dumpreset_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getrule_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_RULE_TABLE]), + (char *)nla_data(nla[NFTA_RULE_TABLE]), + nft_base_seq(net)); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); + + return nfnetlink_unicast(skb2, net, portid); +} + +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; @@ -2930,7 +4062,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). 
*/ expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; @@ -2938,17 +4070,27 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +/* can only be used if rule is no longer visible to dumps */ +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. + */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; - const struct nft_data *data; struct nft_rule *rule; int err; @@ -2956,6 +4098,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { + if (fatal_signal_pending(current)) + return -EINTR; + if (!nft_is_active_next(ctx->net, rule)) continue; @@ -2963,7 +4108,10 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; - err = expr->ops->validate(ctx, expr, &data); + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ + err = expr->ops->validate(ctx, expr); if (err < 0) return err; } @@ -2990,86 +4138,160 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) err = nft_chain_validate(&ctx, chain); if (err < 0) return err; + + cond_resched(); } return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; + struct nft_set_elem_catchall *catchall; + + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, dummy_iter.genmask)) + continue; + + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); + if (ret < 0) + return ret; + } + + return ret; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla); #define NFT_RULE_MAXEXPRS 128 
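[Editor's illustration, not part of the patch above.] The nft_chain_validate() kernel-doc in this hunk describes a depth-limited walk that chases every jump/goto to other chains until either all reachable chains have been validated or the jump limit is hit. As a rough standalone sketch of that pattern only: the types below (toy_chain, toy_rule) and TOY_JUMP_STACK_SIZE are hypothetical stand-ins for struct nft_chain, struct nft_rule and NFT_JUMP_STACK_SIZE, and the sketch omits what the real code also does (chasing verdict-map elements via nft_setelem_validate() and aborting on fatal signals).

/* toy_chain_validate.c - simplified, depth-limited chain walk */
#include <stdio.h>

#define TOY_JUMP_STACK_SIZE 16          /* stands in for NFT_JUMP_STACK_SIZE */

struct toy_chain;

struct toy_rule {
	struct toy_chain *jump_target;  /* NULL: rule has no jump/goto verdict */
};

struct toy_chain {
	const char *name;
	struct toy_rule *rules;
	int nrules;
};

/* Recursively follow every jump/goto; fail once the nesting limit is
 * exceeded (returns -1, analogous to -EMLINK in the kernel code). */
static int toy_chain_validate(const struct toy_chain *chain, int level)
{
	int i, err;

	if (level > TOY_JUMP_STACK_SIZE)
		return -1;

	for (i = 0; i < chain->nrules; i++) {
		const struct toy_rule *rule = &chain->rules[i];

		if (!rule->jump_target)
			continue;

		/* recursive step: callers increment the depth counter,
		 * mirroring how expr->ops->validate() bumps ctx->level */
		err = toy_chain_validate(rule->jump_target, level + 1);
		if (err < 0)
			return err;
	}
	return 0;
}

int main(void)
{
	struct toy_chain leaf = { .name = "leaf", .rules = NULL, .nrules = 0 };
	struct toy_rule rules[1] = { { .jump_target = &leaf } };
	struct toy_chain base = { .name = "base", .rules = rules, .nrules = 1 };

	printf("validate: %d\n", toy_chain_validate(&base, 0));
	return 0;
}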
-static int nf_tables_newrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - struct nft_expr_info *info = NULL; - int family = nfmsg->nfgen_family; - struct nft_flow_rule *flow; - struct nft_table *table; - struct nft_chain *chain; + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + unsigned int size, i, n, ulen = 0, usize = 0; + u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; + struct nft_expr_info *expr_info = NULL; + u8 family = info->nfmsg->nfgen_family; + struct nft_flow_rule *flow = NULL; + struct net *net = info->net; struct nft_userdata *udata; - struct nft_trans *trans = NULL; + struct nft_table *table; + struct nft_chain *chain; + struct nft_trans *trans; + u64 handle, pos_handle; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; - u64 handle, pos_handle; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nft_rule_lookup(chain, handle); + rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else return -EOPNOTSUPP; } else { - if (!(nlh->nlmsg_flags & NLM_F_CREATE) || - nlh->nlmsg_flags & NLM_F_REPLACE) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) || + info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); - if (chain->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); } } else if (nla[NFTA_RULE_POSITION_ID]) { - old_rule = nft_rule_lookup_byid(net, 
nla[NFTA_RULE_POSITION_ID]); + old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); return PTR_ERR(old_rule); @@ -3077,34 +4299,36 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { - info = kvmalloc_array(NFT_RULE_MAXEXPRS, - sizeof(struct nft_expr_info), - GFP_KERNEL); - if (!info) + expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, + sizeof(struct nft_expr_info), + GFP_KERNEL); + if (!expr_info) return -ENOMEM; nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) - goto err1; + goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) - goto err1; - err = nf_tables_expr_parse(&ctx, tmp, &info[n]); - if (err < 0) - goto err1; - size += info[n].ops->size; + goto err_release_expr; + err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); + if (err < 0) { + NL_SET_BAD_ATTR(extack, tmp); + goto err_release_expr; + } + size += expr_info[n].ops->size; n++; } } /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) - goto err1; + goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); @@ -3113,9 +4337,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT); if (rule == NULL) - goto err1; + goto err_release_expr; nft_activate_next(net, rule); @@ -3131,38 +4355,56 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_first(rule); for (i = 0; i < n; i++) { - err = nf_tables_newexpr(&ctx, &info[i], expr); - if (err < 0) - goto err2; + err = nf_tables_newexpr(&ctx, &expr_info[i], expr); + if (err < 0) { + NL_SET_BAD_ATTR(extack, expr_info[i].attr); + goto err_release_rule; + } - if (info[i].ops->validate) - nft_validate_state_update(net, NFT_VALIDATE_NEED); + if (expr_info[i].ops->validate) + nft_validate_state_update(table, NFT_VALIDATE_NEED); - info[i].ops = NULL; + expr_info[i].ops = NULL; expr = nft_expr_next(expr); } - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(net, rule); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto err_release_rule; + } + } + + if (!nft_use_inc(&chain->use)) { + err = -EMFILE; + goto err_release_rule; + } + + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_chain_binding(chain)) { + err = -EOPNOTSUPP; + goto err_destroy_flow_rule; + } + + err = nft_delrule(&ctx, old_rule); + if (err < 0) + goto err_destroy_flow_rule; + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } - err = nft_delrule(&ctx, old_rule); - if (err < 0) { - nft_trans_destroy(trans); - goto err2; - } - list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } - if (nlh->nlmsg_flags & NLM_F_APPEND) { + if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); else @@ -3174,65 +4416,68 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_rcu(&rule->list, 
&chain->rules); } } - kvfree(info); - chain->use++; - - if (net->nft.validate_state == NFT_VALIDATE_DO) - return nft_table_validate(net, table); - - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(net, rule); - if (IS_ERR(flow)) - return PTR_ERR(flow); + kvfree(expr_info); + if (flow) nft_trans_flow_rule(trans) = flow; - } + + if (table->validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, table); return 0; -err2: - nf_tables_rule_release(&ctx, rule); -err1: + +err_destroy_flow_rule: + nft_use_dec_restore(&chain->use); + if (flow) + nft_flow_rule_destroy(flow); +err_release_rule: + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); + nf_tables_rule_destroy(&ctx, rule); +err_release_expr: for (i = 0; i < n; i++) { - if (info[i].ops) { - module_put(info[i].ops->type->owner); - if (info[i].ops->type->release_ops) - info[i].ops->type->release_ops(info[i].ops); + if (expr_info[i].ops) { + module_put(expr_info[i].ops->type->owner); + if (expr_info[i].ops->type->release_ops) + expr_info[i].ops->type->release_ops(expr_info[i].ops); } } - kvfree(info); + kvfree(expr_info); + return err; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { - struct nft_rule *rule = nft_trans_rule(trans); - + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE && + nft_trans_rule_chain(trans) == chain && id == nft_trans_rule_id(trans)) - return rule; + return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } -static int nf_tables_delrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - struct nft_table *table; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; + struct net *net = info->net; + struct nft_table *table; struct nft_rule *rule; - int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; + int err = 0; - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3242,24 +4487,34 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + 
NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { - rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); @@ -3273,6 +4528,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; + if (nft_chain_binding(chain)) + continue; ctx.chain = chain; err = nft_delrule_by_chain(&ctx); @@ -3314,23 +4571,18 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * -nft_select_set_ops(const struct nft_ctx *ctx, - const struct nlattr * const nla[], - const struct nft_set_desc *desc, - enum nft_set_policies policy) +nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, + const struct nft_set_desc *desc) { + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; - u32 flags = 0; int i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); - if (nla[NFTA_SET_FLAGS] != NULL) - flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - bops = NULL; best.size = ~0; best.lookup = ~0; @@ -3345,7 +4597,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!ops->estimate(desc, flags, &est)) continue; - switch (policy) { + switch (desc->policy) { case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; @@ -3398,38 +4650,22 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, + [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, +}; + +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, - [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, + [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table = NULL; - - if (nla[NFTA_SET_TABLE] != NULL) { - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); - return PTR_ERR(table); - } - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - -static struct nft_set *nft_set_lookup(const struct nft_table *table, +static struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -3437,7 +4673,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla 
== NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -3460,19 +4697,21 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, } static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { - struct nft_trans *trans; + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans_set *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET) { - struct nft_set *set = nft_trans_set(trans); + /* its likely the id we need is at the tail, not at start */ + list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { + struct nft_set *set = trans->set; - if (id == nft_trans_set_id(trans) && - nft_active_genmask(set, genmask)) - return set; - } + if (id == trans->set_id && + set->table == table && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -3485,12 +4724,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nft_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; - set = nft_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, table, nla_set_id, genmask); } return set; } @@ -3509,6 +4748,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; + if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) + return -EINVAL; + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; @@ -3516,7 +4758,7 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; - if (!nft_is_active_next(ctx->net, set)) + if (!nft_is_active_next(ctx->net, i)) continue; if (!sscanf(i->name, name, &tmp)) continue; @@ -3535,7 +4777,7 @@ cont: free_page((unsigned long)inuse); } - set->name = kasprintf(GFP_KERNEL, name, min + n); + set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n); if (!set->name) return -ENOMEM; @@ -3551,7 +4793,7 @@ cont: return 0; } -static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) +int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) { u64 ms = be64_to_cpu(nla_get_be64(nla)); u64 max = (u64)(~((u64)0)); @@ -3561,11 +4803,11 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return -ERANGE; ms *= NSEC_PER_MSEC; - *result = nsecs_to_jiffies64(ms); + *result = nsecs_to_jiffies64(ms) ? 
: !!ms; return 0; } -static __be64 nf_jiffies64_to_msecs(u64 input) +__be64 nf_jiffies64_to_msecs(u64 input) { return cpu_to_be64(jiffies64_to_msecs(input)); } @@ -3597,26 +4839,53 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, return 0; } +static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) +{ + if (ops->usize) + return ops->usize(size); + + return size; +} + +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. + */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nfgenmsg *nfmsg; - struct nlmsghdr *nlh; + u64 timeout = READ_ONCE(set->timeout); + u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; + struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; + int i; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, ctx->family, NFNETLINK_V0, + nft_base_seq_be16(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -3624,6 +4893,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), NFTA_SET_PAD)) goto nla_put_failure; + + if (event == NFT_MSG_DELSET || + event == NFT_MSG_DESTROYSET) { + nlmsg_end(skb, nlh); + return 0; + } + if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; @@ -3642,13 +4918,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; - if (set->timeout && + if (timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, - nf_jiffies64_to_msecs(set->timeout), + nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; - if (set->gc_int && - nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + if (gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { @@ -3656,14 +4932,16 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } - if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + if (set->udata && + nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); if (!nest) goto nla_put_failure; if (set->size && - nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + nla_put_be32(skb, NFTA_SET_DESC_SIZE, + htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; 
if (set->field_count > 1 && @@ -3672,12 +4950,26 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); - if (set->expr) { + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->expr) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); + } else if (set->num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < set->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + set->exprs[i], false) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); @@ -3692,8 +4984,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { - struct sk_buff *skb; + struct nftables_pernet *nft_net = nft_pernet(ctx->net); u32 portid = ctx->portid; + struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -3704,14 +4998,16 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + + err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, - gfp_flags); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -3724,14 +5020,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; + struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; @@ -3795,25 +5093,31 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; + struct nft_table *table = NULL; + struct net *net = info->net; const struct nft_set *set; - struct nft_ctx ctx; struct sk_buff *skb2; - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_ctx ctx; int err; - /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); - if (err < 0) - return err; + if (nla[NFTA_SET_TABLE]) { + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } + } + + nft_ctx_init(&ctx, 
net, skb, info->nlh, family, table, NULL, nla); - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, @@ -3822,18 +5126,20 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } /* Only accept unspec with dump */ - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); - if (IS_ERR(set)) + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) @@ -3841,19 +5147,15 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) - goto err; + goto err_fill_set_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_set_info: kfree_skb(skb2); return err; } -static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { - [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, -}; - static int nft_set_desc_concat_parse(const struct nlattr *attr, struct nft_set_desc *desc) { @@ -3861,6 +5163,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, u32 len; int err; + if (desc->field_count >= ARRAY_SIZE(desc->field_len)) + return -E2BIG; + err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, nft_concat_policy, NULL); if (err < 0) @@ -3870,9 +5175,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, return -EINVAL; len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); - - if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT) - return -E2BIG; + if (!len || len > U8_MAX) + return -EINVAL; desc->field_len[desc->field_count++] = len; @@ -3882,8 +5186,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { + u32 len = 0, num_regs; struct nlattr *attr; - int rem, err; + int rem, err, i; nla_for_each_nested(attr, nla, rem) { if (nla_type(attr) != NFTA_LIST_ELEM) @@ -3894,6 +5199,16 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, return err; } + for (i = 0; i < desc->field_count; i++) + len += round_up(desc->field_len[i], sizeof(u32)); + + if (len != desc->klen) + return -EINVAL; + + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); + if (num_regs > NFT_REG32_COUNT) + return -E2BIG; + return 0; } @@ -3916,28 +5231,116 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } -static int nf_tables_newset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr * const *nla, + struct nft_expr **exprs, int *num_exprs, + u32 flags) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct nft_expr *expr; + int err, i; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(ctx, set, 
nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[0] = expr; + (*num_exprs)++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nlattr *tmp; + int left; + + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_expr_alloc; + } + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_expr_alloc; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_expr_alloc; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[i++] = expr; + (*num_exprs)++; + } + } + + return 0; + +err_set_expr_alloc: + for (i = 0; i < *num_exprs; i++) + nft_expr_destroy(ctx, exprs[i]); + + return err; +} + +static bool nft_set_is_same(const struct nft_set *set, + const struct nft_set_desc *desc, + struct nft_expr *exprs[], u32 num_exprs, u32 flags) +{ + int i; + + if (set->ktype != desc->ktype || + set->dtype != desc->dtype || + set->flags != flags || + set->klen != desc->klen || + set->dlen != desc->dlen || + set->field_count != desc->field_count || + set->num_exprs != num_exprs) + return false; + + for (i = 0; i < desc->field_count; i++) { + if (set->field_len[i] != desc->field_len[i]) + return false; + } + + for (i = 0; i < num_exprs; i++) { + if (set->exprs[i]->ops != exprs[i]->ops) + return false; + } + + return true; +} + +static u32 nft_set_kernel_size(const struct nft_set_ops *ops, + const struct nft_set_desc *desc) +{ + if (ops->ksize) + return ops->ksize(desc->size); + + return desc->size; +} + +static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_expr *expr = NULL; + struct net *net = info->net; + struct nft_set_desc desc; struct nft_table *table; + unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; + size_t alloc_size; + int num_exprs = 0; char *name; - u64 size; - u64 timeout; - u32 ktype, dtype, flags, policy, gc_int, objtype; - struct nft_set_desc desc; - unsigned char *udata; + int err, i; u16 udlen; - int err; - int i; + u32 flags; + u64 size; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -3947,10 +5350,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, memset(&desc, 0, sizeof(desc)); - ktype = NFT_DATA_VALUE; + desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { - ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); - if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) + desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); + if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } @@ -3964,7 +5367,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -3973,19 +5376,25 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; + if ((flags & (NFT_SET_ANONYMOUS | 
NFT_SET_TIMEOUT | NFT_SET_EVAL)) == + (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == + (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; } - dtype = 0; + desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; - dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); - if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && - dtype != NFT_DATA_VERDICT) + desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); + if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && + desc.dtype != NFT_DATA_VERDICT) return -EINVAL; - if (dtype != NFT_DATA_VERDICT) { + if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); @@ -4000,76 +5409,128 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (!(flags & NFT_SET_OBJECT)) return -EINVAL; - objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); - if (objtype == NFT_OBJECT_UNSPEC || - objtype > NFT_OBJECT_MAX) + desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (desc.objtype == NFT_OBJECT_UNSPEC || + desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else - objtype = NFT_OBJECT_UNSPEC; + desc.objtype = NFT_OBJECT_UNSPEC; - timeout = 0; + desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } - gc_int = 0; + desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } - policy = NFT_SET_POL_PERFORMANCE; - if (nla[NFTA_SET_POLICY] != NULL) - policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + desc.policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) { + desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + switch (desc.policy) { + case NFT_SET_POL_PERFORMANCE: + case NFT_SET_POL_MEMORY: + break; + default: + return -EOPNOTSUPP; + } + } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; + + if (desc.field_count > 1) { + if (!(flags & NFT_SET_CONCAT)) + return -EINVAL; + } else if (flags & NFT_SET_CONCAT) { + return -EINVAL; + } + } else if (flags & NFT_SET_CONCAT) { + return -EINVAL; } - if (nla[NFTA_SET_EXPR]) + if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + struct nft_expr 
*exprs[NFT_SET_EXPR_MAX] = {}; + + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return 0; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); + if (err < 0) + return err; + + if (desc.size) + desc.size = nft_set_kernel_size(set->ops, &desc); + + err = 0; + if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); + err = -EEXIST; + } + + for (i = 0; i < num_exprs; i++) + nft_expr_destroy(&ctx, exprs[i]); + + if (err < 0) + return err; + + return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc, policy); + ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); + if (desc.size) + desc.size = nft_set_kernel_size(ops, &desc); + udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); @@ -4077,12 +5538,20 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, size = 0; if (ops->privsize != NULL) size = ops->privsize(nla, &desc); - - set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); - if (!set) + alloc_size = sizeof(*set) + size + udlen; + if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; - name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); + if (!nft_use_inc(&table->use)) + return -EMFILE; + + set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT); + if (!set) { + err = -ENOMEM; + goto err_alloc; + } + + name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT); if (!name) { err = -ENOMEM; goto err_set_name; @@ -4091,15 +5560,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) - goto err_set_alloc_name; - - if (nla[NFTA_SET_EXPR]) { - expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_alloc_name; - } - } + goto err_set_name; udata = NULL; if (udlen) { @@ -4108,23 +5569,23 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); - set->ops = ops; - set->ktype = ktype; - set->klen = desc.klen; - set->dtype = dtype; - set->objtype = objtype; - set->dlen = desc.dlen; - set->expr = expr; + set->ops = ops; + set->ktype = desc.ktype; + set->klen = desc.klen; + set->dtype = desc.dtype; + set->objtype = desc.objtype; + set->dlen = desc.dlen; set->flags = flags; - set->size = desc.size; - set->policy = policy; - set->udlen = udlen; - set->udata = udata; - set->timeout = timeout; - set->gc_int = gc_int; - set->handle = nf_tables_alloc_handle(table); + set->size = desc.size; + set->policy = desc.policy; + set->udlen = udlen; + set->udata = udata; + set->timeout = desc.timeout; + set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) @@ -4134,88 +5595,133 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (err < 0) goto err_set_init; + err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); + if (err < 0) + goto err_set_destroy; + + set->num_exprs = num_exprs; + set->handle = 
nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err_set_trans; + goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); - table->use++; + return 0; -err_set_trans: - ops->destroy(set); +err_set_expr_alloc: + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(&ctx, set->exprs[i]); +err_set_destroy: + ops->destroy(&ctx, set); err_set_init: - if (expr) - nft_expr_destroy(&ctx, expr); -err_set_alloc_name: kfree(set->name); err_set_name: kvfree(set); +err_alloc: + nft_use_dec_restore(&table->use); + return err; } +static void nft_set_catchall_destroy(const struct nft_ctx *ctx, + struct nft_set *set) +{ + struct nft_set_elem_catchall *next, *catchall; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_del_rcu(&catchall->list); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); + kfree_rcu(catchall, rcu); + } +} + +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { + int i; + if (WARN_ON(set->use > 0)) return; - if (set->expr) - nft_expr_destroy(ctx, set->expr); + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); - kfree(set->name); - kvfree(set); + set->ops->destroy(ctx, set); + nft_set_catchall_destroy(ctx, set); + nft_set_put(set); } -static int nf_tables_delset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - int err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - if (nla[NFTA_SET_TABLE] == NULL) - return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; - set = nft_set_lookup_byhandle(ctx.table, attr, genmask); + set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(ctx.table, attr, genmask); + set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } if (set->use || - (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { + (info->nlh->nlmsg_flags & NLM_F_NONREC && + atomic_read(&set->nelems) > 0)) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nft_delset(&ctx, set); } -static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, - struct nft_set 
*set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len); + +static int nft_setelem_data_validate(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); @@ -4225,14 +5731,50 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, set->dlen); } +static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + return nft_setelem_data_validate(ctx, set, elem_priv); +} + +static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + ret = nft_setelem_data_validate(ctx, set, catchall->elem); + if (ret < 0) + break; + } + + return ret; +} + int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; - struct nft_set_iter iter; - - if (set->use == UINT_MAX) - return -EOVERFLOW; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nf_tables_bind_check_setelem, + }; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -4247,21 +5789,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, goto bind; } - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_bind_check_setelem; - set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_bind_check(ctx, set); + if (iter.err < 0) return iter.err; } bind: + if (!nft_use_inc(&set->use)) + return -EMFILE; + binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); - set->use++; return 0; } @@ -4274,24 +5815,112 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. 
*/ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); + nft_setelem_data_activate(ctx->net, set, elem_priv); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + nft_clear(ctx->net, ext); + nft_setelem_data_activate(ctx->net, set, catchall->elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + +void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) +{ + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + + nft_clear(ctx->net, set); + } + + nft_use_inc_restore(&set->use); +} +EXPORT_SYMBOL_GPL(nf_tables_activate_set); + void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); + else + list_del_rcu(&binding->list); + + nft_use_dec(&set->use); + break; case NFT_TRANS_PREPARE: - set->use--; + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_deactivate_next(ctx->net, set); + } + nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - set->use--; - /* fall through */ + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_use_dec(&set->use); + fallthrough; default: nf_tables_unbind_set(ctx, set, binding, phase == NFT_TRANS_COMMIT); @@ -4313,8 +5942,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, - [NFT_SET_EXT_EXPR] = { - .align = __alignof__(struct nft_expr), + [NFT_SET_EXT_EXPRESSIONS] = { + .align = __alignof__(struct nft_set_elem_expr), }, [NFT_SET_EXT_OBJREF] = { .len = sizeof(struct nft_object *), @@ -4325,12 +5954,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { - .len = sizeof(u64), - .align = __alignof__(u64), - }, - [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(u64), - .align = __alignof__(u64), + .len = sizeof(struct nft_timeout), + .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -4357,6 +5982,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4364,37 +5990,54 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + .len = NFT_TABLE_MAXNAMELEN - 1 }, 
[NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy), [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask) +static int nft_set_elem_expr_dump(struct sk_buff *skb, + const struct nft_set *set, + const struct nft_set_ext *ext, + bool reset) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table; + struct nft_set_elem_expr *elem_expr; + u32 size, num_exprs = 0; + struct nft_expr *expr; + struct nlattr *nest; - table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); - return PTR_ERR(table); - } + elem_expr = nft_set_ext_expr(ext); + nft_setelem_expr_foreach(expr, elem_expr, size) + num_exprs++; + + if (num_exprs == 1) { + expr = nft_setelem_expr_at(elem_expr, 0); + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) + return -1; - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); + return 0; + } else if (num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS); + if (nest == NULL) + goto nla_put_failure; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + expr = nft_setelem_expr_at(elem_expr, size); + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } return 0; + +nla_put_failure: + return -1; } static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_elem_priv *elem_priv, + bool reset) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -4402,7 +6045,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; @@ -4413,12 +6057,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), - set->dtype == NFT_DATA_VERDICT ? 
NFT_DATA_VERDICT : NFT_DATA_VALUE, - set->dlen) < 0) + nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && - nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -4431,25 +6074,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); + u64 set_timeout = READ_ONCE(set->timeout); + __be64 msecs = 0; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - u64 expires, now = get_jiffies_64(); + if (set_timeout != timeout) { + msecs = nf_jiffies64_to_msecs(timeout); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } - expires = *nft_set_ext_expiration(ext); - if (time_before64(now, expires)) - expires -= now; - else - expires = 0; + if (timeout > 0) { + u64 expires, now = get_jiffies_64(); - if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - nf_jiffies64_to_msecs(expires), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); + if (time_before64(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + nf_jiffies64_to_msecs(expires), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -4473,40 +6123,97 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset); +} + +static void audit_log_nft_set_reset(const struct nft_table *table, + unsigned int base_seq, + unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC); + kfree(buf); } struct nft_set_dump_ctx { const struct nft_set *set; struct nft_ctx ctx; + bool reset; }; +static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, + const struct nft_set *set, bool reset, + unsigned int base_seq) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset); + if (reset && !ret) + 
audit_log_nft_set_reset(set->table, base_seq, 1); + break; + } + + return ret; +} + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; - struct nft_set_dump_args args; + struct nft_set_dump_args args = { + .cb = cb, + .skb = skb, + .reset = dump_ctx->reset, + .iter = { + .genmask = nft_genmask_cur(net), + .type = NFT_ITER_READ, + .skip = cb->args[0], + .fn = nf_tables_dump_setelem, + }, + }; bool set_found = false; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; int event; rcu_read_lock(); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); + + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; @@ -4532,16 +6239,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - NLM_F_MULTI); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = table->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) @@ -4551,19 +6253,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.iter.genmask = nft_genmask_cur(net); - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); - rcu_read_unlock(); + if (!args.iter.err && args.iter.count == cb->args[0]) + args.iter.err = nft_set_catchall_dump(net, skb, set, + dump_ctx->reset, cb->seq); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); + rcu_read_unlock(); + if (args.iter.err && args.iter.err != -EMSGSIZE) return args.iter.err; if (args.iter.count == cb->args[0]) @@ -4577,6 +6276,26 @@ nla_put_failure: return -ENOSPC; } +static int nf_tables_dumpreset_set(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + struct nft_set_dump_ctx *dump_ctx = cb->data; + int ret, skip = cb->args[0]; + + mutex_lock(&nft_net->commit_mutex); + + ret = nf_tables_dump_set(skb, cb); + + if (cb->args[0] > skip) + audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, + cb->args[0] - skip); + + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -4596,24 +6315,19 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_elem_priv *elem_priv, + bool reset) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, 
sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -4623,7 +6337,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem_priv, reset); if (err < 0) goto nla_put_failure; @@ -4644,31 +6358,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; + if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) + return -EOPNOTSUPP; if (!(set->flags & NFT_SET_INTERVAL) && *flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; + if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) == + (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) + return -EINVAL; return 0; } -static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, +static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set, struct nft_data *key, struct nlattr *attr) { - struct nft_data_desc desc; - int err; - - err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); - if (err < 0) - return err; - - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(key, desc.type); - return -EINVAL; - } + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = NFT_DATA_VALUE_MAXLEN, + .len = set->klen, + }; - return 0; + return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, @@ -4676,28 +6387,68 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *data, struct nlattr *attr) { - int err; + u32 dtype; - err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); - if (err < 0) - return err; + if (set->dtype == NFT_DATA_VERDICT) + dtype = NFT_DATA_VERDICT; + else + dtype = NFT_DATA_VALUE; - if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { - nft_data_release(data, desc->type); - return -EINVAL; + desc->type = dtype; + desc->size = NFT_DATA_VALUE_MAXLEN; + desc->len = set->dlen; + desc->flags = NFT_DATA_DESC_SETELEM; + + return nft_data_init(ctx, data, desc, attr); +} + +static void *nft_setelem_catchall_get(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + void *priv = NULL; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + priv = catchall->elem; + break; + } + + return priv; +} + +static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + void *priv; + + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + priv = set->ops->get(ctx->net, set, elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + } else { + priv = nft_setelem_catchall_get(ctx->net, set); + if (!priv) + return -ENOENT; } + elem->priv = priv; return 0; } 
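
The hunks above introduce catch-all set elements (NFT_SET_ELEM_CATCHALL): a single keyless element per set, kept on set->catchall_list and fetched via nft_setelem_catchall_get() on the control-plane get path, that applies when no keyed element matches. As an editor's illustration only (not part of this diff), the fall-back idea looks roughly like the sketch below; keyed_lookup_example() is a hypothetical stand-in for the real set backend lookup, while nft_set_catchall_lookup() is the helper exported later in this file.

/*
 * Editor's sketch, not kernel code from this change: resolve a key against
 * a set, falling back to the catch-all element when the keyed lookup
 * misses.  The catch-all is only used if it is active in the current
 * generation and not expired.
 */
static const struct nft_set_ext *
elem_or_catchall_example(const struct net *net, const struct nft_set *set,
			 const u32 *key)
{
	const struct nft_set_ext *ext;

	ext = keyed_lookup_example(net, set, key);	/* hypothetical backend call */
	if (!ext)
		ext = nft_set_catchall_lookup(net, set);

	return ext;	/* may still be NULL: no match and no catch-all element */
}
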
-static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) +static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set, + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; - void *priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -4705,17 +6456,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY]) - return -EINVAL; - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -4724,92 +6477,173 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - priv = set->ops->get(ctx->net, set, &elem, flags); - if (IS_ERR(priv)) - return PTR_ERR(priv); - - elem.priv = priv; + err = nft_setelem_get(ctx, set, &elem, flags); + if (err < 0) + return err; err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) - goto err1; + return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, elem.priv, + reset); if (err < 0) - goto err2; + goto err_fill_setelem; - err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); - /* This avoids a loop in nfnetlink. */ - if (err < 0) - goto err1; + return nfnetlink_unicast(skb, ctx->net, ctx->portid); - return 0; -err2: +err_fill_setelem: kfree_skb(skb); -err1: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? 
-ENOBUFS : err; + return err; } -/* called with rcu_read_lock held */ -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, + const struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[], + bool reset) { - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; + struct nft_table *table; struct nft_set *set; - struct nlattr *attr; - struct nft_ctx ctx; - int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); - if (IS_ERR(set)) + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } + + nft_ctx_init(&dump_ctx->ctx, net, skb, + info->nlh, family, table, NULL, nla); + dump_ctx->set = set; + dump_ctx->reset = reset; + return 0; +} - if (nlh->nlmsg_flags & NLM_F_DUMP) { +/* called with rcu_read_lock held */ +static int nf_tables_getsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + struct nlattr *attr; + int rem, err = 0; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; - struct nft_set_dump_ctx dump_ctx = { - .set = set, - .ctx = ctx, + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + c.data = &dump_ctx; + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); + break; + } + } + + return err; +} + +static int nf_tables_getsetelem_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + int rem, err = 0, nelems = 0; + struct nlattr *attr; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_set_start, + .dump = nf_tables_dumpreset_set, + .done = nf_tables_dump_set_done, + .module = THIS_MODULE, }; + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + return err; + c.data = &dump_ctx; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + 
rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + rcu_read_lock(); + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + goto out_unlock; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); - if (err < 0) + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); break; + } + nelems++; } + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); + +out_unlock: + rcu_read_unlock(); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); return err; } static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, - const struct nft_set_elem *elem, - int event, u16 flags) + const struct nft_elem_priv *elem_priv, + int event) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -4819,31 +6653,38 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem); + set, elem_priv, false); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, - GFP_KERNEL); + nft_net = nft_pernet(net); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } -static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { + struct nft_trans_elem *te; struct nft_trans *trans; - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; - nft_trans_elem_set(trans) = set; + te = nft_trans_container_elem(trans); + te->nelems = 1; + te->set = set; + return trans; } @@ -4859,9 +6700,6 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, return expr; err = -EOPNOTSUPP; - if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL)) - goto err_set_elem_expr; - if (expr->ops->type->flags & NFT_EXPR_GC) { if (set->flags & NFT_SET_TIMEOUT) goto err_set_elem_expr; @@ -4877,39 +6715,77 @@ err_set_elem_expr: return ERR_PTR(err); } -void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, - const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) +static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) +{ + len += nft_set_ext_types[id].len; + if (len > tmpl->ext_len[id] || + len > U8_MAX) + return -1; + + return 0; +} + +static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, + void *to, const void *from, u32 len) +{ + if (nft_set_ext_check(tmpl, id, len) < 0) + return -1; + + memcpy(to, from, len); + + return 0; +} + +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); if (elem == NULL) - return NULL; + return 
ERR_PTR(-ENOMEM); ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); - memcpy(nft_set_ext_key(ext), key, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) - memcpy(nft_set_ext_key_end(ext), key_end, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - memcpy(nft_set_ext_data(ext), data, set->dlen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY, + nft_set_ext_key(ext), key, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END, + nft_set_ext_key_end(ext), key_end, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA, + nft_set_ext_data(ext), data, set->dlen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + nft_set_ext_timeout(ext)->timeout = timeout; + if (expiration == 0) - *nft_set_ext_expiration(ext) += timeout; + expiration = timeout; + + nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration; } - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) - *nft_set_ext_timeout(ext) = timeout; return elem; + +err_ext_check: + kfree(elem); + + return ERR_PTR(-EINVAL); } -static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, - struct nft_expr *expr) +static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) { if (expr->ops->destroy_clone) { expr->ops->destroy_clone(ctx, expr); @@ -4919,77 +6795,459 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } } -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_set_elem_expr *elem_expr) +{ + struct nft_expr *expr; + u32 size; + + nft_setelem_expr_foreach(expr, elem_expr, size) + __nft_set_elem_expr_destroy(ctx, expr); +} + +/* Drop references and destroy. Called from gc, dynset and abort path. */ +static void __nft_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_release(nft_set_ext_data(ext), set->dtype); + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + nft_use_dec(&(*nft_set_ext_obj(ext))->use); + + kfree(elem_priv); +} + +/* Drop references and destroy. Called from gc and dynset. 
*/ +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); struct nft_ctx ctx = { .net = read_pnet(&set->net), .family = set->table->family, }; - nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); - - if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; - kfree(elem); + __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_set_elem_deactivate() already deals with - * the refcounting from the preparation phase. +/* Drop references and destroy. Called from abort path. */ +static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + /* skip update request, see nft_trans_elems_new_abort() */ + if (!te->elems[i].priv) + continue; + + __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); + } +} + +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); - kfree(elem); + kfree(elem_priv); +} + +static void nft_trans_elems_destroy(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) + nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); +} + +int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_expr *expr_array[]) +{ + struct nft_expr *expr; + int err, i, k; + + for (i = 0; i < set->num_exprs; i++) { + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT); + if (!expr) + goto err_expr; + + err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); + if (err < 0) { + kfree(expr); + goto err_expr; + } + expr_array[i] = expr; + } + + return 0; + +err_expr: + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); + + return -ENOMEM; +} + +static int nft_set_elem_expr_setup(struct nft_ctx *ctx, + const struct nft_set_ext_tmpl *tmpl, + const struct nft_set_ext *ext, + struct nft_expr *expr_array[], + u32 num_exprs) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + u32 len = sizeof(struct nft_set_elem_expr); + struct nft_expr *expr; + int i, err; + + if (num_exprs == 0) + return 0; + + for (i = 0; i < num_exprs; i++) + len += expr_array[i]->ops->size; + + if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) + return -EINVAL; + + for (i = 0; i < num_exprs; i++) { + expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); + if (err < 0) + goto err_elem_expr_setup; + + elem_expr->size += expr_array[i]->ops->size; + nft_expr_destroy(ctx, expr_array[i]); + expr_array[i] = 
NULL; + } + + return 0; + +err_elem_expr_setup: + for (; i < num_exprs; i++) { + nft_expr_destroy(ctx, expr_array[i]); + expr_array[i] = NULL; + } + + return -ENOMEM; +} + +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask) && + !nft_set_elem_expired(ext) && + !nft_set_elem_is_dead(ext)) + return ext; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); + +static int nft_setelem_catchall_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_elem_priv **priv) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_next(net); + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask)) { + *priv = catchall->elem; + return -EEXIST; + } + } + + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); + if (!catchall) + return -ENOMEM; + + catchall->elem = elem->priv; + list_add_tail_rcu(&catchall->list, &set->catchall_list); + + return 0; +} + +static int nft_setelem_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_elem_priv **elem_priv, + unsigned int flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_insert(net, set, elem, elem_priv); + else + ret = set->ops->insert(net, set, elem, elem_priv); + + return ret; +} + +static bool nft_setelem_is_catchall(const struct nft_set *set, + const struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) + return true; + + return false; +} + +static void nft_setelem_activate(struct net *net, struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (nft_setelem_is_catchall(set, elem_priv)) { + nft_clear(net, ext); + } else { + set->ops->activate(net, set, elem_priv); + } +} + +static void nft_trans_elem_update(const struct nft_set *set, + const struct nft_trans_one_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_elem_update *update = elem->update; + + if (update->flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); + + if (update->flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); +} + +static void nft_trans_elems_add(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + struct nft_trans_one_elem *elem = &te->elems[i]; + + if (elem->update) + nft_trans_elem_update(te->set, elem); + else + nft_setelem_activate(ctx->net, te->set, elem->priv); + + nf_tables_setelem_notify(ctx, te->set, elem->priv, + NFT_MSG_NEWSETELEM); + kfree(elem->update); + } +} + +static int nft_setelem_catchall_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, 
catchall->elem); + if (!nft_is_active_next(net, ext)) + continue; + + kfree(elem->priv); + elem->priv = catchall->elem; + nft_set_elem_change_active(net, set, ext); + return 0; + } + + return -ENOENT; +} + +static int __nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + void *priv; + + priv = set->ops->deactivate(net, set, elem); + if (!priv) + return -ENOENT; + + kfree(elem->priv); + elem->priv = priv; + set->ndeact++; + + return 0; +} + +static int nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_deactivate(net, set, elem); + else + ret = __nft_setelem_deactivate(net, set, elem); + + return ret; +} + +static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) +{ + list_del_rcu(&catchall->list); + kfree_rcu(catchall, rcu); +} + +static void nft_setelem_catchall_remove(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_elem_catchall *catchall, *next; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + if (catchall->elem == elem_priv) { + nft_setelem_catchall_destroy(catchall); + break; + } + } +} + +static void nft_setelem_remove(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + if (nft_setelem_is_catchall(set, elem_priv)) + nft_setelem_catchall_remove(net, set, elem_priv); + else + set->ops->remove(net, set, elem_priv); +} + +static void nft_trans_elems_remove(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + WARN_ON_ONCE(te->elems[i].update); + + nf_tables_setelem_notify(ctx, te->set, + te->elems[i].priv, + te->nft_trans.msg_type); + + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } + } +} + +static bool nft_setelem_valid_key_end(const struct nft_set *set, + struct nlattr **nla, u32 flags) +{ + if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) == + (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { + if (flags & NFT_SET_ELEM_INTERVAL_END) + return false; + + if (nla[NFTA_SET_ELEM_KEY_END] && + flags & NFT_SET_ELEM_CATCHALL) + return false; + } else { + if (nla[NFTA_SET_ELEM_KEY_END]) + return false; + } + + return true; +} + +static u32 nft_set_maxsize(const struct nft_set *set) +{ + u32 maxsize, delta; + + if (!set->size) + return UINT_MAX; + + if (set->ops->adjust_maxsize) + delta = set->ops->adjust_maxsize(set); + else + delta = 0; + + if (check_add_overflow(set->size, set->ndeact, &maxsize)) + return UINT_MAX; + + if (check_add_overflow(maxsize, delta, &maxsize)) + return UINT_MAX; + + return maxsize; } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { + struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); + u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_elem_priv *elem_priv; struct nft_object *obj = NULL; - struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u32 flags = 0; - u64 timeout; u64 expiration; + u64 
timeout; + int err, i; u8 ulen; - int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) - return -EINVAL; - nft_set_ext_prepare(&tmpl); err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + + if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || + (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) + return -EINVAL; + + if (flags != 0) { + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + if (err < 0) + return err; + } if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && @@ -5000,13 +7258,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return -EINVAL; } + if (set->flags & NFT_SET_OBJECT) { + if (!nla[NFTA_SET_ELEM_OBJREF] && + !(flags & NFT_SET_ELEM_INTERVAL_END)) + return -EINVAL; + } else { + if (nla[NFTA_SET_ELEM_OBJREF]) + return -EINVAL; + } + + if (!nft_setelem_valid_key_end(set, nla, flags)) + return -EINVAL; + if ((flags & NFT_SET_ELEM_INTERVAL_END) && (nla[NFTA_SET_ELEM_DATA] || nla[NFTA_SET_ELEM_OBJREF] || nla[NFTA_SET_ELEM_TIMEOUT] || nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_USERDATA] || - nla[NFTA_SET_ELEM_EXPR])) + nla[NFTA_SET_ELEM_EXPR] || + nla[NFTA_SET_ELEM_KEY_END] || + nla[NFTA_SET_ELEM_EXPRESSIONS])) return -EINVAL; timeout = 0; @@ -5017,7 +7289,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &timeout); if (err) return err; - } else if (set->flags & NFT_SET_TIMEOUT) { + } else if (set->flags & NFT_SET_TIMEOUT && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } @@ -5025,37 +7298,89 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (timeout == 0) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], &expiration); if (err) return err; + + if (expiration > timeout) + return -ERANGE; } - if (nla[NFTA_SET_ELEM_EXPR] != NULL) { + if (nla[NFTA_SET_ELEM_EXPR]) { + struct nft_expr *expr; + + if (set->num_exprs && set->num_exprs != 1) + return -EOPNOTSUPP; + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); - err = -EOPNOTSUPP; - if (set->expr && set->expr->ops != expr->ops) + expr_array[0] = expr; + num_exprs = 1; + + if (set->num_exprs && set->exprs[0]->ops != expr->ops) { + err = -EOPNOTSUPP; goto err_set_elem_expr; - } else if (set->expr) { - expr = kzalloc(set->expr->ops->size, GFP_KERNEL); - if (!expr) - return -ENOMEM; + } + } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) { + struct nft_expr *expr; + struct nlattr *tmp; + int left; + + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX || + (set->num_exprs && set->num_exprs == i)) { + err = -E2BIG; + goto err_set_elem_expr; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_elem_expr; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_elem_expr; + } + expr_array[i] = expr; + num_exprs++; - err = nft_expr_clone(expr, set->expr); - if (err < 0) + if (set->num_exprs && expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_set_elem_expr; + } + i++; + } + if (set->num_exprs && set->num_exprs != i) { + err = -EOPNOTSUPP; goto 
err_set_elem_expr; + } + } else if (set->num_exprs > 0 && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { + err = nft_set_elem_expr_clone(ctx, set, expr_array); + if (err < 0) + goto err_set_elem_expr_clone; + + num_exprs = set->num_exprs; } - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - goto err_set_elem_expr; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err_set_elem_expr; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + if (err < 0) + goto err_parse_key; + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5063,32 +7388,46 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_parse_key; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + if (err < 0) + goto err_parse_key_end; } - if (timeout > 0) { - nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); - if (timeout != set->timeout) - nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); + if (set->flags & NFT_SET_TIMEOUT) { + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); + if (err < 0) + goto err_parse_key_end; } - if (expr) - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR, - expr->ops->size); + if (num_exprs) { + for (i = 0; i < num_exprs; i++) + size += expr_array[i]->ops->size; - if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { - if (!(set->flags & NFT_SET_OBJECT)) { - err = -EINVAL; + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + size); + if (err < 0) goto err_parse_key_end; - } + } + + if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); + obj = NULL; goto err_parse_key_end; } - nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); + + if (!nft_use_inc(&obj->use)) { + err = -EMFILE; + obj = NULL; + goto err_parse_key_end; + } + + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); + if (err < 0) + goto err_parse_key_end; } if (nla[NFTA_SET_ELEM_DATA] != NULL) { @@ -5118,11 +7457,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type == NFT_DATA_VERDICT && (elem.data.val.verdict.code == NFT_GOTO || elem.data.val.verdict.code == NFT_JUMP)) - nft_validate_state_update(ctx->net, + nft_validate_state_update(ctx->table, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); + if (err < 0) + goto err_parse_data; } /* The full maximum length of userdata can exceed the maximum @@ -5132,51 +7473,59 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ulen = 0; if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); - if (ulen > 0) - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, - ulen); + if (ulen > 0) { + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, + ulen); + if (err < 0) + goto err_parse_data; + } } - err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, elem.data.val.data, - timeout, expiration, GFP_KERNEL); - if (elem.priv == NULL) + timeout, expiration, GFP_KERNEL_ACCOUNT); + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); goto err_parse_data; + } ext = 
nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; + + if (obj) + *nft_set_ext_obj(ext) = obj; + if (ulen > 0) { + if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { + err = -EINVAL; + goto err_elem_free; + } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { - *nft_set_ext_obj(ext) = obj; - obj->use++; - } - if (expr) { - memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); - kfree(expr); - expr = NULL; - } + err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); + if (err < 0) + goto err_elem_free; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); - if (trans == NULL) - goto err_trans; + if (trans == NULL) { + err = -ENOMEM; + goto err_elem_free; + } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem, &ext2); + ext->genmask = nft_genmask_cur(ctx->net); + + err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags); if (err) { if (err == -EEXIST) { + ext2 = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ - nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { - err = -EBUSY; + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) goto err_element_clash; - } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), @@ -5184,9 +7533,41 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) - err = -EBUSY; - else if (!(nlmsg_flags & NLM_F_EXCL)) + goto err_element_clash; + else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; + if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { + struct nft_elem_update update = { }; + + if (timeout != nft_set_ext_timeout(ext2)->timeout) { + update.timeout = timeout; + if (expiration == 0) + expiration = timeout; + + update.flags |= NFT_TRANS_UPD_TIMEOUT; + } + if (expiration) { + update.expiration = expiration; + update.flags |= NFT_TRANS_UPD_EXPIRATION; + } + + if (update.flags) { + struct nft_trans_one_elem *ue; + + ue = &nft_trans_container_elem(trans)->elems[0]; + + ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); + if (!ue->update) { + err = -ENOMEM; + goto err_element_clash; + } + + ue->priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans); + goto err_elem_free; + } + } + } } else if (err == -ENOTEMPTY) { /* ENOTEMPTY reports overlapping between this element * and an existing one. 
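
Throughout this change, bare use++/use-- reference bookkeeping on tables, chains, sets and objects is replaced by nft_use_inc()/nft_use_dec() and their *_restore() variants, with callers (such as the object reference taken above) returning -EMFILE once a counter can no longer be raised. As an editor's note, the helpers are presumably defined along the lines of the sketch below in include/net/netfilter/nf_tables.h; the exact upstream definitions may differ.

/*
 * Editor's sketch (assumed shape, not copied from this diff): overflow-safe
 * use counters.  nft_use_inc() refuses the increment that would wrap the
 * u32 back to zero, which is why the callers above translate a failed
 * increment into -EMFILE.  The *_restore() variants are for error/abort
 * paths where the counter is known to have been valid beforehand.
 */
static inline bool nft_use_inc(u32 *use)
{
	return ++*use != 0;		/* false once the counter would wrap */
}

static inline void nft_use_dec(u32 *use)
{
	WARN_ON_ONCE((*use)-- == 0);	/* underflow would indicate a bug */
}

static inline void nft_use_inc_restore(u32 *use)
{
	WARN_ON_ONCE(!nft_use_inc(use));
}

#define nft_use_dec_restore	nft_use_dec
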
@@ -5196,46 +7577,52 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (set->size && - !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { - err = -ENFILE; - goto err_set_full; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = nft_set_maxsize(set); + + if (!atomic_add_unless(&set->nelems, 1, max)) { + err = -ENFILE; + goto err_set_full; + } } - nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; err_set_full: - set->ops->remove(ctx->net, set, &elem); + nft_setelem_remove(ctx->net, set, elem.priv); err_element_clash: kfree(trans); -err_trans: - if (obj) - obj->use--; - +err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); err_parse_key_end: + if (obj) + nft_use_dec_restore(&obj->use); + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: - if (expr != NULL) - nft_expr_destroy(ctx, expr); - + for (i = 0; i < num_exprs && expr_array[i]; i++) + nft_expr_destroy(ctx, expr_array[i]); +err_set_elem_expr_clone: return err; } -static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_next(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; @@ -5243,27 +7630,36 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); - if (IS_ERR(set)) + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); - if (err < 0) + err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); return err; + } } - if (net->nft.validate_state == NFT_VALIDATE_DO) - return nft_table_validate(net, ctx.table); + if (table->validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, table); return 0; } @@ -5281,38 +7677,100 @@ static int 
nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + nft_use_inc_restore(&chain->use); break; } } } -static void nft_set_elem_activate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use++; + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_set_elem_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); +} + +/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. + * No notifications and no ndeact changes. + * + * Returns true if set had been added to (i.e., elements need to be removed again). + */ +static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + bool removed = false; + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update) { + kfree(te->elems[i].update); + te->elems[i].update = NULL; + /* Update request, so do not release this element */ + te->elems[i].priv = NULL; + continue; + } + + if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + atomic_dec(&te->set->nelems); + + removed = true; + } + + return removed; +} + +/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. 
*/ +static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { + nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); + nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); + } + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + te->set->ndeact--; + } } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, @@ -5324,7 +7782,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_ext *ext; struct nft_trans *trans; u32 flags = 0; - void *priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -5332,39 +7789,54 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + + if (!nft_setelem_valid_key_end(set, nla, flags)) return -EINVAL; nft_set_ext_prepare(&tmpl); - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); - if (err < 0) - return err; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + if (flags != 0) { + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + if (err < 0) + return err; + } - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + if (err < 0) + goto fail_elem; + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) - return err; + goto fail_elem; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + if (err < 0) + goto fail_elem_key_end; } err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, NULL, 0, 0, - GFP_KERNEL); - if (elem.priv == NULL) - goto fail_elem; + GFP_KERNEL_ACCOUNT); + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); + goto fail_elem_key_end; + } ext = nft_set_elem_ext(set, elem.priv); if (flags) @@ -5374,122 +7846,161 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (trans == NULL) goto fail_trans; - priv = set->ops->deactivate(ctx->net, set, &elem); - if (priv == NULL) { - err = -ENOENT; + err = nft_setelem_deactivate(ctx->net, set, &elem, flags); + if (err < 0) goto fail_ops; - } - kfree(elem.priv); - elem.priv = priv; - nft_set_elem_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, elem.priv); - nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; fail_ops: kfree(trans); fail_trans: kfree(elem.priv); +fail_elem_key_end: + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); fail_elem: nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } -static int nft_flush_set(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter 
*iter, - struct nft_set_elem *elem) +static int nft_setelem_flush(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; - int err; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_ATOMIC); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM, + struct_size_t(struct nft_trans_elem, elems, 1)); if (!trans) return -ENOMEM; - if (!set->ops->flush(ctx->net, set, elem->priv)) { - err = -ENOENT; - goto err1; - } + set->ops->flush(ctx->net, set, elem_priv); set->ndeact++; - nft_set_elem_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem(trans) = *elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_container_elem(trans)->nelems = 1; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; -err1: - kfree(trans); - return err; } -static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int __nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_elem_priv *elem_priv) { - u8 genmask = nft_genmask_next(net); + struct nft_trans *trans; + + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (!trans) + return -ENOMEM; + + nft_setelem_data_deactivate(ctx->net, set, elem_priv); + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans); + + return 0; +} + +static int nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + ret = __nft_set_catchall_flush(ctx, set, catchall->elem); + if (ret < 0) + break; + nft_set_elem_change_active(ctx->net, set, ext); + } + + return ret; +} + +static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) +{ + struct nft_set_iter iter = { + .genmask = genmask, + .type = NFT_ITER_UPDATE, + .fn = nft_setelem_flush, + }; + + set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_flush(ctx, set); + + return iter.err; +} + +static int nf_tables_delsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); - if 
(IS_ERR(set)) + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + } + + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; - if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { - struct nft_set_iter iter = { - .genmask = genmask, - .fn = nft_flush_set, - }; - set->ops->walk(&ctx, set, &iter); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - return iter.err; - } + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return nft_set_flush(&ctx, set, genmask); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); - if (err < 0) - break; + if (err == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) + continue; - set->ndeact++; + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); + return err; + } } - return err; -} - -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; + return 0; } /* @@ -5498,7 +8009,7 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, /** * nft_register_obj- register nf_tables stateful object type - * @obj: object type + * @obj_type: object type * * Registers the object type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. @@ -5517,7 +8028,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj); /** * nft_unregister_obj - unregister nf_tables object type - * @obj: object type + * @obj_type: object type * * Unregisters the object type for use with nf_tables. 
*/ @@ -5539,7 +8050,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, struct rhlist_head *tmp, *list; struct nft_object *obj; - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && @@ -5586,6 +8097,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, + [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -5621,7 +8134,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, } err = -ENOMEM; - obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL); + obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT); if (!obj) goto err2; @@ -5658,11 +8171,15 @@ nla_put_failure: return -1; } -static const struct nft_object_type *__nft_obj_type_get(u32 objtype) +static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { + if (type->family != NFPROTO_UNSPEC && + type->family != family) + continue; + if (objtype == type->type) return type; } @@ -5670,13 +8187,17 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype) } static const struct nft_object_type * -nft_obj_type_get(struct net *net, u32 objtype) +nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; - type = __nft_obj_type_get(objtype); - if (type != NULL && try_module_get(type->owner)) + rcu_read_lock(); + type = __nft_obj_type_get(objtype, family); + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -5695,12 +8216,13 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, { struct nft_object *newobj; struct nft_trans *trans; - int err; + int err = -ENOMEM; + /* caller must have obtained type->owner reference. 
*/ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) - return -ENOMEM; + goto err_trans; newobj = nft_obj_init(ctx, type, attr); if (IS_ERR(newobj)) { @@ -5711,24 +8233,25 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) = newobj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_free_trans: kfree(trans); +err_trans: + module_put(type->owner); return err; } -static int nf_tables_newobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct net *net = info->net; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -5740,7 +8263,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -5755,63 +8279,88 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - type = __nft_obj_type_get(objtype); - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + if (!obj->ops->update) + return 0; + + type = nft_obj_type_get(net, objtype, family); + if (WARN_ON_ONCE(IS_ERR(type))) + return PTR_ERR(type); + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + /* type->owner reference is put when transaction object is released. 
*/ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - type = nft_obj_type_get(net, objtype); - if (IS_ERR(type)) - return PTR_ERR(type); + if (!nft_use_inc(&table->use)) + return -EMFILE; + + type = nft_obj_type_get(net, objtype, family); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err_type; + } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err1; + goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); - obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); + obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT); if (!obj->key.name) { err = -ENOMEM; - goto err2; + goto err_strdup; + } + + if (nla[NFTA_OBJ_USERDATA]) { + obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); + if (obj->udata == NULL) + goto err_userdata; + + obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]); } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err3; + goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) - goto err4; + goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); - table->use++; + return 0; -err4: +err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; -err3: +err_trans: + kfree(obj->udata); +err_userdata: kfree(obj->key.name); -err2: +err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); -err1: +err_init: module_put(type->owner); +err_type: + nft_use_dec_restore(&table->use); + return err; } @@ -5820,28 +8369,35 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || - nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || - nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) || nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), NFTA_OBJ_PAD)) goto nla_put_failure; + if (event == NFT_MSG_DELOBJ || + event == NFT_MSG_DESTROYOBJ) { + nlmsg_end(skb, nlh); + return 0; + } + + if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) + goto nla_put_failure; + + if (obj->udata && + nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -5850,168 +8406,247 @@ nla_put_failure: return -1; } -struct nft_obj_filter { +static void audit_log_obj_reset(const struct nft_table *table, + unsigned int base_seq, unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); + kfree(buf); +} + +struct 
nft_obj_dump_ctx { + unsigned int s_idx; char *table; u32 type; + bool reset; }; static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_table *table; - unsigned int idx = 0, s_idx = cb->args[0]; - struct nft_obj_filter *filter = cb->data; + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; + const struct nft_table *table; + unsigned int entries = 0; struct nft_object *obj; - bool reset = false; - - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; + unsigned int idx = 0; + int rc = 0; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; + entries = 0; list_for_each_entry_rcu(obj, &table->objects, list) { if (!nft_is_active(net, obj)) goto cont; - if (idx < s_idx) + if (idx < ctx->s_idx) goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table && - strcmp(filter->table, table->name)) + if (ctx->table && strcmp(ctx->table, table->name)) goto cont; - if (filter && - filter->type != NFT_OBJECT_UNSPEC && - obj->ops->type->type != filter->type) + if (ctx->type != NFT_OBJECT_UNSPEC && + obj->ops->type->type != ctx->type) goto cont; - if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWOBJ, - NLM_F_MULTI | NLM_F_APPEND, - table->family, table, - obj, reset) < 0) - goto done; + rc = nf_tables_fill_obj_info(skb, net, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWOBJ, + NLM_F_MULTI | NLM_F_APPEND, + table->family, table, + obj, ctx->reset); + if (rc < 0) + break; + entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } + if (ctx->reset && entries) + audit_log_obj_reset(table, nft_base_seq(net), entries); + if (rc < 0) + break; } -done: rcu_read_unlock(); - cb->args[0] = idx; + ctx->s_idx = idx; return skb->len; } +static int nf_tables_dumpreset_obj(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_obj(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_obj_start(struct netlink_callback *cb) { + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; - struct nft_obj_filter *filter = NULL; - if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) { - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); - if (!filter) - return -ENOMEM; + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - if (nla[NFTA_OBJ_TABLE]) { - filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); - if (!filter->table) { - kfree(filter); - return -ENOMEM; - } - } - - if (nla[NFTA_OBJ_TYPE]) - filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + if (nla[NFTA_OBJ_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); + if (!ctx->table) + return -ENOMEM; } - cb->data = filter; + if (nla[NFTA_OBJ_TYPE]) + ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + return 0; } +static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) +{ + struct nft_obj_dump_ctx *ctx = (void 
*)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_obj_start(cb); +} + static int nf_tables_dump_obj_done(struct netlink_callback *cb) { - struct nft_obj_filter *filter = cb->data; + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; - if (filter) { - kfree(filter->table); - kfree(filter); - } + kfree(ctx->table); return 0; } -/* called with rcu_read_lock held */ -static int nf_tables_getobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +/* Caller must hold rcu read lock or transaction mutex */ +static struct sk_buff * +nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); - int family = nfmsg->nfgen_family; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; - bool reset = false; u32 objtype; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_obj_start, - .dump = nf_tables_dump_obj, - .done = nf_tables_dump_obj_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); - } - if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE]) - return -EINVAL; + return ERR_PTR(-EINVAL); - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); - return PTR_ERR(obj); + return ERR_CAST(obj); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; - - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; + return ERR_PTR(-ENOMEM); - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + err = nf_tables_fill_obj_info(skb2, net, portid, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); - if (err < 0) - goto err; + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: - kfree_skb(skb2); - return err; + return skb2; +} + +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct sk_buff *skb2; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_obj_start, + .dump = nf_tables_dump_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + skb2 = nf_tables_getobj_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + return nfnetlink_unicast(skb2, info->net, portid); +} + +static int nf_tables_getobj_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = 
NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dumpreset_obj_start, + .dump = nf_tables_dumpreset_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getobj_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_OBJ_TABLE]), + (char *)nla_data(nla[NFTA_OBJ_TABLE]), + nft_base_seq(net)); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); + kfree(buf); + + return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) @@ -6021,17 +8656,17 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) module_put(obj->ops->type->owner); kfree(obj->key.name); + kfree(obj->udata); kfree(obj); } -static int nf_tables_delobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; @@ -6042,7 +8677,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6058,6 +8694,10 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, } if (IS_ERR(obj)) { + if (PTR_ERR(obj) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } @@ -6066,15 +8706,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } -void nft_obj_notify(struct net *net, const struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) +static void +__nft_obj_notify(struct net *net, const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) { + struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; @@ -6086,25 +8728,47 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, if (skb == NULL) goto err; - err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, - table, obj, false); 
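/* Presumably the point of threading 'flags' through to this helper: the
 * object event now carries the NLM_F_CREATE/NLM_F_EXCL bits of the request
 * that triggered it, so netlink listeners can tell a newly created object
 * apart from an update of an existing one. */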
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event, + flags & (NLM_F_CREATE | NLM_F_EXCL), + family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + nft_notify_enqueue(skb, report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } + +void nft_obj_notify(struct net *net, const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) +{ + char *buf = kasprintf(gfp, "%s:%u", + table->name, nft_base_seq(net)); + + audit_log_nfcfg(buf, + family, + obj->handle, + event == NFT_MSG_NEWOBJ ? + AUDIT_NFT_OP_OBJ_REGISTER : + AUDIT_NFT_OP_OBJ_UNREGISTER, + gfp); + kfree(buf); + + __nft_obj_notify(net, table, obj, portid, seq, event, + flags, family, report, gfp); +} EXPORT_SYMBOL_GPL(nft_obj_notify); static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { - nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->family, ctx->report, GFP_KERNEL); + __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, + ctx->seq, event, ctx->flags, ctx->family, + ctx->report, GFP_KERNEL); } /* @@ -6136,12 +8800,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -6155,11 +8821,12 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - flowtable->use--; - /* fall through */ + nft_use_dec(&flowtable->use); + fallthrough; default: return; } @@ -6193,26 +8860,31 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX }; static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, - const struct nlattr *attr, + const struct nlattr * const nla[], struct nft_flowtable_hook *flowtable_hook, - struct nft_flowtable *flowtable, bool add) + struct nft_flowtable *flowtable, + struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; INIT_LIST_HEAD(&flowtable_hook->list); - err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, + nla[NFTA_FLOWTABLE_HOOK], nft_flowtable_hook_policy, NULL); if (err < 0) return err; if (add) { if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) - return -EINVAL; + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); + return -ENOENT; + } hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); if (hooknum != NF_NETDEV_INGRESS) @@ -6242,27 +8914,32 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, 
tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable_hook->list); + &flowtable_hook->list, + extack); if (err < 0) return err; } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; + } } return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -6274,9 +8951,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -6289,22 +8970,58 @@ nft_flowtable_type_get(struct net *net, u8 family) } /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, - struct nft_flowtable *flowtable, - struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + nf_unregister_net_hook(net, ops); + flowtable->data.type->setup(&flowtable->data, ops->dev, FLOW_BLOCK_UNBIND); } +static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, + struct list_head *hook_list, + bool release_netdev) +{ + struct nft_hook *hook, *next; + struct nf_hook_ops *ops; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); + if (release_netdev) { + list_del(&hook->list); + nft_netdev_hook_free_rcu(hook); + } + } +} + static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - struct nft_hook *hook; + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); +} - list_for_each_entry(hook, hook_list, list) - nf_unregister_net_hook(net, &hook->ops); +static int nft_register_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) +{ + int err; + + err = flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_BIND); + if (err < 0) + return err; + + err = nf_register_net_hook(net, ops); + if (!err) + return 0; + + flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_UNBIND); + return err; } static int nft_register_flowtable_net_hooks(struct net *net, @@ -6312,83 +9029,111 @@ static int nft_register_flowtable_net_hooks(struct net *net, struct list_head *hook_list, struct nft_flowtable *flowtable) { - struct nft_hook *hook, *hook2, *next; + struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; 
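/*
 * Hook registration for a flowtable, as implemented by the loop below:
 *   1. reject any device that is already bound to another active flowtable
 *      in the same table (-EEXIST);
 *   2. register each nf_hook_ops hanging off the hook, one per device,
 *      counting successful registrations in 'i';
 *   3. on failure, unregister only those 'i' ops again and release the
 *      remaining hook entries.
 */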
list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { - list_for_each_entry(hook2, &ft->hook_list, list) { - if (hook->ops.dev == hook2->ops.dev && - hook->ops.pf == hook2->ops.pf) { - err = -EBUSY; - goto err_unregister_net_hooks; - } + if (!nft_is_active_next(net, ft)) + continue; + + if (nft_hook_list_find(&ft->hook_list, hook)) { + err = -EEXIST; + goto err_unregister_net_hooks; } } - err = flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_BIND); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; } -static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +static void nft_hooks_destroy(struct list_head *hook_list) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; + struct nftables_pernet *nft_net; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; + u32 flags; int err; - err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, false); + err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, + extack, false); if (err < 0) return err; list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); + continue; } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWFLOWTABLE || + trans->table != ctx->table || + !nft_trans_flowtable_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) { + err = -EEXIST; + goto err_flowtable_update_hook; + } + } + } + + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flags & ~NFT_FLOWTABLE_MASK) { + err = -EOPNOTSUPP; + goto err_flowtable_update_hook; + } + if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^ + (flags & NFT_FLOWTABLE_HW_OFFLOAD)) { + err = -EOPNOTSUPP; + goto err_flowtable_update_hook; + } + } else { + flags = flowtable->data.flags; } err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, @@ -6404,41 +9149,44 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, goto err_flowtable_update_hook; } + nft_trans_flowtable_flags(trans) = flags; nft_trans_flowtable(trans) = flowtable; 
nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; } -static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; - struct nft_hook *hook, *next; + struct net *net = info->net; struct nft_table *table; + struct nft_trans *trans; struct nft_ctx ctx; int err; @@ -6448,13 +9196,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -6463,27 +9211,32 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - return nft_flowtable_update(&ctx, nlh, flowtable); + return nft_flowtable_update(&ctx, info->nlh, flowtable, extack); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); - if (!flowtable) - return -ENOMEM; + if (!nft_use_inc(&table->use)) + return -EMFILE; + + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); + if (!flowtable) { + err = -ENOMEM; + goto flowtable_alloc; + } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); INIT_LIST_HEAD(&flowtable->hook_list); - flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); + flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT); if (!flowtable->name) { err = -ENOMEM; goto err1; @@ -6498,8 +9251,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); - if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) + if 
(flowtable->data.flags & ~NFT_FLOWTABLE_MASK) { + err = -EOPNOTSUPP; goto err3; + } } write_pnet(&flowtable->data.net, net); @@ -6508,38 +9263,37 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err3; - err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, true); + err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, + extack, true); if (err < 0) - goto err4; + goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; + trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto err_flowtable_trans; + } + + /* This must be LAST to ensure no packets are walking over this flowtable. */ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); - if (err < 0) { - nft_flowtable_hooks_destroy(&flowtable->hook_list); - goto err4; - } - - err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; + goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); - table->use++; return 0; -err5: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - nft_unregister_flowtable_hook(net, flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } -err4: + +err_flowtable_hooks: + nft_trans_destroy(trans); +err_flowtable_trans: + nft_hooks_destroy(&flowtable->hook_list); +err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); @@ -6547,64 +9301,80 @@ err2: kfree(flowtable->name); err1: kfree(flowtable); +flowtable_alloc: + nft_use_dec_restore(&table->use); + return err; } +static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook) +{ + struct nft_hook *this, *next; + + list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { + list_del(&this->list); + nft_netdev_hook_free(this); + } +} + static int nft_delflowtable_hook(struct nft_ctx *ctx, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; - struct nft_hook *this, *next, *hook; + LIST_HEAD(flowtable_del_list); + struct nft_hook *this, *hook; struct nft_trans *trans; int err; - err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, false); + err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, + extack, false); if (err < 0) return err; - list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + list_for_each_entry(this, &flowtable_hook.list, list) { hook = nft_hook_list_find(&flowtable->hook_list, this); if (!hook) { err = -ENOENT; goto err_flowtable_del_hook; } - hook->inactive = true; - list_del(&this->list); - kfree(this); + list_move(&hook->list, &flowtable_del_list); } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, sizeof(struct nft_trans_flowtable)); - if (!trans) - return -ENOMEM; + if (!trans) { + err = -ENOMEM; + goto err_flowtable_del_hook; + } nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans)); + nft_flowtable_hook_release(&flowtable_hook); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + 
nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_del_hook: - list_for_each_entry(hook, &flowtable_hook.list, list) - hook->inactive = false; + list_splice(&flowtable_del_list, &flowtable->hook_list); + nft_flowtable_hook_release(&flowtable_hook); return err; } -static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - int family = nfmsg->nfgen_family; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -6615,7 +9385,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); @@ -6626,18 +9396,22 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (nla[NFTA_FLOWTABLE_HOOK]) - return nft_delflowtable_hook(&ctx, flowtable); + return nft_delflowtable_hook(&ctx, flowtable, extack); if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); @@ -6654,25 +9428,29 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, struct list_head *hook_list) { struct nlattr *nest, *nest_devs; - struct nfgenmsg *nfmsg; struct nft_hook *hook; struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || - nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), - NFTA_FLOWTABLE_PAD) || + NFTA_FLOWTABLE_PAD)) + goto nla_put_failure; + + if (!hook_list && + (event == NFT_MSG_DELFLOWTABLE || + event == NFT_MSG_DESTROYFLOWTABLE)) { + nlmsg_end(skb, nlh); + return 0; + } + + if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; @@ -6687,8 +9465,12 @@ 
static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - list_for_each_entry_rcu(hook, hook_list, list) { - if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) + if (!hook_list) + hook_list = &flowtable->hook_list; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6715,12 +9497,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; const struct nft_table *table; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = nft_pernet(net); + cb->seq = nft_base_seq(net); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -6741,8 +9525,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, - flowtable, - &flowtable->hook_list) < 0) + flowtable, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -6793,21 +9576,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); - int family = nfmsg->nfgen_family; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, @@ -6816,48 +9598,54 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); - if (IS_ERR(table)) + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); - if (IS_ERR(flowtable)) + if (IS_ERR(flowtable)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return PTR_ERR(flowtable); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, + info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable, &flowtable->hook_list); + flowtable, NULL); if (err < 0) - goto err; + goto err_fill_flowtable_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, 
NETLINK_CB(skb).portid); + +err_fill_flowtable_info: kfree_skb(skb2); return err; } static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, - struct list_head *hook_list, - int event) + struct list_head *hook_list, int event) { + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; + u16 flags = 0; int err; - if (ctx->report && + if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; @@ -6865,16 +9653,18 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, - ctx->seq, event, 0, + ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -6886,10 +9676,8 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); - kfree(hook); + nft_netdev_hook_free_rcu(hook); } kfree(flowtable->name); module_put(flowtable->data.type->owner); @@ -6900,20 +9688,15 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, + NFNETLINK_V0, nft_base_seq_be16(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -6926,44 +9709,132 @@ nla_put_failure: return -EMSGSIZE; } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, - struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable, bool changename) { + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = 
nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - /* flow_offload_netdev_event() cleans up entries for us. */ - nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_ops(dev_net(dev), + flowtable, ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; + ops->dev = dev; + if (nft_register_flowtable_ops(dev_net(dev), + flowtable, ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } break; } + return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); + struct nft_flowtable *flowtable; + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + if (nft_flowtable_event(event, dev, + flowtable, changename)) + return 1; + } + } + return 0; } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_flowtable *flowtable; - struct nft_table *table; + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; struct net *net; - if (event != NETDEV_UNREGISTER) - return 0; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; net = dev_net(dev); - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); + nft_net = nft_pernet(net); + mutex_lock(&nft_net->commit_mutex); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_flowtable_event(event, dev, false)) { + ret = NOTIFY_BAD; } - mutex_unlock(&net->nft.commit_mutex); - - return NOTIFY_DONE; +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_flowtable_notifier = { @@ -6977,7 +9848,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, struct sk_buff *skb2; int err; - if (nlmsg_report(nlh) && + if (!nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; @@ -7000,10 +9871,8 @@ err: -ENOBUFS); } -static int nf_tables_getgen(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { struct sk_buff *skb2; int err; @@ 
-7012,128 +9881,206 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, if (skb2 == NULL) return -ENOMEM; - err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq); + err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq); if (err < 0) - goto err; + goto err_fill_gen_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + +err_fill_gen_info: kfree_skb(skb2); return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call_batch = nf_tables_newtable, + .call = nf_tables_newtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { - .call_rcu = nf_tables_gettable, + .call = nf_tables_gettable, + .type = NFNL_CB_RCU, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call_batch = nf_tables_deltable, + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, + [NFT_MSG_DESTROYTABLE] = { + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call_batch = nf_tables_newchain, + .call = nf_tables_newchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { - .call_rcu = nf_tables_getchain, + .call = nf_tables_getchain, + .type = NFNL_CB_RCU, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call_batch = nf_tables_delchain, + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, + [NFT_MSG_DESTROYCHAIN] = { + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_NEWRULE] = { - .call_batch = nf_tables_newrule, + .call = nf_tables_newrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { - .call_rcu = nf_tables_getrule, + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, + [NFT_MSG_GETRULE_RESET] = { + .call = nf_tables_getrule_reset, + .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DELRULE] = { - .call_batch = nf_tables_delrule, + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, + [NFT_MSG_DESTROYRULE] = { + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call_batch = nf_tables_newset, + .call = nf_tables_newset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { - .call_rcu = nf_tables_getset, + .call = nf_tables_getset, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call_batch = nf_tables_delset, + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, + [NFT_MSG_DESTROYSET] = { + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call_batch = nf_tables_newsetelem, + .call = nf_tables_newsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, 
.policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { - .call_rcu = nf_tables_getsetelem, + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, + [NFT_MSG_GETSETELEM_RESET] = { + .call = nf_tables_getsetelem_reset, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call_batch = nf_tables_delsetelem, + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, + [NFT_MSG_DESTROYSETELEM] = { + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { - .call_rcu = nf_tables_getgen, + .call = nf_tables_getgen, + .type = NFNL_CB_RCU, }, [NFT_MSG_NEWOBJ] = { - .call_batch = nf_tables_newobj, + .call = nf_tables_newobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DELOBJ] = { - .call_batch = nf_tables_delobj, + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_DESTROYOBJ] = { + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj_reset, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_NEWFLOWTABLE] = { - .call_batch = nf_tables_newflowtable, + .call = nf_tables_newflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { - .call_rcu = nf_tables_getflowtable, + .call = nf_tables_getflowtable, + .type = NFNL_CB_RCU, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DELFLOWTABLE] = { - .call_batch = nf_tables_delflowtable, + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, + [NFT_MSG_DESTROYFLOWTABLE] = { + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, @@ -7141,20 +10088,23 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; - switch (net->nft.validate_state) { - case NFT_VALIDATE_SKIP: - break; - case NFT_VALIDATE_NEED: - nft_validate_state_update(net, NFT_VALIDATE_DO); - /* fall through */ - case NFT_VALIDATE_DO: - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { + switch (table->validate_state) { + case NFT_VALIDATE_SKIP: + continue; + case NFT_VALIDATE_NEED: + nft_validate_state_update(table, NFT_VALIDATE_DO); + fallthrough; + case NFT_VALIDATE_DO: if (nft_table_validate(net, table) < 0) return -EAGAIN; + + nft_validate_state_update(table, NFT_VALIDATE_SKIP); + break; } - break; } return 0; @@ -7167,51 +10117,53 @@ static int nf_tables_validate(struct net *net) * * We defer the drop policy until the transaction has been finalized. 
*/ -static void nft_chain_commit_drop_policy(struct nft_trans *trans) +static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_policy(trans) != NF_DROP) + if (trans->policy != NF_DROP) return; - if (!nft_is_base_chain(trans->ctx.chain)) + if (!nft_is_base_chain(trans->chain)) return; - basechain = nft_base_chain(trans->ctx.chain); + basechain = nft_base_chain(trans->chain); basechain->policy = NF_DROP; } -static void nft_chain_commit_update(struct nft_trans *trans) +static void nft_chain_commit_update(struct nft_trans_chain *trans) { + struct nft_table *table = trans->nft_trans_binding.nft_trans.table; struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)) { - rhltable_remove(&trans->ctx.table->chains_ht, - &trans->ctx.chain->rhlhead, + if (trans->name) { + rhltable_remove(&table->chains_ht, + &trans->chain->rhlhead, nft_chain_ht_params); - swap(trans->ctx.chain->name, nft_trans_chain_name(trans)); - rhltable_insert_key(&trans->ctx.table->chains_ht, - trans->ctx.chain->name, - &trans->ctx.chain->rhlhead, + swap(trans->chain->name, trans->name); + rhltable_insert_key(&table->chains_ht, + trans->chain->name, + &trans->chain->rhlhead, nft_chain_ht_params); } - if (!nft_is_base_chain(trans->ctx.chain)) + if (!nft_is_base_chain(trans->chain)) return; nft_chain_stats_replace(trans); - basechain = nft_base_chain(trans->ctx.chain); + basechain = nft_base_chain(trans->chain); - switch (nft_trans_chain_policy(trans)) { + switch (trans->policy) { case NF_DROP: case NF_ACCEPT: - basechain->policy = nft_trans_chain_policy(trans); + basechain->policy = trans->policy; break; } } -static void nft_obj_commit_update(struct nft_trans *trans) +static void nft_obj_commit_update(const struct nft_ctx *ctx, + struct nft_trans *trans) { struct nft_object *newobj; struct nft_object *obj; @@ -7219,60 +10171,76 @@ static void nft_obj_commit_update(struct nft_trans *trans) obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); - if (obj->ops->update) - obj->ops->update(obj, newobj); + if (WARN_ON_ONCE(!obj->ops->update)) + return; - kfree(newobj); + obj->ops->update(obj, newobj); + nft_obj_destroy(ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) { + struct nft_ctx ctx = { + .net = trans->net, + }; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_DELTABLE: - nf_tables_table_destroy(&trans->ctx); + case NFT_MSG_DESTROYTABLE: + nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: - nf_tables_chain_destroy(&trans->ctx); + case NFT_MSG_DESTROYCHAIN: + if (nft_trans_chain_update(trans)) + nft_hooks_destroy(&nft_trans_chain_hooks(trans)); + else + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + case NFT_MSG_DESTROYRULE: + nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: - nft_set_destroy(&trans->ctx, nft_trans_set(trans)); + case NFT_MSG_DESTROYSET: + nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: - nf_tables_set_elem_destroy(&trans->ctx, - nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + case NFT_MSG_DESTROYSETELEM: + nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: - nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); + case NFT_MSG_DESTROYOBJ: + 
nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) - nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } if (trans->put_net) - put_net(trans->ctx.net); + put_net(trans->net); kfree(trans); } static void nf_tables_trans_destroy_work(struct work_struct *w) { + struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work); struct nft_trans *trans, *next; LIST_HEAD(head); spin_lock(&nf_tables_destroy_list_lock); - list_splice_init(&nf_tables_destroy_list, &head); + list_splice_init(&nft_net->destroy_list, &head); spin_unlock(&nf_tables_destroy_list_lock); if (list_empty(&head)) @@ -7281,93 +10249,149 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } +void nf_tables_trans_destroy_flush_work(struct net *net) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + + flush_work(&nft_net->destroy_work); +} +EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); + +static bool nft_expr_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + return false; +} + static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { + const struct nft_expr *expr, *last; + struct nft_regs_track track = {}; + unsigned int size, data_size; + void *data, *data_boundary; + struct nft_rule_dp *prule; struct nft_rule *rule; - unsigned int alloc = 0; - int i; /* already handled or inactive chain? */ - if (chain->rules_next || !nft_is_active_next(net, chain)) + if (chain->blob_next || !nft_is_active_next(net, chain)) return 0; - rule = list_entry(&chain->rules, struct nft_rule, list); - i = 0; - - list_for_each_entry_continue(rule, &chain->rules, list) { - if (nft_is_active_next(net, rule)) - alloc++; + data_size = 0; + list_for_each_entry(rule, &chain->rules, list) { + if (nft_is_active_next(net, rule)) { + data_size += sizeof(*prule) + rule->dlen; + if (data_size > INT_MAX) + return -ENOMEM; + } } - chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc); - if (!chain->rules_next) + chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size); + if (!chain->blob_next) return -ENOMEM; - list_for_each_entry_continue(rule, &chain->rules, list) { - if (nft_is_active_next(net, rule)) - chain->rules_next[i++] = rule; + data = (void *)chain->blob_next->data; + data_boundary = data + data_size; + size = 0; + + list_for_each_entry(rule, &chain->rules, list) { + if (!nft_is_active_next(net, rule)) + continue; + + prule = (struct nft_rule_dp *)data; + data += offsetof(struct nft_rule_dp, data); + if (WARN_ON_ONCE(data > data_boundary)) + return -ENOMEM; + + size = 0; + track.last = nft_expr_last(rule); + nft_rule_for_each_expr(expr, last, rule) { + track.cur = expr; + + if (nft_expr_reduce(&track, expr)) { + expr = track.cur; + continue; + } + + if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary)) + return -ENOMEM; + + memcpy(data + size, expr, expr->ops->size); + size += expr->ops->size; + } + if (WARN_ON_ONCE(size >= 1 << 12)) + return -ENOMEM; + + prule->handle = rule->handle; + prule->dlen = size; + prule->is_last = 0; + + data += size; + size = 0; + chain->blob_next->size += (unsigned long)(data - (void *)prule); } - 
chain->rules_next[i] = NULL; + if (WARN_ON_ONCE(data > data_boundary)) + return -ENOMEM; + + prule = (struct nft_rule_dp *)data; + nft_last_rule(chain, prule); + return 0; } static void nf_tables_commit_chain_prepare_cancel(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { - struct nft_chain *chain = trans->ctx.chain; - + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { - kvfree(chain->rules_next); - chain->rules_next = NULL; + struct nft_chain *chain = nft_trans_rule_chain(trans); + + kvfree(chain->blob_next); + chain->blob_next = NULL; } } } -static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h) +static void __nf_tables_commit_chain_free_rules(struct rcu_head *h) { - struct nft_rules_old *o = container_of(h, struct nft_rules_old, h); + struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h); - kvfree(o->start); + kvfree(l->blob); } -static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) +static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob) { - struct nft_rule **r = rules; - struct nft_rules_old *old; - - while (*r) - r++; + struct nft_rule_dp_last *last; - r++; /* rcu_head is after end marker */ - old = (void *) r; - old->start = rules; + /* last rule trailer is after end marker */ + last = (void *)blob + sizeof(*blob) + blob->size; + last->blob = blob; - call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); + call_rcu(&last->h, __nf_tables_commit_chain_free_rules); } static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { - struct nft_rule **g0, **g1; + struct nft_rule_blob *g0, *g1; bool next_genbit; next_genbit = nft_gencursor_next(net); - g0 = rcu_dereference_protected(chain->rules_gen_0, + g0 = rcu_dereference_protected(chain->blob_gen_0, lockdep_commit_lock_is_held(net)); - g1 = rcu_dereference_protected(chain->rules_gen_1, + g1 = rcu_dereference_protected(chain->blob_gen_1, lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ - if (chain->rules_next == NULL) { + if (chain->blob_next == NULL) { /* chain had no change in last or next generation */ if (g0 == g1) return; @@ -7376,10 +10400,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) * one uses same rules as current generation. 
*/ if (next_genbit) { - rcu_assign_pointer(chain->rules_gen_1, g0); + rcu_assign_pointer(chain->blob_gen_1, g0); nf_tables_commit_chain_free_rules_old(g1); } else { - rcu_assign_pointer(chain->rules_gen_0, g1); + rcu_assign_pointer(chain->blob_gen_0, g1); nf_tables_commit_chain_free_rules_old(g0); } @@ -7387,11 +10411,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) } if (next_genbit) - rcu_assign_pointer(chain->rules_gen_1, chain->rules_next); + rcu_assign_pointer(chain->blob_gen_1, chain->blob_next); else - rcu_assign_pointer(chain->rules_gen_0, chain->rules_next); + rcu_assign_pointer(chain->blob_gen_0, chain->blob_next); - chain->rules_next = NULL; + chain->blob_next = NULL; if (g0 == g1) return; @@ -7408,7 +10432,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7417,23 +10441,250 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } -static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, - struct list_head *hook_list) +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) { - struct nft_hook *hook, *next; + struct nft_elem_priv **priv = trans->priv; + unsigned int i; - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - if (hook->inactive) - list_move(&hook->list, hook_list); + for (i = 0; i < trans->count; i++) { + nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]); + nft_setelem_remove(ctx->net, trans->set, priv[i]); } } +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_elem_priv *elem_priv; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem_priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, elem_priv)) + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = nft_pernet(trans->net); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. 
+ */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_gc_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + + refcount_inc(&set->refs); + trans->set = set; + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + struct nft_set *set; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + struct nft_set *set; + + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + if (nft_set_elem_is_dead(ext)) + goto dead_elem; + + nft_set_elem_dead(ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} + +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) +{ + struct nft_set_elem_catchall *catchall, *next; + u64 tstamp = nft_net_tstamp(gc->net); + const 
struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!__nft_set_elem_expired(ext, tstamp)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return NULL; + + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + nft_trans_gc_elem_add(gc, elem_priv); + } + + return gc; +} + static void nf_tables_module_autoload_cleanup(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); - list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); @@ -7442,6 +10693,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; /* all side effects have to be made visible. @@ -7451,67 +10703,253 @@ static void nf_tables_commit_release(struct net *net) * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. */ - if (list_empty(&net->nft.commit_list)) { + if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return; } - trans = list_last_entry(&net->nft.commit_list, + trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); - get_net(trans->ctx.net); + get_net(trans->net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); + schedule_work(&nft_net->destroy_work); + + mutex_unlock(&nft_net->commit_mutex); +} + +static void nft_commit_notify(struct net *net, u32 portid) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + struct sk_buff *batch_skb = NULL, *nskb, *skb; + unsigned char *data; + int len; - schedule_work(&trans_destroy_work); + list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) { + if (!batch_skb) { +new_batch: + batch_skb = skb; + len = NLMSG_GOODSIZE - skb->len; + list_del(&skb->list); + continue; + } + len -= skb->len; + if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) { + data = skb_put(batch_skb, skb->len); + memcpy(data, skb->data, skb->len); + list_del(&skb->list); + kfree_skb(skb); + continue; + } + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, + NFT_CB(batch_skb).report, GFP_KERNEL); + goto new_batch; + } + + if (batch_skb) { + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, + NFT_CB(batch_skb).report, GFP_KERNEL); + } + + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); +} + +static int nf_tables_commit_audit_alloc(struct list_head *adl, + struct nft_table *table) +{ + struct nft_audit_data *adp; + + list_for_each_entry(adp, adl, list) { + if (adp->table == table) + return 0; + } + adp = 
kzalloc(sizeof(*adp), GFP_KERNEL); + if (!adp) + return -ENOMEM; + adp->table = table; + list_add(&adp->list, adl); + return 0; +} + +static void nf_tables_commit_audit_free(struct list_head *adl) +{ + struct nft_audit_data *adp, *adn; + + list_for_each_entry_safe(adp, adn, adl, list) { + list_del(&adp->list); + kfree(adp); + } +} + +/* nft audit emits the number of elements that get added/removed/updated, + * so NEW/DELSETELEM needs to increment based on the total elem count. + */ +static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_container_elem(trans)->nelems; + } + + return 1; +} + +static void nf_tables_commit_audit_collect(struct list_head *adl, + const struct nft_trans *trans, u32 op) +{ + const struct nft_table *table = trans->table; + struct nft_audit_data *adp; + + list_for_each_entry(adp, adl, list) { + if (adp->table == table) + goto found; + } + WARN_ONCE(1, "table=%s not expected in commit list", table->name); + return; +found: + adp->entries += nf_tables_commit_audit_entrycount(trans); + if (!adp->op || adp->op > op) + adp->op = op; +} + +#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22) + +static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) +{ + struct nft_audit_data *adp, *adn; + char aubuf[AUNFTABLENAMELEN]; + + list_for_each_entry_safe(adp, adn, adl, list) { + snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name, + generation); + audit_log_nfcfg(aubuf, adp->table->family, adp->entries, + nft2audit_op[adp->op], GFP_KERNEL); + list_del(&adp->list); + kfree(adp); + } +} + +static void nft_set_commit_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->commit || set->dead) + continue; + + set->ops->commit(set); + } +} + +static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. 
*/ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); } static int nf_tables_commit(struct net *net, struct sk_buff *skb) { + struct nftables_pernet *nft_net = nft_pernet(net); + const struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct nft_trans_binding *trans_binding; struct nft_trans *trans, *next; + unsigned int base_seq, gc_seq; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + struct nft_ctx ctx; + LIST_HEAD(adl); int err; - if (list_empty(&net->nft.commit_list)) { - mutex_unlock(&net->nft.commit_mutex); + if (list_empty(&nft_net->commit_list)) { + mutex_unlock(&nft_net->commit_mutex); return 0; } + nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL); + + list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) { + trans = &trans_binding->nft_trans; + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; + default: + WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type); + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ - if (nf_tables_validate(net) < 0) + if (nf_tables_validate(net) < 0) { + nft_net->validate_state = NFT_VALIDATE_DO; return -EAGAIN; + } err = nft_flow_rule_offload_commit(net); if (err < 0) return err; /* 1. Allocate space for next generation rules_gen_X[] */ - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { + struct nft_table *table = trans->table; int ret; + ret = nf_tables_commit_audit_alloc(&adl, table); + if (ret) { + nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); + return ret; + } if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { - chain = trans->ctx.chain; + chain = nft_trans_rule_chain(trans); ret = nf_tables_commit_chain_prepare(net, chain); if (ret < 0) { nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); return ret; } } } /* step 2. Make rules_gen_X visible to packet path */ - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_commit_chain(net, chain); } @@ -7520,124 +10958,178 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - while (++net->nft.base_seq == 0); + base_seq = nft_base_seq(net); + while (++base_seq == 0) + ; + + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); + + gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. 
*/ net->nft.gencursor = nft_gencursor_next(net); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { + struct nft_table *table = trans->table; + + nft_ctx_update(&ctx, trans); + + nf_tables_commit_audit_collect(&adl, trans, trans->msg_type); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + if (!(table->flags & __NFT_TABLE_F_UPDATE)) { + nft_trans_destroy(trans); + break; } + if (table->flags & NFT_TABLE_F_DORMANT) + nf_tables_table_disable(net, table); + + table->flags &= ~__NFT_TABLE_F_UPDATE; } else { - nft_clear(net, trans->ctx.table); + nft_clear(net, table); } - nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: - list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + case NFT_MSG_DESTROYTABLE: + list_del_rcu(&table->list); + nf_tables_table_notify(&ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_chain_commit_update(trans); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_chain_commit_update(nft_trans_container_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, + &nft_trans_chain_hooks(trans)); + list_splice(&nft_trans_chain_hooks(trans), + &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { - nft_chain_commit_drop_policy(trans); - nft_clear(net, trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); + nft_clear(net, nft_trans_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: - nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - nf_tables_unregister_hook(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain); + case NFT_MSG_DESTROYCHAIN: + if (nft_trans_chain_update(trans)) { + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, + &nft_trans_chain_hooks(trans)); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } + } else { + nft_chain_del(nft_trans_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, + NULL); + nf_tables_unregister_hook(ctx.net, ctx.table, + nft_trans_chain(trans)); + } break; case NFT_MSG_NEWRULE: - nft_clear(trans->ctx.net, nft_trans_rule(trans)); - nf_tables_rule_notify(&trans->ctx, - nft_trans_rule(trans), + nft_clear(net, nft_trans_rule(trans)); + nf_tables_rule_notify(&ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); - nf_tables_rule_notify(&trans->ctx, - nft_trans_rule(trans), - NFT_MSG_DELRULE); - nft_rule_expr_deactivate(&trans->ctx, - nft_trans_rule(trans), + nf_tables_rule_notify(&ctx, nft_trans_rule(trans), + trans->msg_type); + nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); + + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) + 
nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: - nft_clear(net, nft_trans_set(trans)); - /* This avoids hitting -EBUSY when deleting the table - * from the transaction. - */ - if (nft_set_is_anonymous(nft_trans_set(trans)) && - !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + list_del(&nft_trans_container_set(trans)->list_trans_newset); + if (nft_trans_set_update(trans)) { + struct nft_set *set = nft_trans_set(trans); + + WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); + WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); - nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + if (nft_trans_set_size(trans)) + WRITE_ONCE(set->size, nft_trans_set_size(trans)); + } else { + nft_clear(net, nft_trans_set(trans)); + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_set_is_anonymous(nft_trans_set(trans)) && + !list_empty(&nft_trans_set(trans)->bindings)) + nft_use_dec(&table->use); + } + nf_tables_set_notify(&ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); - nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), - NFT_MSG_DELSET, GFP_KERNEL); + nf_tables_set_notify(&ctx, nft_trans_set(trans), + trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: - te = (struct nft_trans_elem *)trans->data; + te = nft_trans_container_elem(trans); + + nft_trans_elems_add(&ctx, te); - te->set->ops->activate(net, te->set, &te->elem); - nf_tables_setelem_notify(&trans->ctx, te->set, - &te->elem, - NFT_MSG_NEWSETELEM, 0); + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: - te = (struct nft_trans_elem *)trans->data; + case NFT_MSG_DESTROYSETELEM: + te = nft_trans_container_elem(trans); - nf_tables_setelem_notify(&trans->ctx, te->set, - &te->elem, - NFT_MSG_DELSETELEM, 0); - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); - te->set->ndeact--; + nft_trans_elems_remove(&ctx, te); + + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - nft_obj_commit_update(trans); - nf_tables_obj_notify(&trans->ctx, + nft_obj_commit_update(&ctx, trans); + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); } else { nft_clear(net, nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); nft_trans_destroy(trans); } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_DELOBJ); + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), + trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans)->data.flags = + nft_trans_flowtable_flags(trans); + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), NFT_MSG_NEWFLOWTABLE); @@ -7645,37 +11137,45 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable(trans)->hook_list); } else { nft_clear(net, nft_trans_flowtable(trans)); - 
nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, + NULL, NFT_MSG_NEWFLOWTABLE); } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - nft_flowtable_hooks_del(nft_trans_flowtable(trans), - &nft_trans_flowtable_hooks(trans)); - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); + NULL, + trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; } } + nft_set_commit_update(&set_update_list); + + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); + + nft_gc_seq_end(nft_net, gc_seq); + nft_net->validate_state = NFT_VALIDATE_SKIP; nf_tables_commit_release(net); return 0; @@ -7683,44 +11183,51 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; LIST_HEAD(module_list); - list_splice_init(&net->nft.module_list, &module_list); - mutex_unlock(&net->nft.commit_mutex); + list_splice_init(&nft_net->module_list, &module_list); + mutex_unlock(&nft_net->commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { request_module("%s", req->module); req->done = true; } - mutex_lock(&net->nft.commit_mutex); - list_splice(&module_list, &net->nft.module_list); + mutex_lock(&nft_net->commit_mutex); + list_splice(&module_list, &nft_net->module_list); } static void nf_tables_abort_release(struct nft_trans *trans) { + struct nft_ctx ctx = { }; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_NEWTABLE: - nf_tables_table_destroy(&trans->ctx); + nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: - nf_tables_chain_destroy(&trans->ctx); + if (nft_trans_chain_update(trans)) + nft_hooks_destroy(&nft_trans_chain_hooks(trans)); + else + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_NEWRULE: - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(&trans->ctx, nft_trans_set(trans)); + nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv, true); + nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_NEWOBJ: - nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); + nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) - nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; @@ -7728,73 +11235,145 @@ static void nf_tables_abort_release(struct 
nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net, bool autoload) +static void nft_set_abort_update(struct list_head *set_update_list) { + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->abort) + continue; + + set->ops->abort(set); + } +} + +static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) +{ + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; - struct nft_hook *hook; + struct nft_ctx ctx = { + .net = net, + }; + int err = 0; + + if (action == NFNL_ABORT_VALIDATE && + nf_tables_validate(net) < 0) + err = -EAGAIN; - list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { + struct nft_table *table = trans->table; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + if (!(table->flags & __NFT_TABLE_F_UPDATE)) { + nft_trans_destroy(trans); + break; } + if (table->flags & __NFT_TABLE_F_WAS_DORMANT) { + nf_tables_table_disable(net, table); + table->flags |= NFT_TABLE_F_DORMANT; + } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) { + table->flags &= ~NFT_TABLE_F_DORMANT; + } + if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) { + table->flags &= ~NFT_TABLE_F_OWNER; + table->nlpid = 0; + } + table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { - list_del_rcu(&trans->ctx.table->list); + list_del_rcu(&table->list); } break; case NFT_MSG_DELTABLE: - nft_clear(trans->ctx.net, trans->ctx.table); + case NFT_MSG_DESTROYTABLE: + nft_clear(trans->net, table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; - nft_chain_del(trans->ctx.chain); - nf_tables_unregister_hook(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain); + if (nft_trans_chain_bound(trans)) { + nft_trans_destroy(trans); + break; + } + nft_use_dec_restore(&table->use); + nft_chain_del(nft_trans_chain(trans)); + nf_tables_unregister_hook(trans->net, table, + nft_trans_chain(trans)); } break; case NFT_MSG_DELCHAIN: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, trans->ctx.chain); + case NFT_MSG_DESTROYCHAIN: + if (nft_trans_chain_update(trans)) { + list_splice(&nft_trans_chain_hooks(trans), + &nft_trans_basechain(trans)->hook_list); + } else { + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_chain(trans)); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: - trans->ctx.chain->use--; + if (nft_trans_rule_bound(trans)) { + nft_trans_destroy(trans); + break; + } + nft_use_dec_restore(&nft_trans_rule_chain(trans)->use); list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_expr_deactivate(&trans->ctx, + nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: - 
trans->ctx.chain->use++; - nft_clear(trans->ctx.net, nft_trans_rule(trans)); - nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); + case NFT_MSG_DESTROYRULE: + nft_use_inc_restore(&nft_trans_rule_chain(trans)->use); + nft_clear(trans->net, nft_trans_rule(trans)); + nft_rule_expr_activate(&ctx, nft_trans_rule(trans)); + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: - trans->ctx.table->use--; + list_del(&nft_trans_container_set(trans)->list_trans_newset); + if (nft_trans_set_update(trans)) { + nft_trans_destroy(trans); + break; + } + nft_use_dec_restore(&table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_set(trans)); + case NFT_MSG_DESTROYSET: + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&ctx, nft_trans_set(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -7802,96 +11381,125 @@ static int __nf_tables_abort(struct net *net, bool autoload) nft_trans_destroy(trans); break; } - te = (struct nft_trans_elem *)trans->data; - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); + te = nft_trans_container_elem(trans); + if (!nft_trans_elems_new_abort(&ctx, te)) { + nft_trans_destroy(trans); + break; + } + + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_DELSETELEM: - te = (struct nft_trans_elem *)trans->data; + case NFT_MSG_DESTROYSETELEM: + te = nft_trans_container_elem(trans); - nft_set_elem_activate(net, te->set, &te->elem); - te->set->ops->activate(net, te->set, &te->elem); - te->set->ndeact--; + nft_trans_elems_destroy_abort(&ctx, te); + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - kfree(nft_trans_obj_newobj(trans)); + nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_obj(trans)); + case NFT_MSG_DESTROYOBJ: + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) - hook->inactive = false; + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); } else { - trans->ctx.table->use++; - nft_clear(trans->ctx.net, 
nft_trans_flowtable(trans)); + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_flowtable(trans)); } nft_trans_destroy(trans); break; } } + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + + nft_set_abort_update(&set_update_list); + synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, - &net->nft.commit_list, list) { - list_del(&trans->list); + &nft_net->commit_list, list) { + nft_trans_list_del(trans); nf_tables_abort_release(trans); } - if (autoload) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - - return 0; + return err; } -static void nf_tables_cleanup(struct net *net) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action) { - nft_validate_state_update(net, NFT_VALIDATE_SKIP); -} + struct nftables_pernet *nft_net = nft_pernet(net); + unsigned int gc_seq; + int ret; -static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) -{ - int ret = __nf_tables_abort(net, autoload); + gc_seq = nft_gc_seq_begin(nft_net); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(nft_net, gc_seq); - mutex_unlock(&net->nft.commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + + mutex_unlock(&nft_net->commit_mutex); return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { + struct nftables_pernet *nft_net = nft_pernet(net); bool genid_ok; - mutex_lock(&net->nft.commit_mutex); + mutex_lock(&nft_net->commit_mutex); + nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || net->nft.base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); /* else, commit mutex has to be released by commit or abort function */ return genid_ok; @@ -7904,7 +11512,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, - .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, .owner = THIS_MODULE, }; @@ -7941,106 +11548,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); -/* - * Loop detection - walk through the ruleset beginning at the destination chain - * of a new jump until either the source chain is reached (loop) or all - * reachable chains have been traversed. - * - * The loop check is performed whenever a new jump verdict is added to an - * expression or verdict map or a verdict map is bound to a new chain. 
- */ - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain); - -static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - const struct nft_data *data; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - return nf_tables_check_loops(ctx, data->verdict.chain); - default: - return 0; - } -} - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain) -{ - const struct nft_rule *rule; - const struct nft_expr *expr, *last; - struct nft_set *set; - struct nft_set_binding *binding; - struct nft_set_iter iter; - - if (ctx->chain == chain) - return -ELOOP; - - list_for_each_entry(rule, &chain->rules, list) { - nft_rule_for_each_expr(expr, last, rule) { - struct nft_immediate_expr *priv; - const struct nft_data *data; - int err; - - if (strcmp(expr->ops->type->name, "immediate")) - continue; - - priv = nft_expr_priv(expr); - if (priv->dreg != NFT_REG_VERDICT) - continue; - - data = &priv->data; - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - err = nf_tables_check_loops(ctx, - data->verdict.chain); - if (err < 0) - return err; - default: - break; - } - } - } - - list_for_each_entry(set, &ctx->table->sets, list) { - if (!nft_is_active_next(ctx->net, set)) - continue; - if (!(set->flags & NFT_SET_MAP) || - set->dtype != NFT_DATA_VERDICT) - continue; - - list_for_each_entry(binding, &set->bindings, list) { - if (!(binding->flags & NFT_SET_MAP) || - binding->chain != chain) - continue; - - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_loop_check_setelem; - - set->ops->walk(ctx, set, &iter); - if (iter.err < 0) - return iter.err; - } - } - - return 0; -} - /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * @@ -8066,28 +11573,24 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) } EXPORT_SYMBOL_GPL(nft_parse_u32_check); -/** - * nft_parse_register - parse a register value from a netlink attribute - * - * @attr: netlink attribute - * - * Parse and translate a register value from a netlink attribute. - * Registers used to be 128 bit wide, these register numbers will be - * mapped to the corresponding 32 bit register numbers. 
- */ -unsigned int nft_parse_register(const struct nlattr *attr) +static int nft_parse_register(const struct nlattr *attr, u32 *preg) { unsigned int reg; reg = ntohl(nla_get_be32(attr)); switch (reg) { case NFT_REG_VERDICT...NFT_REG_4: - return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE; + break; + case NFT_REG32_00...NFT_REG32_15: + *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + break; default: - return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + return -ERANGE; } + + return 0; } -EXPORT_SYMBOL_GPL(nft_parse_register); /** * nft_dump_register - dump a register value to a netlink attribute @@ -8111,16 +11614,7 @@ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) } EXPORT_SYMBOL_GPL(nft_dump_register); -/** - * nft_validate_register_load - validate a load from a register - * - * @reg: the register number - * @len: the length of the data - * - * Validate that the input register is one of the general purpose - * registers and that the length of the load is within the bounds. - */ -int nft_validate_register_load(enum nft_registers reg, unsigned int len) +static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; @@ -8131,26 +11625,56 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -EXPORT_SYMBOL_GPL(nft_validate_register_load); -/** - * nft_validate_register_store - validate an expressions' register store - * - * @ctx: context of the expression performing the load - * @reg: the destination register number - * @data: the data to load - * @type: the data type - * @len: the length of the data - * - * Validate that a data load uses the appropriate data type for - * the destination register and the length is within the bounds. - * A value of NULL for the data means that its runtime gathered - * data. - */ -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len) +int nft_parse_register_load(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *sreg, u32 len) +{ + int err, invalid_reg; + u32 reg, next_register; + + err = nft_parse_register(attr, &reg); + if (err < 0) + return err; + + err = nft_validate_register_load(reg, len); + if (err < 0) + return err; + + next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg; + + /* Can't happen: nft_validate_register_load() should have failed */ + if (WARN_ON_ONCE(next_register > NFT_REG32_NUM)) + return -EINVAL; + + /* find first register that did not see an earlier store. */ + invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg); + + /* invalid register within the range that we're loading from?
*/ + if (invalid_reg < next_register) + return -ENODATA; + + *sreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_load); + +static void nft_saw_register_store(const struct nft_ctx *__ctx, + int reg, unsigned int len) +{ + unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE); + struct nft_ctx *ctx = (struct nft_ctx *)__ctx; + + if (WARN_ON_ONCE(len == 0 || reg < 0)) + return; + + bitmap_set(ctx->reg_inited, reg, registers); +} + +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len) { int err; @@ -8162,13 +11686,16 @@ int nft_validate_register_store(const struct nft_ctx *ctx, if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { - err = nf_tables_check_loops(ctx, data->verdict.chain); + err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } - return 0; + break; default: + if (type != NFT_DATA_VALUE) + return -EINVAL; + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) @@ -8177,17 +11704,39 @@ int nft_validate_register_store(const struct nft_ctx *ctx, sizeof_field(struct nft_regs, data)) return -ERANGE; - if (data != NULL && type != NFT_DATA_VALUE) - return -EINVAL; - return 0; + break; } + + nft_saw_register_store(ctx, reg, len); + return 0; +} + +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + int err; + u32 reg; + + err = nft_parse_register(attr, ®); + if (err < 0) + return err; + + err = nft_validate_register_store(ctx, reg, data, type, len); + if (err < 0) + return err; + + *dreg = reg; + return 0; } -EXPORT_SYMBOL_GPL(nft_validate_register_store); +EXPORT_SYMBOL_GPL(nft_parse_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, @@ -8205,50 +11754,68 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; + + /* zero padding hole for memcmp */ + memset(data, 0, sizeof(*data)); data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { - default: - switch (data->verdict.code & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_DROP: - case NF_QUEUE: - break; - default: - return -EINVAL; - } - /* fall through */ + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN_ID], + genmask); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; + if (nft_chain_is_bound(chain)) + return -EINVAL; + if (desc->flags & NFT_DATA_DESC_SETELEM && + chain->flags & NFT_CHAIN_BINDING) + return -EINVAL; + if 
(!nft_use_inc(&chain->use)) + return -EMFILE; - chain->use++; data->verdict.chain = chain; break; + default: + return -EINVAL; } desc->len = sizeof(data->verdict); - desc->type = NFT_DATA_VERDICT; + return 0; } static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + nft_use_dec(&chain->use); break; } } @@ -8279,20 +11846,25 @@ nla_put_failure: } static int nft_value_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, - struct nft_data_desc *desc, const struct nlattr *nla) + struct nft_data *data, struct nft_data_desc *desc, + const struct nlattr *nla) { unsigned int len; len = nla_len(nla); if (len == 0) return -EINVAL; - if (len > size) + if (len > desc->size) return -EOVERFLOW; + if (desc->len) { + if (len != desc->len) + return -EINVAL; + } else { + desc->len = len; + } nla_memcpy(data->data, nla, len); - desc->type = NFT_DATA_VALUE; - desc->len = len; + return 0; } @@ -8312,7 +11884,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * * @ctx: context of the expression using the data * @data: destination struct nft_data - * @size: maximum data length * @desc: data description * @nla: netlink attribute containing data * @@ -8322,24 +11893,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. */ -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; + if (WARN_ON_ONCE(!desc->size)) + return -EINVAL; + err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; - if (tb[NFTA_DATA_VALUE]) - return nft_value_init(ctx, data, size, desc, - tb[NFTA_DATA_VALUE]); - if (tb[NFTA_DATA_VERDICT] && ctx != NULL) - return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); - return -EINVAL; + if (tb[NFTA_DATA_VALUE]) { + if (desc->type != NFT_DATA_VALUE) + return -EINVAL; + + err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) { + if (desc->type != NFT_DATA_VERDICT) + return -EINVAL; + + err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + } else { + err = -EINVAL; + } + + return err; } EXPORT_SYMBOL_GPL(nft_data_init); @@ -8392,31 +11974,35 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) +static void __nft_release_hook(struct net *net, struct nft_table *table) { - struct nft_rule *rule, *nr; + struct nft_flowtable *flowtable; + struct nft_chain *chain; - if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; + list_for_each_entry(chain, &table->chains, list) + __nf_tables_unregister_hook(net, table, chain, true); + list_for_each_entry(flowtable, &table->flowtables, list) + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, + true); +} - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); - ctx->chain->use--; - nf_tables_rule_release(ctx, rule); - } - nft_chain_del(ctx->chain); - 
ctx->table->use--; - nf_tables_chain_destroy(ctx); +static void __nft_release_hooks(struct net *net) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + struct nft_table *table; - return 0; + list_for_each_entry(table, &nft_net->tables, list) { + if (nft_table_has_owner(table)) + continue; + + __nft_release_hook(net, table); + } } -EXPORT_SYMBOL_GPL(__nft_release_basechain); -static void __nft_release_tables(struct net *net) +static void __nft_release_table(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable, *nf; - struct nft_table *table, *nt; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_rule *rule, *nr; @@ -8426,123 +12012,249 @@ static void __nft_release_tables(struct net *net) .family = NFPROTO_NETDEV, }; - list_for_each_entry_safe(table, nt, &net->nft.tables, list) { - ctx.family = table->family; + ctx.family = table->family; + ctx.table = table; + list_for_each_entry(chain, &table->chains, list) { + if (nft_chain_binding(chain)) + continue; - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - /* No packets are walking on these chains anymore. */ - ctx.table = table; - list_for_each_entry(chain, &table->chains, list) { - ctx.chain = chain; - list_for_each_entry_safe(rule, nr, &chain->rules, list) { - list_del(&rule->list); - chain->use--; - nf_tables_rule_release(&ctx, rule); - } - } - list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { - list_del(&flowtable->list); - table->use--; - nf_tables_flowtable_destroy(flowtable); - } - list_for_each_entry_safe(set, ns, &table->sets, list) { - list_del(&set->list); - table->use--; - nft_set_destroy(&ctx, set); - } - list_for_each_entry_safe(obj, ne, &table->objects, list) { - nft_obj_del(obj); - table->use--; - nft_obj_destroy(&ctx, obj); - } - list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; - nft_chain_del(chain); - table->use--; - nf_tables_chain_destroy(&ctx); + ctx.chain = chain; + list_for_each_entry_safe(rule, nr, &chain->rules, list) { + list_del(&rule->list); + nft_use_dec(&chain->use); + nf_tables_rule_release(&ctx, rule); } + } + list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { + list_del(&flowtable->list); + nft_use_dec(&table->use); + nf_tables_flowtable_destroy(flowtable); + } + list_for_each_entry_safe(set, ns, &table->sets, list) { + list_del(&set->list); + nft_use_dec(&table->use); + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + + nft_set_destroy(&ctx, set); + } + list_for_each_entry_safe(obj, ne, &table->objects, list) { + nft_obj_del(obj); + nft_use_dec(&table->use); + nft_obj_destroy(&ctx, obj); + } + list_for_each_entry_safe(chain, nc, &table->chains, list) { + nft_chain_del(chain); + nft_use_dec(&table->use); + nf_tables_chain_destroy(chain); + } + nf_tables_table_destroy(table); +} + +static void __nft_release_tables(struct net *net) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + struct nft_table *table, *nt; + + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { + if (nft_table_has_owner(table)) + continue; + list_del(&table->list); - nf_tables_table_destroy(&ctx); + + __nft_release_table(net, table); + } +} + +static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct nft_table *table, *to_delete[8]; + struct nftables_pernet *nft_net; + struct netlink_notify *n = ptr; + struct net *net = n->net; + unsigned int deleted; + bool restart = false; + 
unsigned int gc_seq; + + if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) + return NOTIFY_DONE; + + nft_net = nft_pernet(net); + deleted = 0; + mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + + nf_tables_trans_destroy_flush_work(net); +again: + list_for_each_entry(table, &nft_net->tables, list) { + if (nft_table_has_owner(table) && + n->portid == table->nlpid) { + if (table->flags & NFT_TABLE_F_PERSIST) { + table->flags &= ~NFT_TABLE_F_OWNER; + continue; + } + __nft_release_hook(net, table); + list_del_rcu(&table->list); + to_delete[deleted++] = table; + if (deleted >= ARRAY_SIZE(to_delete)) + break; + } } + if (deleted) { + restart = deleted >= ARRAY_SIZE(to_delete); + synchronize_rcu(); + while (deleted) + __nft_release_table(net, to_delete[--deleted]); + + if (restart) + goto again; + } + nft_gc_seq_end(nft_net, gc_seq); + + mutex_unlock(&nft_net->commit_mutex); + + return NOTIFY_DONE; } +static struct notifier_block nft_nl_notifier = { + .notifier_call = nft_rcv_nl_event, +}; + static int __net_init nf_tables_init_net(struct net *net) { - INIT_LIST_HEAD(&net->nft.tables); - INIT_LIST_HEAD(&net->nft.commit_list); - INIT_LIST_HEAD(&net->nft.module_list); - mutex_init(&net->nft.commit_mutex); + struct nftables_pernet *nft_net = nft_pernet(net); + + INIT_LIST_HEAD(&nft_net->tables); + INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->destroy_list); + INIT_LIST_HEAD(&nft_net->commit_set_list); + INIT_LIST_HEAD(&nft_net->binding_list); + INIT_LIST_HEAD(&nft_net->module_list); + INIT_LIST_HEAD(&nft_net->notify_list); + mutex_init(&nft_net->commit_mutex); net->nft.base_seq = 1; - net->nft.validate_state = NFT_VALIDATE_SKIP; + nft_net->gc_seq = 0; + nft_net->validate_state = NFT_VALIDATE_SKIP; + INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); return 0; } +static void __net_exit nf_tables_pre_exit_net(struct net *net) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + + mutex_lock(&nft_net->commit_mutex); + __nft_release_hooks(net); + mutex_unlock(&nft_net->commit_mutex); +} + static void __net_exit nf_tables_exit_net(struct net *net) { - mutex_lock(&net->nft.commit_mutex); - if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, false); + struct nftables_pernet *nft_net = nft_pernet(net); + unsigned int gc_seq; + + mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); + + cancel_work_sync(&nft_net->destroy_work); __nft_release_tables(net); - mutex_unlock(&net->nft.commit_mutex); - WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.module_list)); + + nft_gc_seq_end(nft_net, gc_seq); + + mutex_unlock(&nft_net->commit_mutex); + + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->destroy_list)); +} + +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); } static struct pernet_operations nf_tables_net_ops = { - .init = nf_tables_init_net, - .exit = nf_tables_exit_net, + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), }; static 
int __init nf_tables_module_init(void) { int err; - spin_lock_init(&nf_tables_destroy_list_lock); + BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0); + err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; err = nft_chain_filter_init(); if (err < 0) - goto err1; + goto err_chain_filter; err = nf_tables_core_module_init(); if (err < 0) - goto err2; + goto err_core_module; err = register_netdevice_notifier(&nf_tables_flowtable_notifier); if (err < 0) - goto err3; + goto err_netdev_notifier; err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params); if (err < 0) - goto err4; + goto err_rht_objname; err = nft_offload_init(); if (err < 0) - goto err5; + goto err_offload; + + err = netlink_register_notifier(&nft_nl_notifier); + if (err < 0) + goto err_netlink_notifier; /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err6; + goto err_nfnl_subsys; nft_chain_route_init(); return err; -err6: + +err_nfnl_subsys: + netlink_unregister_notifier(&nft_nl_notifier); +err_netlink_notifier: nft_offload_exit(); -err5: +err_offload: rhltable_destroy(&nft_objname_ht); -err4: +err_rht_objname: unregister_netdevice_notifier(&nf_tables_flowtable_notifier); -err3: +err_netdev_notifier: nf_tables_core_module_exit(); -err2: +err_core_module: nft_chain_filter_fini(); -err1: +err_chain_filter: unregister_pernet_subsys(&nf_tables_net_ops); return err; } @@ -8550,12 +12262,13 @@ err1: static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); + netlink_unregister_notifier(&nft_nl_notifier); nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); - cancel_work_sync(&trans_destroy_work); + cancel_work_sync(&trans_gc_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); @@ -8566,4 +12279,5 @@ module_exit(nf_tables_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Framework for packet filtering and classification"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 96c74c4c7176..6557a4018c09 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,43 +21,126 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> -static noinline void __nft_trace_packet(struct nft_traceinfo *info, - const struct nft_chain *chain, - enum nft_trace_types type) +#ifdef CONFIG_MITIGATION_RETPOLINE +static struct static_key_false nf_tables_skip_direct_calls; + +static inline bool nf_skip_indirect_calls(void) { - const struct nft_pktinfo *pkt = info->pkt; + return static_branch_likely(&nf_tables_skip_direct_calls); +} - if (!info->trace || !pkt->skb->nf_trace) +static inline void __init nf_skip_indirect_calls_enable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&nf_tables_skip_direct_calls); +} +#else +static inline void 
nf_skip_indirect_calls_enable(void) { } +#endif /* CONFIG_MITIGATION_RETPOLINE */ + +static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, + struct nft_traceinfo *info, + enum nft_trace_types type) +{ + if (!info->trace || !info->nf_trace) return; - info->chain = chain; info->type = type; - nft_trace_notify(info); + nft_trace_notify(pkt, verdict, rule, info); } -static inline void nft_trace_packet(struct nft_traceinfo *info, - const struct nft_chain *chain, - const struct nft_rule *rule, +static inline void nft_trace_packet(const struct nft_pktinfo *pkt, + struct nft_verdict *verdict, + struct nft_traceinfo *info, + const struct nft_rule_dp *rule, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { - info->rule = rule; - __nft_trace_packet(info, chain, type); + info->nf_trace = pkt->skb->nf_trace; + __nft_trace_packet(pkt, verdict, rule, info, type); } } +static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info) +{ + if (static_branch_unlikely(&nft_trace_enabled)) + info->nf_trace = pkt->skb->nf_trace; +} + +static void nft_bitwise_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs) +{ + const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + u32 *src = ®s->data[priv->sreg]; + u32 *dst = ®s->data[priv->dreg]; + + *dst = (*src & priv->mask) ^ priv->xor; +} + static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - u32 mask = nft_cmp_fast_mask(priv->len); - if ((regs->data[priv->sreg] & mask) == priv->data) + if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv) return; regs->verdict.code = NFT_BREAK; } +static void nft_cmp16_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + const u64 *reg_data = (const u64 *)®s->data[priv->sreg]; + const u64 *mask = (const u64 *)&priv->mask; + const u64 *data = (const u64 *)&priv->data; + + if (((reg_data[0] & mask[0]) == data[0] && + ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv) + return; + regs->verdict.code = NFT_BREAK; +} + +static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info, + const struct nft_rule_dp *rule, + const struct nft_regs *regs) +{ + enum nft_trace_types type; + + switch (regs->verdict.code & NF_VERDICT_MASK) { + case NFT_CONTINUE: + case NFT_RETURN: + type = NFT_TRACETYPE_RETURN; + break; + case NF_STOLEN: + type = NFT_TRACETYPE_RULE; + /* can't access skb->nf_trace; use copy */ + break; + default: + type = NFT_TRACETYPE_RULE; + + if (info->trace) + info->nf_trace = pkt->skb->nf_trace; + break; + } + + __nft_trace_packet(pkt, ®s->verdict, rule, info, type); +} + +static inline void nft_trace_verdict(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info, + const struct nft_rule_dp *rule, + const struct nft_regs *regs) +{ + if (static_branch_unlikely(&nft_trace_enabled)) + __nft_trace_verdict(pkt, info, rule, regs); +} + static bool nft_payload_fast_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -70,9 +153,9 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb); else { - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) return false; - ptr = skb_network_header(skb) + 
pkt->xt.thoff; + ptr = skb->data + nft_thoff(pkt); } ptr += priv->offset; @@ -101,7 +184,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, base_chain = nft_base_chain(chain); - rcu_read_lock(); pstats = READ_ONCE(base_chain->stats); if (pstats) { local_bh_disable(); @@ -112,70 +194,92 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, u64_stats_update_end(&stats->syncp); local_bh_enable(); } - rcu_read_unlock(); } struct nft_jumpstack { - const struct nft_chain *chain; - struct nft_rule *const *rules; + const struct nft_rule_dp *rule; }; static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { -#ifdef CONFIG_RETPOLINE - unsigned long e = (unsigned long)expr->ops->eval; +#ifdef CONFIG_MITIGATION_RETPOLINE + unsigned long e; + + if (nf_skip_indirect_calls()) + goto indirect_call; + + e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) X(e, nft_payload_eval); X(e, nft_cmp_eval); + X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); +#if IS_ENABLED(CONFIG_NFT_CT) + X(e, nft_ct_get_fast_eval); +#endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); + X(e, nft_objref_eval); + X(e, nft_objref_map_eval); #undef X -#endif /* CONFIG_RETPOLINE */ +indirect_call: +#endif /* CONFIG_MITIGATION_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } +#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0] +#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size +#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen] + +#define nft_rule_dp_for_each_expr(expr, last, rule) \ + for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \ + (expr) != (last); \ + (expr) = nft_rule_expr_next(expr)) + unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; const struct net *net = nft_net(pkt); - struct nft_rule *const *rules; - const struct nft_rule *rule; const struct nft_expr *expr, *last; + const struct nft_rule_dp *rule; struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); + struct nft_rule_blob *blob; struct nft_traceinfo info; info.trace = false; if (static_branch_unlikely(&nft_trace_enabled)) - nft_trace_init(&info, pkt, ®s.verdict, basechain); + nft_trace_init(&info, pkt, basechain); do_chain: if (genbit) - rules = rcu_dereference(chain->rules_gen_1); + blob = rcu_dereference(chain->blob_gen_1); else - rules = rcu_dereference(chain->rules_gen_0); + blob = rcu_dereference(chain->blob_gen_0); + rule = (struct nft_rule_dp *)blob->data; next_rule: - rule = *rules; regs.verdict.code = NFT_CONTINUE; - for (; *rules ; rules++) { - rule = *rules; - nft_rule_for_each_expr(expr, last, rule) { + for (; !rule->is_last ; rule = nft_rule_next(rule)) { + nft_rule_dp_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); + else if (expr->ops == &nft_cmp16_fast_ops) + nft_cmp16_fast_eval(expr, ®s); + else if (expr->ops == &nft_bitwise_fast_ops) + nft_bitwise_fast_eval(expr, ®s); else if (expr->ops != &nft_payload_fast_ops || !nft_payload_fast_eval(expr, ®s, pkt)) expr_call_ops_eval(expr, ®s, pkt); @@ -187,60 +291,58 @@ next_rule: switch (regs.verdict.code) { case 
NFT_BREAK: regs.verdict.code = NFT_CONTINUE; + nft_trace_copy_nftrace(pkt, &info); continue; case NFT_CONTINUE: - nft_trace_packet(&info, chain, rule, + nft_trace_packet(pkt, ®s.verdict, &info, rule, NFT_TRACETYPE_RULE); continue; } break; } + nft_trace_verdict(pkt, &info, rule, ®s); + switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: - case NF_DROP: case NF_QUEUE: case NF_STOLEN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); return regs.verdict.code; + case NF_DROP: + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); } switch (regs.verdict.code) { case NFT_JUMP: if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) return NF_DROP; - jumpstack[stackptr].chain = chain; - jumpstack[stackptr].rules = rules + 1; + jumpstack[stackptr].rule = nft_rule_next(rule); stackptr++; - /* fall through */ + fallthrough; case NFT_GOTO: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); - chain = regs.verdict.chain; goto do_chain; case NFT_CONTINUE: case NFT_RETURN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RETURN); break; default: - WARN_ON(1); + WARN_ON_ONCE(1); } if (stackptr > 0) { stackptr--; - chain = jumpstack[stackptr].chain; - rules = jumpstack[stackptr].rules; + rule = jumpstack[stackptr].rule; goto next_rule; } - nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY); + nft_trace_packet(pkt, ®s.verdict, &info, NULL, NFT_TRACETYPE_POLICY); if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); + if (nft_base_chain(basechain)->policy == NF_DROP) + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); + return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); @@ -257,18 +359,25 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_meta_type, &nft_rt_type, &nft_exthdr_type, + &nft_last_type, + &nft_counter_type, + &nft_objref_type, + &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { #ifdef CONFIG_NETWORK_SECMARK &nft_secmark_obj_type, #endif + &nft_counter_obj_type, }; int __init nf_tables_core_module_init(void) { int err, i, j = 0; + nft_counter_init_seqcount(); + for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { err = nft_register_obj(nft_basic_objects[i]); if (err) @@ -281,6 +390,8 @@ int __init nf_tables_core_module_init(void) goto err; } + nf_skip_indirect_calls_enable(); + return 0; err: diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 185fc82c99aa..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -28,6 +28,63 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) return flow; } +void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, + enum flow_dissector_key_id addr_type) +{ + struct nft_flow_match *match = &flow->match; + struct nft_flow_key *mask = &match->mask; + struct nft_flow_key *key = &match->key; + + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) + return; + + key->control.addr_type = addr_type; + mask->control.addr_type = 0xffff; + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = + offsetof(struct nft_flow_key, control); +} + +struct nft_offload_ethertype { + __be16 value; + __be16 mask; +}; + +static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow) +{ + struct nft_flow_match *match = &flow->match; + struct nft_offload_ethertype ethertype = { + 
.value = match->key.basic.n_proto, + .mask = match->mask.basic.n_proto, + }; + + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && + (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || + match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { + match->key.basic.n_proto = match->key.cvlan.vlan_tpid; + match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; + match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; + match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; + match->key.vlan.vlan_tpid = ethertype.value; + match->mask.vlan.vlan_tpid = ethertype.mask; + match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = + offsetof(struct nft_flow_key, cvlan); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); + } else if (match->dissector.used_keys & + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && + (match->key.basic.n_proto == htons(ETH_P_8021Q) || + match->key.basic.n_proto == htons(ETH_P_8021AD))) { + match->key.basic.n_proto = match->key.vlan.vlan_tpid; + match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; + match->key.vlan.vlan_tpid = ethertype.value; + match->mask.vlan.vlan_tpid = ethertype.mask; + match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = + offsetof(struct nft_flow_key, vlan); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); + } +} + struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { @@ -37,8 +94,9 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + while (nft_expr_more(rule, expr)) { + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); @@ -61,7 +119,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; @@ -72,6 +130,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_next(expr); } + nft_flow_rule_transfer_vlan(ctx, flow); + flow->proto = ctx->dep.l3num; kfree(ctx); @@ -149,7 +209,7 @@ static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, return 0; } -int nft_chain_offload_priority(struct nft_base_chain *basechain) +static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) @@ -158,6 +218,31 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } +bool nft_chain_offload_support(const struct nft_base_chain *basechain) +{ + struct nf_hook_ops *ops; + struct net_device *dev; + struct nft_hook *hook; + + if (nft_chain_offload_priority(basechain) < 0) + return false; + + list_for_each_entry(hook, &basechain->hook_list, list) { + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } + } + + return true; +} + static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, @@ -180,26 +265,56 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, cls_flow->rule = flow->rule; } -static int 
nft_flow_offload_rule(struct nft_chain *chain, - struct nft_rule *rule, - struct nft_flow_rule *flow, - enum flow_cls_command command) +static int nft_flow_offload_cmd(const struct nft_chain *chain, + const struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command, + struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; - struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); - return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } +static int nft_flow_offload_rule(const struct nft_chain *chain, + struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct flow_cls_offload cls_flow; + + return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); +} + +int nft_flow_rule_stats(const struct nft_chain *chain, + const struct nft_rule *rule) +{ + struct flow_cls_offload cls_flow = {}; + struct nft_expr *expr, *next; + int err; + + err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, + &cls_flow); + if (err < 0) + return err; + + nft_rule_for_each_expr(expr, next, rule) { + if (expr->ops->offload_stats) + expr->ops->offload_stats(expr, &cls_flow.stats); + } + + return 0; +} + static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { @@ -265,6 +380,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } @@ -290,15 +406,18 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); - mutex_lock(&net->nft.commit_mutex); + nft_net = nft_pernet(net); + mutex_lock(&nft_net->commit_mutex); + list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, @@ -311,7 +430,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; @@ -322,8 +441,6 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, return nft_block_setup(basechain, &bo, cmd); } -#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK - static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) @@ -342,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct 
nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -396,41 +516,42 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { + struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; - list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); @@ -444,51 +565,52 @@ static void nft_flow_rule_offload_abort(struct net *net, int nft_flow_rule_offload_commit(struct net *net) { + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; - list_for_each_entry(trans, &net->nft.commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = 
nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) { + if (trans->flags & NLM_F_REPLACE || + !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; @@ -500,35 +622,18 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &net->nft.commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) - continue; - - switch (trans->msg_type) { - case NFT_MSG_NEWRULE: - case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) - continue; - - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - break; - default: - break; - } - } - return err; } -static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, + struct net_device *dev) { struct nft_base_chain *basechain; - struct net *net = dev_net(dev); struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -540,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; @@ -560,19 +665,21 @@ static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); + nft_net = nft_pernet(net); + mutex_lock(&nft_net->commit_mutex); + chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 87b36da5cd98..a88abae5a9de 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -7,7 +7,7 @@ #include <linux/module.h> #include <linux/static_key.h> #include <linux/hash.h> -#include <linux/jhash.h> +#include <linux/siphash.h> 
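The jhash-to-siphash include swap above supports the trace-ID rework further down in this file: the per-message jhash of skb fields is replaced by a single keyed siphash computed in nft_trace_init(), so the ID stays stable across all messages for one packet traversal but is not predictable by listeners. A condensed sketch of that derivation, using the same kernel helpers the patch uses (the function name here is illustrative only):

/* Sketch: derive the per-packet trace ID once, with a boot-time random key. */
static u32 nft_trace_skbid_sketch(const struct net *net, struct sk_buff *skb)
{
        static siphash_key_t trace_key __read_mostly;

        /* 128-bit key, initialised once per boot */
        net_get_random_once(&trace_key, sizeof(trace_key));

        /* mix fields that do not change while the skb is inside netfilter */
        return (u32)siphash_3u32(hash32_ptr(skb),
                                 skb_get_hash_net(net, skb),
                                 skb->skb_iif,
                                 &trace_key);
}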
#include <linux/if_vlan.h> #include <linux/init.h> #include <linux/skbuff.h> @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -25,22 +26,6 @@ DEFINE_STATIC_KEY_FALSE(nft_trace_enabled); EXPORT_SYMBOL_GPL(nft_trace_enabled); -static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) -{ - __be32 id; - - /* using skb address as ID results in a limited number of - * values (and quick reuse). - * - * So we attempt to use as many skb members that will not - * change while skb is with netfilter. - */ - id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), - skb->skb_iif); - - return nla_put_be32(nlskb, NFTA_TRACE_ID, id); -} - static int trace_fill_header(struct sk_buff *nlskb, u16 type, const struct sk_buff *skb, int off, unsigned int len) @@ -106,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + /* Kernel implementation detail, withhold this from userspace for now */ + status &= ~IPS_NAT_CLASH; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -113,17 +144,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + nh_end = pkt->flags & NFT_PKTINFO_L4PROTO ? nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; - if (pkt->tprot_set) { - len = min_t(unsigned int, skb->len - pkt->xt.thoff, + if (pkt->flags & NFT_PKTINFO_L4PROTO) { + len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) + nft_thoff(pkt), len)) return -1; } @@ -140,9 +171,11 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, } static int nf_trace_fill_rule_info(struct sk_buff *nlskb, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, const struct nft_traceinfo *info) { - if (!info->rule) + if (!rule || rule->is_last) return 0; /* a continue verdict with ->type == RETURN means that this is @@ -151,15 +184,16 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, * Since no rule matched, the ->rule pointer is invalid. 
*/ if (info->type == NFT_TRACETYPE_RETURN && - info->verdict->code == NFT_CONTINUE) + verdict->code == NFT_CONTINUE) return 0; return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, - cpu_to_be64(info->rule->handle), + cpu_to_be64(rule->handle), NFTA_TRACE_PAD); } -static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) +static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict, + struct nft_traceinfo *info) { switch (info->type) { case NFT_TRACETYPE_RETURN: @@ -169,7 +203,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) return false; } - switch (info->verdict->code) { + switch (verdict->code) { case NFT_JUMP: case NFT_GOTO: break; @@ -180,26 +214,54 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) return true; } -void nft_trace_notify(struct nft_traceinfo *info) +static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule, + const struct nft_traceinfo *info) { - const struct nft_pktinfo *pkt = info->pkt; - struct nfgenmsg *nfmsg; + const struct nft_rule_dp_last *last; + + if (!rule) + return &info->basechain->chain; + + while (!rule->is_last) + rule = nft_rule_next(rule); + + last = (const struct nft_rule_dp_last *)rule; + + if (WARN_ON_ONCE(!last->chain)) + return &info->basechain->chain; + + return last->chain; +} + +void nft_trace_notify(const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, + struct nft_traceinfo *info) +{ + const struct nft_chain *chain; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; + u32 mark = 0; u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) return; + chain = nft_trace_get_chain(rule, info); + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + - nla_total_size(strlen(info->chain->table->name)) + - nla_total_size(strlen(info->chain->name)) + + nla_total_size(strlen(chain->table->name)) + + nla_total_size(strlen(chain->name)) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -211,39 +273,35 @@ void nft_trace_notify(struct nft_traceinfo *info) nla_total_size(sizeof(u32)) + /* nfproto */ nla_total_size(sizeof(u32)); /* policy */ - if (nft_trace_have_verdict_chain(info)) - size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */ + if (nft_trace_have_verdict_chain(verdict, info)) + size += nla_total_size(strlen(verdict->chain->name)); /* jump target */ skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) return; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family, + NFNETLINK_V0, 0); if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = info->basechain->type->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt)))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_TRACE_TYPE, 
htonl(info->type))) goto nla_put_failure; - if (trace_fill_id(skb, pkt->skb)) + if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid)) goto nla_put_failure; - if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) + if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name)) goto nla_put_failure; - if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name)) + if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name)) goto nla_put_failure; - if (nf_trace_fill_rule_info(skb, info)) + if (nf_trace_fill_rule_info(skb, verdict, rule, info)) goto nla_put_failure; switch (info->type) { @@ -251,19 +309,31 @@ void nft_trace_notify(struct nft_traceinfo *info) case __NFT_TRACETYPE_MAX: break; case NFT_TRACETYPE_RETURN: - case NFT_TRACETYPE_RULE: - if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) + case NFT_TRACETYPE_RULE: { + unsigned int v; + + if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict)) goto nla_put_failure; + + /* pkt->skb undefined iff NF_STOLEN, disable dump */ + v = verdict->code & NF_VERDICT_MASK; + if (v == NF_STOLEN) + info->packet_dumped = true; + else + mark = pkt->skb->mark; + break; + } case NFT_TRACETYPE_POLICY: + mark = pkt->skb->mark; + if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(info->basechain->policy))) goto nla_put_failure; break; } - if (pkt->skb->mark && - nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark))) goto nla_put_failure; if (!info->packet_dumped) { @@ -272,6 +342,10 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } @@ -285,12 +359,20 @@ void nft_trace_notify(struct nft_traceinfo *info) } void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, - const struct nft_verdict *verdict, const struct nft_chain *chain) { + static siphash_key_t trace_key __read_mostly; + struct sk_buff *skb = pkt->skb; + info->basechain = nft_base_chain(chain); info->trace = true; + info->nf_trace = pkt->skb->nf_trace; info->packet_dumped = false; - info->pkt = pkt; - info->verdict = verdict; + + net_get_random_once(&trace_key, sizeof(trace_key)); + + info->skbid = (u32)siphash_3u32(hash32_ptr(skb), + skb_get_hash_net(nft_net(pkt), skb), + skb->skb_iif, + &trace_key); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 99127e2d95a8..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -28,11 +28,14 @@ #include <linux/sched/signal.h> #include <net/netlink.h> +#include <net/netns/generic.h> +#include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); +MODULE_DESCRIPTION("Netfilter messages via netlink socket"); #define nfnl_dereference_protected(id) \ rcu_dereference_protected(table[(id)].subsys, \ @@ -40,11 +43,39 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); #define NFNL_MAX_ATTR_COUNT 32 +static unsigned int nfnetlink_pernet_id __read_mostly; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static DEFINE_SPINLOCK(nfnl_grp_active_lock); +#endif + +struct nfnl_net { + struct sock *nfnl; +}; + static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; } table[NFNL_SUBSYS_COUNT]; +static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT]; + +static const char *const 
nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { + [NFNL_SUBSYS_NONE] = "nfnl_subsys_none", + [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink", + [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp", + [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue", + [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog", + [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf", + [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset", + [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct", + [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout", + [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", + [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", + [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", + [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", +}; + static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK, @@ -57,6 +88,11 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; +static struct nfnl_net *nfnl_pernet(struct net *net) +{ + return net_generic(net, nfnetlink_pernet_id); +} + void nfnl_lock(__u8 subsys_id) { mutex_lock(&table[subsys_id].mutex); @@ -131,30 +167,51 @@ nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) int nfnetlink_has_listeners(struct net *net, unsigned int group) { - return netlink_has_listeners(net->nfnl, group); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return netlink_has_listeners(nfnlnet->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_set_err(net->nfnl, portid, group, error); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return netlink_set_err(nfnlnet->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { - return netlink_unicast(net->nfnl, skb, portid, flags); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + int err; + + err = nlmsg_unicast(nfnlnet->nfnl, skb, portid); + if (err == -EAGAIN) + err = -ENOBUFS; + + return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); +void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, + __u32 group, gfp_t allocation) +{ + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation); +} +EXPORT_SYMBOL_GPL(nfnetlink_broadcast); + /* Process one complete nfnetlink message. 
*/ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) @@ -171,6 +228,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, type = nlh->nlmsg_type; replay: rcu_read_lock(); + ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_MODULES @@ -194,11 +252,19 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nfnl_net *nfnlnet = nfnl_pernet(net); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .nfmsg = nlmsg_data(nlh), + .extack = extack, + }; /* Sanity-check NFNL_MAX_ATTR_COUNT */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -214,24 +280,32 @@ replay: return err; } - if (nc->call_rcu) { - err = nc->call_rcu(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); + if (!nc->call) { rcu_read_unlock(); - } else { + return -EINVAL; + } + + switch (nc->type) { + case NFNL_CB_RCU: + err = nc->call(skb, &info, (const struct nlattr **)cda); + rcu_read_unlock(); + break; + case NFNL_CB_MUTEX: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || - nfnetlink_find_client(type, ss) != nc) + nfnetlink_find_client(type, ss) != nc) { + nfnl_unlock(subsys_id); err = -EAGAIN; - else if (nc->call) - err = nc->call(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); - else - err = -EINVAL; + break; + } + err = nc->call(skb, &info, (const struct nlattr **)cda); nfnl_unlock(subsys_id); + break; + default: + rcu_read_unlock(); + err = -EINVAL; + break; } if (err == -EAGAIN) goto replay; @@ -302,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -310,8 +385,9 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; - +replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); @@ -328,31 +404,36 @@ replay: { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!try_module_get(ss->owner)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); - return kfree_skb(skb); + return consume_skb(skb); } nfnl_unlock(subsys_id); + if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); + } + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -409,12 +490,25 @@ replay: goto ack; } + if (nc->type != NFNL_CB_BATCH) { + err = -EINVAL; + goto ack; + } + { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nfnl_net *nfnlnet = nfnl_pernet(net); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; 
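As in the non-batch path above, the per-message state is now bundled into struct nfnl_info and handed to the callback together with the parsed attributes, and each callback declares its locking model through ->type (NFNL_CB_RCU, NFNL_CB_MUTEX or NFNL_CB_BATCH) instead of registering separate call/call_rcu/call_batch hooks. A sketch of a converted handler for a hypothetical subsystem (NFNL_MSG_FOO_GET, NFNL_FOO_NAME, NFNL_FOO_MAX and nfnl_foo_policy are made-up names):

/* Sketch: one message handler using the new callback signature. */
static int nfnl_foo_get(struct sk_buff *skb, const struct nfnl_info *info,
                        const struct nlattr * const cda[])
{
        if (!cda[NFNL_FOO_NAME])
                return -EINVAL;

        /* per-netns state via info->net, reply socket via info->sk,
         * request header via info->nlh, error reporting via info->extack
         */
        return 0;
}

static const struct nfnl_callback nfnl_foo_cb[] = {
        [NFNL_MSG_FOO_GET] = {
                .call           = nfnl_foo_get,
                .type           = NFNL_CB_RCU, /* runs under rcu_read_lock() */
                .attr_count     = NFNL_FOO_MAX,
                .policy         = nfnl_foo_policy,
        },
};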
struct nlattr *attr = (void *)nlh + min_len; + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); int attrlen = nlh->nlmsg_len - min_len; + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .nfmsg = nlmsg_data(nlh), + .extack = &extack, + }; /* Sanity-check NFTA_MAX_ATTR */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -425,15 +519,11 @@ replay: err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, - ss->cb[cb_id].policy, NULL); + ss->cb[cb_id].policy, &extack); if (err < 0) goto ack; - if (nc->call_batch) { - err = nc->call_batch(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda, - &extack); - } + err = nc->call(skb, &info, (const struct nlattr **)cda); /* The lock was released to autoload some module, we * have to abort and start from scratch using the @@ -450,7 +540,8 @@ ack: * processed, this avoids that the same error is * reported several times when replaying the batch. */ - if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) { + if (err == -ENOMEM || + nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. @@ -476,9 +567,9 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb, true); + ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { @@ -487,17 +578,32 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb, false); + ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } else if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); } } else { - ss->abort(net, oskb, false); + enum nfnl_abort_action abort_action; + + if (status & NFNL_BATCH_FAILURE) + abort_action = NFNL_ABORT_NONE; + else + abort_action = NFNL_ABORT_VALIDATE; + + err = ss->abort(net, oskb, abort_action); + if (err == -EAGAIN) { + nfnl_err_reset(&err_list); + consume_skb(skb); + module_put(ss->owner); + status |= NFNL_BATCH_FAILURE; + goto replay_abort; + } } - if (ss->cleanup) - ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); } @@ -535,7 +641,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); /* Work around old nft using host byte order */ - if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES) res_id = NFNL_SUBSYS_NFTABLES; else res_id = ntohs(nfgenmsg->res_id); @@ -563,7 +669,44 @@ static void nfnetlink_rcv(struct sk_buff *skb) netlink_rcv_skb(skb, nfnetlink_rcv_msg); } -#ifdef CONFIG_MODULES +static void nfnetlink_bind_event(struct net *net, unsigned int group) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int type, group_bit; + u8 v; + + /* All NFNLGRP_CONNTRACK_* group bits fit into u8. + * The other groups are not relevant and can be ignored. + */ + if (group >= 8) + return; + + type = nfnl_group2type[group]; + + switch (type) { + case NFNL_SUBSYS_CTNETLINK: + break; + case NFNL_SUBSYS_CTNETLINK_EXP: + break; + default: + return; + } + + group_bit = (1 << group); + + spin_lock(&nfnl_grp_active_lock); + v = READ_ONCE(nf_ctnetlink_has_listener); + if ((v & group_bit) == 0) { + v |= group_bit; + + /* read concurrently without nfnl_grp_active_lock held. 
*/ + WRITE_ONCE(nf_ctnetlink_has_listener, v); + } + + spin_unlock(&nfnl_grp_active_lock); +#endif +} + static int nfnetlink_bind(struct net *net, int group) { const struct nfnetlink_subsystem *ss; @@ -579,43 +722,82 @@ static int nfnetlink_bind(struct net *net, int group) rcu_read_unlock(); if (!ss) request_module_nowait("nfnetlink-subsys-%d", type); + + nfnetlink_bind_event(net, group); return 0; } + +static void nfnetlink_unbind(struct net *net, int group) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int type, group_bit; + + if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) + return; + + type = nfnl_group2type[group]; + + switch (type) { + case NFNL_SUBSYS_CTNETLINK: + break; + case NFNL_SUBSYS_CTNETLINK_EXP: + break; + default: + return; + } + + /* ctnetlink_has_listener is u8 */ + if (group >= 8) + return; + + group_bit = (1 << group); + + spin_lock(&nfnl_grp_active_lock); + if (!nfnetlink_has_listeners(net, group)) { + u8 v = READ_ONCE(nf_ctnetlink_has_listener); + + v &= ~group_bit; + + /* read concurrently without nfnl_grp_active_lock held. */ + WRITE_ONCE(nf_ctnetlink_has_listener, v); + } + spin_unlock(&nfnl_grp_active_lock); #endif +} static int __net_init nfnetlink_net_init(struct net *net) { - struct sock *nfnl; + struct nfnl_net *nfnlnet = nfnl_pernet(net); struct netlink_kernel_cfg cfg = { .groups = NFNLGRP_MAX, .input = nfnetlink_rcv, -#ifdef CONFIG_MODULES .bind = nfnetlink_bind, -#endif + .unbind = nfnetlink_unbind, }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); - if (!nfnl) + nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); + if (!nfnlnet->nfnl) return -ENOMEM; - net->nfnl_stash = nfnl; - rcu_assign_pointer(net->nfnl, nfnl); return 0; } static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { + struct nfnl_net *nfnlnet; struct net *net; - list_for_each_entry(net, net_exit_list, exit_list) - RCU_INIT_POINTER(net->nfnl, NULL); - synchronize_net(); - list_for_each_entry(net, net_exit_list, exit_list) - netlink_kernel_release(net->nfnl_stash); + list_for_each_entry(net, net_exit_list, exit_list) { + nfnlnet = nfnl_pernet(net); + + netlink_kernel_release(nfnlnet->nfnl); + } } static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, + .id = &nfnetlink_pernet_id, + .size = sizeof(struct nfnl_net), }; static int __init nfnetlink_init(void) @@ -626,7 +808,7 @@ static int __init nfnetlink_init(void) BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE); for (i=0; i<NFNL_SUBSYS_COUNT; i++) - mutex_init(&table[i].mutex); + __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]); return register_pernet_subsys(&nfnetlink_net_ops); } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..505f46a32173 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #include <linux/init.h> #include <linux/module.h> @@ -16,6 +16,7 @@ #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> +#include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -41,17 +42,27 @@ struct nfacct_filter { u32 mask; }; +struct nfnl_acct_net { + struct list_head nfnl_acct_list; +}; + +static unsigned int nfnl_acct_net_id __read_mostly; + 
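(Aside: the nfnl_acct_net / nfnl_acct_net_id pair added just above, together with the nfnl_acct_pernet() helper that follows, moves this module's state out of an embedded field in struct net and into net_generic() per-netns storage, registered through pernet_operations .id/.size further down in this file. A minimal, self-contained sketch of that pattern is shown here for orientation; it is not part of the patch, the foo_* names are hypothetical placeholders, and it assumes only the standard <net/netns/generic.h> API.)

	/* Sketch of the net_generic() per-netns pattern used by this conversion.
	 * Hypothetical foo_* names; the real ones here are nfnl_acct_net and
	 * nfnl_acct_net_id.
	 */
	#include <linux/list.h>
	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	struct foo_net {
		struct list_head foo_list;		/* per-netns state */
	};

	static unsigned int foo_net_id __read_mostly;	/* slot id, assigned by the core */

	static inline struct foo_net *foo_pernet(struct net *net)
	{
		return net_generic(net, foo_net_id);	/* look up this netns' slot */
	}

	static int __net_init foo_net_init(struct net *net)
	{
		INIT_LIST_HEAD(&foo_pernet(net)->foo_list);
		return 0;
	}

	static struct pernet_operations foo_net_ops = {
		.init = foo_net_init,
		.id   = &foo_net_id,			/* core fills in the id */
		.size = sizeof(struct foo_net),		/* core allocates this per netns */
	};

	/* registered from module init, e.g.: register_pernet_subsys(&foo_net_ops); */
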
+static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) +{ + return net_generic(net, nfnl_acct_net_id); +} + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ -static int nfnl_acct_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *nfacct, *matching = NULL; - char *acct_name; unsigned int size = 0; + char *acct_name; u32 flags = 0; if (!tb[NFACCT_NAME]) @@ -61,11 +72,11 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { + list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = nfacct; @@ -73,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); @@ -112,7 +123,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); + nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, @@ -123,7 +134,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } refcount_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list); return 0; } @@ -132,21 +143,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; @@ -188,6 +194,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -199,7 +206,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -264,16 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb) return 0; } -static int nfnl_acct_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, .start = nfnl_acct_start, @@ -281,14 +287,14 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, .data = (void *)tb[NFACCT_FILTER], }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -301,21 +307,18 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, } ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), - NFNL_MSG_ACCT_NEW, cur); + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? 
-ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } @@ -337,24 +340,23 @@ static int nfnl_acct_try_del(struct nf_acct *cur) return ret; } -static int nfnl_acct_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -377,18 +379,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { - [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_NEW] = { + .call = nfnl_acct_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET_CTRZERO] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_DEL] = { + .call = nfnl_acct_del, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, }; static const struct nfnetlink_subsystem nfnl_acct_subsys = { @@ -402,10 +416,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -457,8 +472,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) kfree_skb(skb); return; } - netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, - GFP_ATOMIC); + nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) @@ -488,16 +502,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota); static int __net_init nfnl_acct_net_init(struct net *net) { - INIT_LIST_HEAD(&net->nfnl_acct_list); + INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list); return 0; } static void __net_exit nfnl_acct_net_exit(struct net *net) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_for_each_entry_safe(cur, 
tmp, &nfnl_acct_net->nfnl_acct_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) @@ -508,6 +523,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, + .id = &nfnl_acct_net_id, + .size = sizeof(struct nfnl_acct_net), }; static int __init nfnl_acct_init(void) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 5b0d0a77379c..97248963a7d3 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -96,11 +96,13 @@ static int nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); + const struct nf_conntrack_helper *helper; if (attr == NULL) return -EINVAL; - if (help->helper->data_len == 0) + helper = rcu_dereference(help->helper); + if (!helper || helper->data_len == 0) return -EINVAL; nla_memcpy(help->data, attr, sizeof(help->data)); @@ -111,9 +113,11 @@ static int nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) { const struct nf_conn_help *help = nfct_help(ct); + const struct nf_conntrack_helper *helper; - if (help->helper->data_len && - nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data)) + helper = rcu_dereference(help->helper); + if (helper && helper->data_len && + nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data)) goto nla_put_failure; return 0; @@ -146,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - nla_strlcpy(expect_policy->name, + nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); @@ -233,7 +237,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - nla_strlcpy(helper->name, + nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { @@ -380,10 +384,14 @@ static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { + u32 size; int ret; - if (tb[NFCTH_PRIV_DATA_LEN]) - return -EBUSY; + if (tb[NFCTH_PRIV_DATA_LEN]) { + size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + if (size != helper->data_len) + return -EBUSY; + } if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]); @@ -408,10 +416,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[], return 0; } -static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; @@ -441,7 +447,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; helper = cur; @@ -526,20 +532,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; @@ -612,10 +613,8 @@ out: return skb->len; } -static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; @@ -628,11 +627,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) @@ -664,29 +663,23 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? 
-ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } -static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; @@ -748,15 +741,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { - [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_NEW] = { + .call = nfnl_cthelper_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_GET] = { + .call = nfnl_cthelper_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_DEL] = { + .call = nfnl_cthelper_del, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index da915c224a82..38d75484e531 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -20,6 +20,7 @@ #include <linux/netfilter.h> #include <net/netlink.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -30,6 +31,24 @@ #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> +static unsigned int nfct_timeout_id __read_mostly; + +struct ctnl_timeout { + struct list_head head; + struct list_head free_head; + struct rcu_head rcu_head; + refcount_t refcnt; + char name[CTNL_TIMEOUT_NAME_MAX]; + + /* must be at the end */ + struct nf_ct_timeout timeout; +}; + +struct nfct_timeout_pernet { + struct list_head nfct_timeout_list; + struct list_head nfct_timeout_freelist; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); @@ -42,6 +61,11 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; +static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) +{ + return net_generic(net, nfct_timeout_id); +} + static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, @@ -71,12 +95,11 @@ err: return ret; } -static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_new_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; @@ -94,11 +117,11 @@ static int cttimeout_new_timeout(struct net *net, 
struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - list_for_each_entry(timeout, &net->nfct_timeout_list, head) { + list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; @@ -106,7 +129,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. */ @@ -116,7 +139,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, - net, cda[CTA_TIMEOUT_DATA]); + info->net, + cda[CTA_TIMEOUT_DATA]); } return -EBUSY; @@ -137,8 +161,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, goto err_proto_put; } - ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net, - cda[CTA_TIMEOUT_DATA]); + ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, + info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -146,7 +170,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); - list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); + __module_get(THIS_MODULE); + list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: @@ -160,22 +185,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || @@ -206,6 +226,7 @@ nla_put_failure: static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; @@ -217,7 +238,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) { + pernet = nfct_timeout_pernet(net); + list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; @@ -238,28 +260,27 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_get_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &net->nfct_timeout_list, head) { + list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) @@ -272,21 +293,18 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? 
-ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } @@ -309,30 +327,29 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) return ret; } -static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_del_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) - ctnl_timeout_try_del(net, cur); + ctnl_timeout_try_del(info->net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &net->nfct_timeout_list, head) { + list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - ret = ctnl_timeout_try_del(net, cur); + ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; @@ -341,18 +358,15 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, return ret; } -static int cttimeout_default_set(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_set(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; int ret; - if (!cda[CTA_TIMEOUT_L3PROTO] || - !cda[CTA_TIMEOUT_L4PROTO] || + if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; @@ -365,7 +379,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - ret = ctnl_timeout_parse_policy(NULL, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -382,21 +396,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, const unsigned int *timeouts) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; @@ -420,18 +429,16 @@ nla_put_failure: return -1; } -static int cttimeout_default_get(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -440,41 +447,35 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); - err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: - timeouts = &nf_icmp_pernet(net)->timeout; + timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: - timeouts = nf_tcp_pernet(net)->timeouts; + timeouts = nf_tcp_pernet(info->net)->timeouts; break; - case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDP: case IPPROTO_UDPLITE: - timeouts = nf_udp_pernet(net)->timeouts; - break; - case IPPROTO_DCCP: -#ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(net)->dccp_timeout; -#endif + timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_ICMPV6: - timeouts = &nf_icmpv6_pernet(net)->timeout; + timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP - timeouts = nf_sctp_pernet(net)->timeouts; + timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE - timeouts = nf_gre_pernet(net)->timeouts; + timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: - timeouts = &nf_generic_pernet(net)->timeout; + timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); @@ -482,50 +483,38 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, } if (!timeouts) - goto err; + return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM; - ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + ret = cttimeout_default_fill_info(info->net, skb2, + NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? 
-ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; - list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) { + list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - if (!try_module_get(THIS_MODULE)) + if (!refcount_inc_not_zero(&timeout->refcnt)) goto err; - - if (!refcount_inc_not_zero(&timeout->refcnt)) { - module_put(THIS_MODULE); - goto err; - } matching = timeout; break; } @@ -538,28 +527,43 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t) struct ctnl_timeout *timeout = container_of(t, struct ctnl_timeout, timeout); - if (refcount_dec_and_test(&timeout->refcnt)) + if (refcount_dec_and_test(&timeout->refcnt)) { kfree_rcu(timeout, rcu_head); - - module_put(THIS_MODULE); + module_put(THIS_MODULE); + } } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { - [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_NEW] = { + .call = cttimeout_new_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_GET] = { + .call = cttimeout_get_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { + .call = cttimeout_del_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { + .call = cttimeout_default_set, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { + .call = cttimeout_default_get, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { @@ -573,20 +577,39 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { - INIT_LIST_HEAD(&net->nfct_timeout_list); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + + INIT_LIST_HEAD(&pernet->nfct_timeout_list); + INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } +static void __net_exit cttimeout_net_pre_exit(struct net *net) +{ + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct ctnl_timeout *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { + list_del_rcu(&cur->head); + list_add(&cur->free_head, &pernet->nfct_timeout_freelist); + } + + /* core calls synchronize_rcu() after this */ +} + static void __net_exit cttimeout_net_exit(struct net *net) { + struct nfct_timeout_pernet *pernet = 
nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; - nf_ct_unconfirmed_destroy(net); + if (list_empty(&pernet->nfct_timeout_freelist)) + return; + nf_ct_untimeout(net, NULL); - list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { - list_del_rcu(&cur->head); + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { + list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); @@ -595,7 +618,15 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, + .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, + .id = &nfct_timeout_id, + .size = sizeof(struct nfct_timeout_pernet), +}; + +static const struct nf_ct_timeout_hooks hooks = { + .timeout_find_get = ctnl_timeout_find_get, + .timeout_put = ctnl_timeout_put, }; static int __init cttimeout_init(void) @@ -612,8 +643,7 @@ static int __init cttimeout_init(void) "nfnetlink.\n"); goto err_out; } - RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); - RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); + RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks); return 0; err_out: @@ -621,14 +651,24 @@ err_out: return ret; } +static int untimeout(struct nf_conn *ct, void *timeout) +{ + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + + return 0; +} + static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); - RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); - RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); - synchronize_rcu(); + RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); + + nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c new file mode 100644 index 000000000000..92d869317cba --- /dev/null +++ b/net/netfilter/nfnetlink_hook.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021 Red Hat GmbH + * + * Author: Florian Westphal <fw@strlen.de> + */ + +#include <linux/bpf.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_hook.h> + +#include <net/netfilter/nf_tables.h> +#include <net/sock.h> + +static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = { + [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFNLA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, + [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING, + .len = KSYM_NAME_LEN, }, + [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING, + .len = MODULE_NAME_LEN, }, + [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, }, +}; + +static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +struct nfnl_dump_hook_data { + char devname[IFNAMSIZ]; + unsigned long headv; + u8 hook; +}; + +static struct 
nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t) +{ + struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + int ret; + + if (!nest) + return NULL; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t)); + if (ret == 0) + return nest; + + nla_nest_cancel(nlskb, nest); + return NULL; +} + +static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + const struct bpf_prog *prog) +{ + struct nlattr *nest, *nest2; + int ret; + + if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)) + return 0; + + if (WARN_ON_ONCE(!prog)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF); + if (!nest) + return -EMSGSIZE; + + nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest2) + goto cancel_nest; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id)); + if (ret) + goto cancel_nest; + + nla_nest_end(nlskb, nest2); + nla_nest_end(nlskb, nest); + return 0; + +cancel_nest: + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; +} + +static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname, + const char *name, u8 family) +{ + struct nlattr *nest; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest || + nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) || + nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) || + nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) { + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; + } + nla_nest_end(nlskb, nest); + return 0; +} + +static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nft_chain *chain) +{ + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; + + if (WARN_ON_ONCE(!chain)) + return 0; + + if (!nft_is_active(net, chain)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name, + chain->name, chain->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; +} + +static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nf_flowtable *nf_ft) +{ + struct nft_flowtable *ft = + container_of(nf_ft, struct nft_flowtable, data); + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; + + if (WARN_ON_ONCE(!nf_ft)) + return 0; + + if (!nft_is_active(net, ft)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name, + ft->name, ft->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; +} + +static int nfnl_hook_dump_one(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + const struct nf_hook_ops *ops, + int family, unsigned int seq) +{ + u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET); + unsigned int portid = NETLINK_CB(nlskb).portid; + struct nlmsghdr *nlh; + int ret = -EMSGSIZE; + u32 hooknum; +#ifdef CONFIG_KALLSYMS + char sym[KSYM_SYMBOL_LEN]; + char *module_name; +#endif + nlh = nfnl_msg_put(nlskb, portid, seq, event, + NLM_F_MULTI, family, NFNETLINK_V0, 0); + if (!nlh) + goto nla_put_failure; + +#ifdef CONFIG_KALLSYMS + ret = snprintf(sym, sizeof(sym), "%ps", ops->hook); + if (ret >= sizeof(sym)) { + ret = 
-EINVAL; + goto nla_put_failure; + } + + module_name = strstr(sym, " ["); + if (module_name) { + char *end; + + *module_name = '\0'; + module_name += 2; + end = strchr(module_name, ']'); + if (end) { + *end = 0; + + ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name); + if (ret) + goto nla_put_failure; + } + } + + ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym); + if (ret) + goto nla_put_failure; +#endif + + if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS) + hooknum = NF_NETDEV_INGRESS; + else + hooknum = ops->hooknum; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum)); + if (ret) + goto nla_put_failure; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority)); + if (ret) + goto nla_put_failure; + + switch (ops->hook_ops_type) { + case NF_HOOK_OP_NF_TABLES: + ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_BPF: + ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_NFT_FT: + ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_UNDEFINED: + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (ret) + goto nla_put_failure; + + nlmsg_end(nlskb, nlh); + return 0; +nla_put_failure: + nlmsg_trim(nlskb, nlh); + return ret; +} + +static const struct nf_hook_entries * +nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) +{ + const struct nf_hook_entries *hook_head = NULL; +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) + struct net_device *netdev; +#endif + + switch (pf) { + case NFPROTO_IPV4: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + break; + case NFPROTO_ARP: +#ifdef CONFIG_NETFILTER_FAMILY_ARP + if (hook >= ARRAY_SIZE(net->nf.hooks_arp)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_arp[hook]); +#endif + break; + case NFPROTO_BRIDGE: +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); +#endif + break; +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) + case NFPROTO_NETDEV: + if (hook >= NF_NETDEV_NUMHOOKS) + return ERR_PTR(-EOPNOTSUPP); + + if (!dev) + return ERR_PTR(-ENODEV); + + netdev = dev_get_by_name_rcu(net, dev); + if (!netdev) + return ERR_PTR(-ENODEV); + +#ifdef CONFIG_NETFILTER_INGRESS + if (hook == NF_NETDEV_INGRESS) + return rcu_dereference(netdev->nf_hooks_ingress); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hook == NF_NETDEV_EGRESS) + return rcu_dereference(netdev->nf_hooks_egress); +#endif + fallthrough; +#endif + default: + return ERR_PTR(-EPROTONOSUPPORT); + } + + return hook_head; +} + +static int nfnl_hook_dump(struct sk_buff *nlskb, + struct netlink_callback *cb) +{ + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nfnl_dump_hook_data *ctx = cb->data; + int err, family = nfmsg->nfgen_family; + struct net *net = sock_net(nlskb->sk); + struct nf_hook_ops * const *ops; + const struct nf_hook_entries *e; + unsigned int i = cb->args[0]; + + rcu_read_lock(); + + e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname); + if (!e) + goto done; + + if (IS_ERR(e)) { + cb->seq++; + goto done; + } + + if ((unsigned 
long)e != ctx->headv || i >= e->num_hook_entries) + cb->seq++; + + ops = nf_hook_entries_get_hook_ops(e); + + for (; i < e->num_hook_entries; i++) { + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family, + cb->nlh->nlmsg_seq); + if (err) + break; + } + +done: + nl_dump_check_consistent(cb, nlmsg_hdr(nlskb)); + rcu_read_unlock(); + cb->args[0] = i; + return nlskb->len; +} + +static int nfnl_hook_dump_start(struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nlattr * const *nla = cb->data; + struct nfnl_dump_hook_data *ctx = NULL; + struct net *net = sock_net(cb->skb->sk); + u8 family = nfmsg->nfgen_family; + char name[IFNAMSIZ] = ""; + const void *head; + u32 hooknum; + + hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM])); + if (hooknum > 255) + return -EINVAL; + + if (family == NFPROTO_NETDEV) { + if (!nla[NFNLA_HOOK_DEV]) + return -EINVAL; + + nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name)); + } + + rcu_read_lock(); + /* Not dereferenced; for consistency check only */ + head = nfnl_hook_entries_head(family, hooknum, net, name); + rcu_read_unlock(); + + if (head && IS_ERR(head)) + return PTR_ERR(head); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + strscpy(ctx->devname, name, sizeof(ctx->devname)); + ctx->headv = (unsigned long)head; + ctx->hook = hooknum; + + cb->seq = 1; + cb->data = ctx; + + return 0; +} + +static int nfnl_hook_dump_stop(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static int nfnl_hook_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + if (!nla[NFNLA_HOOK_HOOKNUM]) + return -EINVAL; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nfnl_hook_dump_start, + .done = nfnl_hook_dump_stop, + .dump = nfnl_hook_dump, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + return -EOPNOTSUPP; +} + +static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = { + [NFNL_MSG_HOOK_GET] = { + .call = nfnl_hook_get, + .type = NFNL_CB_RCU, + .attr_count = NFNLA_HOOK_MAX, + .policy = nfnl_hook_nla_policy + }, +}; + +static const struct nfnetlink_subsystem nfhook_subsys = { + .name = "nfhook", + .subsys_id = NFNL_SUBSYS_HOOK, + .cb_count = NFNL_MSG_HOOK_MAX, + .cb = nfnl_hook_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK); + +static int __init nfnetlink_hook_init(void) +{ + return nfnetlink_subsys_register(&nfhook_subsys); +} + +static void __exit nfnetlink_hook_exit(void) +{ + nfnetlink_subsys_unregister(&nfhook_subsys); +} + +module_init(nfnetlink_hook_init); +module_exit(nfnetlink_hook_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks"); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 0ba020ca38e6..bfcb9cd335bf 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -43,6 +43,10 @@ #include "../bridge/br_private.h" #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + #define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ @@ -62,6 +66,7 @@ struct nfulnl_instance { struct sk_buff *skb; /* pre-allocatd skb */ struct timer_list timer; struct net *net; + netns_tracker ns_tracker; struct user_namespace *peer_user_ns; /* User 
namespace of the peer process */ u32 peer_portid; /* PORTID of the peer process */ @@ -98,9 +103,9 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num) } static struct nfulnl_instance * -__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) +__instance_lookup(const struct nfnl_log_net *log, u16 group_num) { - struct hlist_head *head; + const struct hlist_head *head; struct nfulnl_instance *inst; head = &log->instance_table[instance_hashfn(group_num)]; @@ -118,15 +123,25 @@ instance_get(struct nfulnl_instance *inst) } static struct nfulnl_instance * -instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) +instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num) { struct nfulnl_instance *inst; - rcu_read_lock_bh(); inst = __instance_lookup(log, group_num); if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; - rcu_read_unlock_bh(); + + return inst; +} + +static struct nfulnl_instance * +instance_lookup_get(const struct nfnl_log_net *log, u16 group_num) +{ + struct nfulnl_instance *inst; + + rcu_read_lock(); + inst = instance_lookup_get_rcu(log, group_num); + rcu_read_unlock(); return inst; } @@ -136,7 +151,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head) struct nfulnl_instance *inst = container_of(head, struct nfulnl_instance, rcu); - put_net(inst->net); + put_net_track(inst->net, &inst->ns_tracker); kfree(inst); module_put(THIS_MODULE); } @@ -183,7 +198,7 @@ instance_create(struct net *net, u_int16_t group_num, timer_setup(&inst->timer, nfulnl_timer, 0); - inst->net = get_net(net); + inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC); inst->peer_user_ns = user_ns; inst->peer_portid = portid; inst->group_num = group_num; @@ -356,8 +371,7 @@ __nfulnl_send(struct nfulnl_instance *inst) goto out; } } - nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, - MSG_DONTWAIT); + nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; @@ -367,7 +381,7 @@ static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ - if (del_timer(&inst->timer)) + if (timer_delete(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); @@ -376,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst) static void nfulnl_timer(struct timer_list *t) { - struct nfulnl_instance *inst = from_timer(inst, t, timer); + struct nfulnl_instance *inst = timer_container_of(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) @@ -453,20 +467,15 @@ __build_packet_message(struct nfnl_log_net *log, { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - nlh = nlmsg_put(inst->skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(inst->skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), + 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(inst->group_num); memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; @@ -499,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { - struct net_device *physindev; + int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -507,10 +516,10 @@ 
__build_packet_message(struct nfnl_log_net *log, htonl(indev->ifindex))) goto nla_put_failure; - physindev = nf_bridge_get_physindev(skb); - if (physindev && + physinif = nf_bridge_get_physinif(skb); + if (physinif && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(physindev->ifindex))) + htonl(physinif))) goto nla_put_failure; } #endif @@ -558,7 +567,8 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; if (indev && skb->dev && - skb->mac_header != skb->network_header) { + skb_mac_header_was_set(skb) && + skb_mac_header_len(skb) != 0) { struct nfulnl_msg_packet_hw phw; int len; @@ -588,9 +598,9 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - if (hooknum <= NF_INET_FORWARD && skb->tstamp) { + if (hooknum <= NF_INET_FORWARD) { + struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -688,15 +698,15 @@ nfulnl_log_packet(struct net *net, unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; + enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(log, li->u.ulog.group); + inst = instance_lookup_get_rcu(log, li->u.ulog.group); if (!inst) return; @@ -734,14 +744,16 @@ nfulnl_log_packet(struct net *net, size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(skb, &ctinfo); + ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); @@ -845,10 +857,8 @@ static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; -static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { return -ENOTSUPP; } @@ -869,29 +879,26 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; -static int nfulnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfula[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t group_num = ntohs(nfmsg->res_id); - struct nfulnl_instance *inst; + struct nfnl_log_net *log = nfnl_log_pernet(info->net); + u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; - struct nfnl_log_net *log = nfnl_log_pernet(net); - int ret = 0; + struct nfulnl_instance *inst; u16 flags = 0; + int ret = 0; if (nfula[NFULA_CFG_CMD]) { - u_int8_t pf = nfmsg->nfgen_family; + u_int8_t pf = info->nfmsg->nfgen_family; cmd = 
nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(net, pf, &nfulnl_logger); + return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(net, pf); + nf_log_unbind_pf(info->net, pf); return 0; } } @@ -932,7 +939,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl, goto out_put; } - inst = instance_create(net, group_num, + inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { @@ -993,11 +1000,17 @@ out: } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { - [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, }, - [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .policy = nfula_cfg_policy }, + [NFULNL_MSG_PACKET] = { + .call = nfulnl_recv_unsupp, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_MAX, + }, + [NFULNL_MSG_CONFIG] = { + .call = nfulnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy + }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { @@ -1025,7 +1038,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st) struct hlist_head *head = &log->instance_table[st->bucket]; if (!hlist_empty(head)) - return rcu_dereference_bh(hlist_first_rcu(head)); + return rcu_dereference(hlist_first_rcu(head)); } return NULL; } @@ -1033,7 +1046,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st) static struct hlist_node *get_next(struct net *net, struct iter_state *st, struct hlist_node *h) { - h = rcu_dereference_bh(hlist_next_rcu(h)); + h = rcu_dereference(hlist_next_rcu(h)); while (!h) { struct nfnl_log_net *log; struct hlist_head *head; @@ -1043,7 +1056,7 @@ static struct hlist_node *get_next(struct net *net, struct iter_state *st, log = nfnl_log_pernet(net); head = &log->instance_table[st->bucket]; - h = rcu_dereference_bh(hlist_first_rcu(head)); + h = rcu_dereference(hlist_first_rcu(head)); } return h; } @@ -1061,9 +1074,9 @@ static struct hlist_node *get_idx(struct net *net, struct iter_state *st, } static void *seq_start(struct seq_file *s, loff_t *pos) - __acquires(rcu_bh) + __acquires(rcu) { - rcu_read_lock_bh(); + rcu_read_lock(); return get_idx(seq_file_net(s), s->private, *pos); } @@ -1074,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(rcu_bh) + __releases(rcu) { - rcu_read_unlock_bh(); + rcu_read_unlock(); } static int seq_show(struct seq_file *s, void *v) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 916a3c7f9eaf..c0fc431991e8 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); + if (!ctx->optp) + return NULL; } return tcp; @@ -267,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false; memset(&ctx, 0, sizeof(ctx)); @@ -281,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, data->genre = f->genre; data->version = f->version; + found = true; break; } - return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find); @@ -292,10 +296,9 @@ 
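The nfnl_osf_add_callback() hunk that follows adds sanity checks on the user-supplied fingerprint: opt_num is capped at ARRAY_SIZE(f->opt), and genre/subtype/version must contain a NUL byte within MAXGENRELEN before they are treated as strings. A minimal sketch of the memchr() idiom, assuming the 32-byte MAXGENRELEN from the OSF uapi header; field_is_terminated() is an illustrative helper, not kernel code:

#include <stdbool.h>
#include <string.h>

#define MAXGENRELEN 32  /* assumed to match the nfnetlink_osf uapi value */

static bool field_is_terminated(const char field[MAXGENRELEN])
{
        /* the field is only safe to print or strcmp() if a NUL exists in-buffer */
        return memchr(field, '\0', MAXGENRELEN) != NULL;
}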
static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; -static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_add_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; @@ -307,11 +310,19 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + if (f->opt_num > ARRAY_SIZE(f->opt)) + return -EINVAL; + + if (!memchr(f->genre, 0, MAXGENRELEN) || + !memchr(f->subtype, 0, MAXGENRELEN) || + !memchr(f->version, 0, MAXGENRELEN)) + return -EINVAL; + kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; @@ -325,7 +336,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, kfree(kf); kf = NULL; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } @@ -339,11 +350,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, return err; } -static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_remove_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; @@ -377,11 +386,13 @@ static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, @@ -436,3 +447,5 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Passive OS fingerprint matching"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 3243a31f6e82..8b7b39d8a109 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -29,6 +29,8 @@ #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> +#include <linux/cgroup-defs.h> +#include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> @@ -167,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } @@ -223,22 +227,174 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) return entry; } -static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) +static unsigned int nf_iterate(struct sk_buff *skb, + struct nf_hook_state *state, + const struct nf_hook_entries *hooks, + unsigned int *index) +{ + const struct nf_hook_entry *hook; + unsigned 
int verdict, i = *index; + + while (i < hooks->num_hook_entries) { + hook = &hooks->hooks[i]; +repeat: + verdict = nf_hook_entry_hookfn(hook, skb, state); + if (verdict != NF_ACCEPT) { + *index = i; + if (verdict != NF_REPEAT) + return verdict; + goto repeat; + } + i++; + } + + *index = i; + return NF_ACCEPT; +} + +static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) +{ + switch (pf) { +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + case NFPROTO_BRIDGE: + return rcu_dereference(net->nf.hooks_bridge[hooknum]); +#endif + case NFPROTO_IPV4: + return rcu_dereference(net->nf.hooks_ipv4[hooknum]); + case NFPROTO_IPV6: + return rcu_dereference(net->nf.hooks_ipv6[hooknum]); + default: + WARN_ON_ONCE(1); + return NULL; + } + + return NULL; +} + +static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +{ +#ifdef CONFIG_INET + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) + return ip_route_me_harder(entry->state.net, entry->state.sk, + skb, RTN_UNSPEC); + } +#endif + return 0; +} + +static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) +{ + const struct nf_ipv6_ops *v6ops; + int ret = 0; + + switch (entry->state.pf) { + case AF_INET: + ret = nf_ip_reroute(skb, entry); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + ret = v6ops->reroute(skb, entry); + break; + } + return ret; +} + +/* caller must hold rcu read-side lock */ +static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct nf_ct_hook *ct_hook; + const struct nf_hook_entry *hook_entry; + const struct nf_hook_entries *hooks; + struct sk_buff *skb = entry->skb; + const struct net *net; + unsigned int i; int err; + u8 pf; + + net = entry->state.net; + pf = entry->state.pf; + + hooks = nf_hook_entries_head(net, pf, entry->state.hook); + + i = entry->hook_index; + if (!hooks || i >= hooks->num_hook_entries) { + kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); + nf_queue_entry_free(entry); + return; + } + + hook_entry = &hooks->hooks[i]; + + /* Continue traversal iff userspace said ok... 
*/ + if (verdict == NF_REPEAT) + verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); + + if (verdict == NF_ACCEPT) { + if (nf_reroute(skb, entry) < 0) + verdict = NF_DROP; + } + + if (verdict == NF_ACCEPT) { +next_hook: + ++i; + verdict = nf_iterate(skb, &entry->state, hooks, &i); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_STOP: + local_bh_disable(); + entry->state.okfn(entry->state.net, entry->state.sk, skb); + local_bh_enable(); + break; + case NF_QUEUE: + err = nf_queue(skb, &entry->state, i, verdict); + if (err == 1) + goto next_hook; + break; + case NF_STOLEN: + break; + default: + kfree_skb(skb); + } + + nf_queue_entry_free(entry); +} + +static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + const struct nf_ct_hook *ct_hook; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { + unsigned int ct_verdict = verdict; + rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - if (ct_hook) { - err = ct_hook->update(entry->state.net, entry->skb); - if (err < 0) - verdict = NF_DROP; - } + if (ct_hook) + ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); + + switch (ct_verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + /* follow userspace verdict, could be REPEAT */ + break; + case NF_STOLEN: + nf_queue_entry_free(entry); + return; + default: + verdict = ct_verdict & NF_VERDICT_MASK; + break; + } } nf_reinject(entry, verdict); } @@ -301,18 +457,31 @@ nla_put_failure: return -1; } -static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) +{ +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + if (sk && sk_fullsock(sk)) { + u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data); + + if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid))) + return -1; + } +#endif + return 0; +} + +static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { - u32 seclen = 0; + int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) - security_secid_to_secctx(skb->secmark, secdata, &seclen); - + seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; @@ -371,6 +540,14 @@ nla_put_failure: return -1; } +static int nf_queue_checksum_help(struct sk_buff *entskb) +{ + if (skb_csum_is_sctp(entskb)) + return skb_crc32c_csum_help(entskb); + + return skb_checksum_help(entskb); +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -383,16 +560,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); - struct nfnl_ct_hook *nfnl_ct; + enum ip_conntrack_info ctinfo = 0; + const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; - char *secdata = NULL; - u32 seclen = 0; + struct lsm_context ctx = { NULL, 0, 0 }; + int seclen = 0; + ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -403,11 +580,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + 
nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + + nla_total_size(sizeof(u_int32_t)) /* classid */ +#endif + nla_total_size(sizeof(u_int32_t)); /* cap_len */ - if (entskb->tstamp) + tstamp = skb_tstamp_cond(entskb, false); + if (tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); @@ -428,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) + nf_queue_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); @@ -444,13 +626,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfnl_ct = rcu_dereference(nfnl_ct_hook); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(entskb, &ctinfo); + ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ @@ -458,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { - seclen = nfqnl_get_sk_secctx(entskb, &secdata); + seclen = nfqnl_get_sk_secctx(entskb, &ctx); + if (seclen < 0) + return NULL; if (seclen) size += nla_total_size(seclen); } @@ -469,18 +655,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nlmsg_failure; } - nlh = nlmsg_put(skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), + 0, entry->state.pf, NFNETLINK_V0, + htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); @@ -561,8 +744,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; + if (entskb->priority && + nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority))) + goto nla_put_failure; + if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb) && + skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len; @@ -578,9 +766,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; - if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { + if (entry->state.hook <= NF_INET_FORWARD && tstamp) { struct nfqnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); + struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -593,7 +781,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; - if (seclen && 
nla_put(skb, NFQA_SECCTX, seclen, secdata)) + if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) @@ -621,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return skb; nla_put_failure: @@ -630,8 +821,8 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return NULL; } @@ -639,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + unsigned long status; + unsigned int use; + + if (!ct) + return false; - if (ct && ((ct->status & flags) == IPS_DYING)) + status = READ_ONCE(ct->status); + if ((status & flags) == IPS_DYING) return true; + + if (status & IPS_CONFIRMED) + return false; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * We can't be solved with serialization here because one clone could + * have been queued for local delivery. + */ + use = refcount_read(&ct->ct_general.use); + if (likely(use == 1)) + return false; + + /* Can't decrement further? Exclusive ownership. */ + if (!refcount_dec_not_one(&ct->ct_general.use)) + return false; + + skb_set_nfct(entry->skb, 0); + /* No nf_ct_put(): we already decremented .use and it cannot + * drop down to 0. 
+ */ + return true; #endif return false; } @@ -681,7 +903,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; @@ -711,9 +933,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) @@ -796,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); @@ -831,11 +1059,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) } static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { + unsigned int min_len = skb_transport_offset(e->skb); + + if (data_len < min_len) + return -EINVAL; + if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { @@ -953,6 +1186,16 @@ static void nfqnl_nf_hook_drop(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; + /* This function is also called on net namespace error unwind, + * when pernet_ops->init() failed and ->exit() functions of the + * previous pernet_ops gets called. + * + * This may result in a call to nfqnl_nf_hook_drop() before + * struct nfnl_queue_net was allocated. 
+ */ + if (!q) + return; + for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; @@ -1005,11 +1248,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, [NFQA_VLAN] = { .type = NLA_NESTED }, + [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static struct nfqnl_instance * @@ -1048,20 +1293,17 @@ static int nfq_id_after(unsigned int id, unsigned int max) return (int)(id - max) > 0; } -static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict_batch(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; - unsigned int verdict, maxid; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; + unsigned int verdict, maxid; LIST_HEAD(batch_list); - u16 queue_num = ntohs(nfmsg->res_id); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); @@ -1093,20 +1335,24 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + if (nfqa[NFQA_PRIORITY]) + entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); + nfqnl_reinject(entry, verdict); } return 0; } -static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, +static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[], struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; - ct = nfnl_ct->get_ct(entry->skb, ctinfo); + ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; @@ -1118,6 +1364,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; +#else + return NULL; +#endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, @@ -1156,22 +1405,18 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, return 0; } -static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); + const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; + enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; - unsigned int verdict; struct nf_queue_entry *entry; - enum ip_conntrack_info uninitialized_var(ctinfo); - struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + 
unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, @@ -1194,7 +1439,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) - ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); + ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, + &ctinfo); } if (entry->state.pf == PF_BRIDGE) { @@ -1218,14 +1464,15 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + if (nfqa[NFQA_PRIORITY]) + entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); + nfqnl_reinject(entry, verdict); return 0; } -static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { return -ENOTSUPP; } @@ -1243,16 +1490,13 @@ static const struct nf_queue_handler nfqh = { .nf_hook_drop = nfqnl_nf_hook_drop, }; -static int nfqnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; @@ -1371,17 +1615,29 @@ err_out_unlock: } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_policy }, - [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .policy = nfqa_cfg_policy }, - [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_batch_policy }, + [NFQNL_MSG_PACKET] = { + .call = nfqnl_recv_unsupp, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + }, + [NFQNL_MSG_VERDICT] = { + .call = nfqnl_recv_verdict, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy + }, + [NFQNL_MSG_CONFIG] = { + .call = nfqnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy + }, + [NFQNL_MSG_VERDICT_BATCH] = { + .call = nfqnl_recv_verdict_batch, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy + }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { @@ -1499,7 +1755,6 @@ static int __net_init nfnl_queue_net_init(struct net *net) &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif - nf_register_queue_handler(net, &nfqh); return 0; } @@ -1508,7 +1763,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; - nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif @@ -1516,15 +1770,9 @@ static void __net_exit nfnl_queue_net_exit(struct net *net) 
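With NFQA_PRIORITY accepted in verdict and batch-verdict messages above, userspace can rewrite skb->priority the same way it already rewrites the mark. A hedged libmnl sketch; put_priority() is an illustrative helper, and it assumes the nfnetlink_queue uapi header and that, as on the kernel side, the attribute carries a 32-bit value in network byte order:

#include <arpa/inet.h>
#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink_queue.h>

static void put_priority(struct nlmsghdr *nlh, uint32_t prio)
{
        /* the kernel reads this with ntohl(nla_get_be32(...)) */
        mnl_attr_put_u32(nlh, NFQA_PRIORITY, htonl(prio));
}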
WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } -static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) -{ - synchronize_rcu(); -} - static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, - .exit_batch = nfnl_queue_net_exit_batch, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; @@ -1552,6 +1800,8 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_subsys; } + nf_register_queue_handler(&nfqh); + return status; cleanup_netlink_subsys: @@ -1565,6 +1815,7 @@ out: static void __exit nfnetlink_queue_fini(void) { + nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index bc37d6c59db4..d550910aabec 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -16,8 +16,9 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_bitwise { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 sreg2; + u8 dreg; enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; @@ -25,12 +26,12 @@ struct nft_bitwise { struct nft_data data; }; -static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, - const struct nft_bitwise *priv) +static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) { unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; } @@ -60,101 +61,184 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, } } +static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] & src2[i]; +} + +static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] | src2[i]; +} + +static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] ^ src2[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg]; + const u32 *src = ®s->data[priv->sreg], *src2; u32 *dst = ®s->data[priv->dreg]; - switch (priv->op) { - case NFT_BITWISE_BOOL: - nft_bitwise_eval_bool(dst, src, priv); - break; - case NFT_BITWISE_LSHIFT: + if (priv->op == NFT_BITWISE_MASK_XOR) { + nft_bitwise_eval_mask_xor(dst, src, priv); + return; + } + if (priv->op == NFT_BITWISE_LSHIFT) { nft_bitwise_eval_lshift(dst, src, priv); - break; - case NFT_BITWISE_RSHIFT: + return; + } + if (priv->op == NFT_BITWISE_RSHIFT) { nft_bitwise_eval_rshift(dst, src, priv); - break; + return; + } + + src2 = priv->sreg2 ? 
®s->data[priv->sreg2] : priv->data.data; + + if (priv->op == NFT_BITWISE_AND) { + nft_bitwise_eval_and(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_OR) { + nft_bitwise_eval_or(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_XOR) { + nft_bitwise_eval_xor(dst, src, src2, priv); + return; } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_SREG2] = { .type = NLA_U32 }, [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, - [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init_bool(struct nft_bitwise *priv, - const struct nlattr *const tb[]) +static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { - struct nft_data_desc mask, xor; + struct nft_data_desc mask = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->mask), + .len = priv->len, + }; + struct nft_data_desc xor = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->xor), + .len = priv->len, + }; int err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, - tb[NFTA_BITWISE_MASK]); + err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { - err = -EINVAL; - goto err1; - } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, - tb[NFTA_BITWISE_XOR]); + err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) - goto err1; - if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { - err = -EINVAL; - goto err2; - } + goto err_xor_err; return 0; -err2: - nft_data_release(&priv->xor, xor.type); -err1: + +err_xor_err: nft_data_release(&priv->mask, mask.type); + return err; } static int nft_bitwise_init_shift(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = sizeof(u32), + }; int err; if (tb[NFTA_BITWISE_MASK] || - tb[NFTA_BITWISE_XOR]) + tb[NFTA_BITWISE_XOR] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, - tb[NFTA_BITWISE_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]); if (err < 0) return err; - if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || - priv->data.data[0] >= BITS_PER_TYPE(u32)) { - nft_data_release(&priv->data, d.type); + + if (priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, desc.type); return -EINVAL; } return 0; } +static int nft_bitwise_init_bool(const struct nft_ctx *ctx, + struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) || + (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2])) + return -EINVAL; + + if (tb[NFTA_BITWISE_DATA]) { + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = priv->len, + }; + + err = 
nft_data_init(NULL, &priv->data, &desc, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + } else { + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2], + &priv->sreg2, priv->len); + if (err < 0) + return err; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -163,57 +247,60 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, u32 len; int err; - if (!tb[NFTA_BITWISE_SREG] || - !tb[NFTA_BITWISE_DREG] || - !tb[NFTA_BITWISE_LEN]) - return -EINVAL; - err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); if (err < 0) return err; if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { - case NFT_BITWISE_BOOL: + case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: break; default: return -EOPNOTSUPP; } } else { - priv->op = NFT_BITWISE_BOOL; + priv->op = NFT_BITWISE_MASK_XOR; } switch(priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_init_bool(priv, tb); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_init_mask_xor(priv, tb); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_init_shift(priv, tb); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_init_bool(ctx, priv, tb); + break; } return err; } -static int nft_bitwise_dump_bool(struct sk_buff *skb, - const struct nft_bitwise *priv) +static int nft_bitwise_dump_mask_xor(struct sk_buff *skb, + const struct nft_bitwise *priv) { if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) @@ -235,7 +322,23 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } -static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (priv->sreg2) { + if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2)) + return -1; + } else { + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + } + + return 0; +} + +static int nft_bitwise_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise *priv = nft_expr_priv(expr); int err = 0; @@ -250,13 +353,18 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) return -1; switch (priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_dump_bool(skb, priv); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_dump_mask_xor(skb, priv); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_dump_shift(skb, priv); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_dump_bool(skb, priv); + break; } return err; @@ -271,7 +379,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); 
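Before the explicit NFT_BITWISE_AND/OR/XOR ops introduced here, boolean operations with an immediate were typically encoded through the mask/xor pair evaluated above as (src & mask) ^ xor. The encodings are easy to verify in plain C; check_mask_xor_identities() is only an illustration:

#include <assert.h>
#include <stdint.h>

static void check_mask_xor_identities(uint32_t x, uint32_t m)
{
        assert(((x & m)   ^ 0u) == (x & m));    /* AND m: mask = m,  xor = 0 */
        assert(((x & ~m)  ^ m)  == (x | m));    /* OR  m: mask = ~m, xor = m */
        assert(((x & ~0u) ^ m)  == (x ^ m));    /* XOR m: mask = ~0, xor = m */
}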
struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; - if (priv->op != NFT_BITWISE_BOOL) + if (priv->op != NFT_BITWISE_MASK_XOR) return -EOPNOTSUPP; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || @@ -283,19 +391,257 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_bitwise_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + const struct nft_bitwise *bitwise; + unsigned int regcount; + u8 dreg; + int i; + + if (!track->regs[priv->sreg].selector) + return false; + + bitwise = nft_expr_priv(track->regs[priv->dreg].selector); + if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && + track->regs[priv->sreg].num_reg == 0 && + track->regs[priv->dreg].bitwise && + track->regs[priv->dreg].bitwise->ops == expr->ops && + priv->sreg == bitwise->sreg && + priv->sreg2 == bitwise->sreg2 && + priv->dreg == bitwise->dreg && + priv->op == bitwise->op && + priv->len == bitwise->len && + !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) && + !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) && + !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) { + track->cur = expr; + return true; + } + + if (track->regs[priv->sreg].bitwise || + track->regs[priv->sreg].num_reg != 0) { + nft_reg_track_cancel(track, priv->dreg, priv->len); + return false; + } + + if (priv->sreg != priv->dreg) { + nft_reg_track_update(track, track->regs[priv->sreg].selector, + priv->dreg, priv->len); + } + + dreg = priv->dreg; + regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE); + for (i = 0; i < regcount; i++, dreg++) + track->regs[dreg].bitwise = expr; + + return false; +} + static const struct nft_expr_ops nft_bitwise_ops = { .type = &nft_bitwise_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), .eval = nft_bitwise_eval, .init = nft_bitwise_init, .dump = nft_bitwise_dump, + .reduce = nft_bitwise_reduce, .offload = nft_bitwise_offload, }; +static int +nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out) +{ + struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + .len = sizeof(u32), + }; + int err; + + err = nft_data_init(NULL, &data, &desc, tb); + if (err < 0) + return err; + + *out = data.data[0]; + + return 0; +} + +static int nft_bitwise_fast_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + int err; + + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, + sizeof(u32)); + if (err < 0) + return err; + + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); + if (err < 0) + return err; + + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_MASK], &priv->mask); + if (err < 0) + return err; + + err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_XOR], &priv->xor); + if (err < 0) + return err; + + return 0; +} + +static int +nft_bitwise_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) +{ + const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + struct nft_data data; + + if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) + return -1; + if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) + return -1; + if 
(nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32)))) + return -1; + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR))) + return -1; + + data.data[0] = priv->mask; + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + + data.data[0] = priv->xor; + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + + return 0; +} + +static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + if (priv->xor || priv->sreg != priv->dreg || reg->len != sizeof(u32)) + return -EOPNOTSUPP; + + reg->mask.data[0] = priv->mask; + return 0; +} + +static bool nft_bitwise_fast_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); + const struct nft_bitwise_fast_expr *bitwise; + + if (!track->regs[priv->sreg].selector) + return false; + + bitwise = nft_expr_priv(track->regs[priv->dreg].selector); + if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && + track->regs[priv->dreg].bitwise && + track->regs[priv->dreg].bitwise->ops == expr->ops && + priv->sreg == bitwise->sreg && + priv->dreg == bitwise->dreg && + priv->mask == bitwise->mask && + priv->xor == bitwise->xor) { + track->cur = expr; + return true; + } + + if (track->regs[priv->sreg].bitwise) { + nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); + return false; + } + + if (priv->sreg != priv->dreg) { + track->regs[priv->dreg].selector = + track->regs[priv->sreg].selector; + } + track->regs[priv->dreg].bitwise = expr; + + return false; +} + +const struct nft_expr_ops nft_bitwise_fast_ops = { + .type = &nft_bitwise_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)), + .eval = NULL, /* inlined */ + .init = nft_bitwise_fast_init, + .dump = nft_bitwise_fast_dump, + .reduce = nft_bitwise_fast_reduce, + .offload = nft_bitwise_fast_offload, +}; + +static const struct nft_expr_ops * +nft_bitwise_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + int err; + u32 len; + + if (!tb[NFTA_BITWISE_LEN] || + !tb[NFTA_BITWISE_SREG] || + !tb[NFTA_BITWISE_DREG]) + return ERR_PTR(-EINVAL); + + err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); + if (err < 0) + return ERR_PTR(err); + + if (len != sizeof(u32)) + return &nft_bitwise_ops; + + if (tb[NFTA_BITWISE_OP] && + ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR) + return &nft_bitwise_ops; + + return &nft_bitwise_fast_ops; +} + struct nft_expr_type nft_bitwise_type __read_mostly = { .name = "bitwise", - .ops = &nft_bitwise_ops, + .select_ops = nft_bitwise_select_ops, .policy = nft_bitwise_policy, .maxattr = NFTA_BITWISE_MAX, .owner = THIS_MODULE, }; + +bool nft_expr_reduce_bitwise(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_expr *last = track->last; + const struct nft_expr *next; + + if (expr == last) + return false; + + next = nft_expr_next(expr); + if (next->ops == &nft_bitwise_ops) + return nft_bitwise_reduce(track, next); + else if (next->ops == &nft_bitwise_fast_ops) + return nft_bitwise_fast_reduce(track, next); + + return false; +} +EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise); diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 12bed3f7bbc6..af9206a3afd1 100644 --- 
a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -5,7 +5,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -16,8 +16,8 @@ #include <net/netfilter/nf_tables.h> struct nft_byteorder { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; @@ -30,28 +30,30 @@ void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - union { u32 u32; u16 u16; } *s, *d; + u16 *s16, *d16; unsigned int i; - s = (void *)src; - d = (void *)dst; + s16 = (void *)src; + d16 = (void *)dst; switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], be64_to_cpu(src64)); + nft_reg_store64(&dst64[i], + be64_to_cpu((__force __be64)src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } @@ -61,11 +63,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = ntohl((__force __be32)s[i].u32); + dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = (__force __u32)htonl(s[i].u32); + dst[i] = (__force __u32)htonl(src[i]); break; } break; @@ -73,11 +75,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = ntohs((__force __be16)s[i].u16); + d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = (__force __u16)htons(s[i].u16); + d16[i] = (__force __u16)htons(s16[i]); break; } break; @@ -87,9 +89,9 @@ void nft_byteorder_eval(const struct nft_expr *expr, static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, @@ -131,23 +133,24 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, + priv->len); if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } -static int nft_byteorder_dump(struct sk_buff *skb, 
const struct nft_expr *expr) +static int nft_byteorder_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); @@ -167,12 +170,23 @@ nla_put_failure: return -1; } +static bool nft_byteorder_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + struct nft_byteorder *priv = nft_expr_priv(expr); + + nft_reg_track_cancel(track, priv->dreg, priv->len); + + return false; +} + static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, + .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c78d01bc02e9..b16185e9a6dd 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } @@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } @@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } @@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, switch (state->pf) { case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; default: break; @@ -161,16 +161,49 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, return nft_do_chain(&pkt, priv); } +static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_hook_state ingress_state = *state; + struct nft_pktinfo pkt; + + switch (skb->protocol) { + case htons(ETH_P_IP): + /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. 
*/ + ingress_state.pf = NFPROTO_IPV4; + ingress_state.hook = NF_INET_INGRESS; + nft_set_pktinfo(&pkt, skb, &ingress_state); + + if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) + return NF_DROP; + break; + case htons(ETH_P_IPV6): + ingress_state.pf = NFPROTO_IPV6; + ingress_state.hook = NF_INET_INGRESS; + nft_set_pktinfo(&pkt, skb, &ingress_state); + + if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) + return NF_DROP; + break; + default: + return NF_ACCEPT; + } + + return nft_do_chain(&pkt, priv); +} + static const struct nft_chain_type nft_chain_filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_INET, - .hook_mask = (1 << NF_INET_LOCAL_IN) | + .hook_mask = (1 << NF_INET_INGRESS) | + (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { + [NF_INET_INGRESS] = nft_do_chain_inet_ingress, [NF_INET_LOCAL_IN] = nft_do_chain_inet, [NF_INET_LOCAL_OUT] = nft_do_chain_inet, [NF_INET_FORWARD] = nft_do_chain_inet, @@ -205,13 +238,13 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } @@ -260,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } @@ -277,79 +310,122 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - struct nft_hook *hook, *found = NULL; - int n = 0; - - if (event != NETDEV_UNREGISTER) - return; + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; + struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev == dev) - found = hook; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - n++; - } - if (!found) - return; - - if (n > 1) { - nf_unregister_net_hook(ctx->net, &found->ops); - list_del_rcu(&found->list); - kfree_rcu(found, rcu); - return; + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), ops); + + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = 
kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + break; } + return 0; +} + +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nft_base_chain *basechain; + struct nftables_pernet *nft_net; + struct nft_chain *chain; + struct nft_table *table; + + nft_net = nft_pernet(dev_net(dev)); + list_for_each_entry(table, &nft_net->tables, list) { + if (table->family != NFPROTO_NETDEV && + table->family != NFPROTO_INET) + continue; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ - __nft_release_basechain(ctx); + basechain = nft_base_chain(chain); + if (table->family == NFPROTO_INET && + basechain->ops.hooknum != NF_INET_INGRESS) + continue; + + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; + } + } + return 0; } static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_table *table; - struct nft_chain *chain, *nr; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; - if (event != NETDEV_UNREGISTER && + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && event != NETDEV_CHANGENAME) return NOTIFY_DONE; - mutex_lock(&ctx.net->nft.commit_mutex); - list_for_each_entry(table, &ctx.net->nft.tables, list) { - if (table->family != NFPROTO_NETDEV) - continue; + nft_net = nft_pernet(dev_net(dev)); + mutex_lock(&nft_net->commit_mutex); - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; - - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; } - mutex_unlock(&ctx.net->nft.commit_mutex); - - return NOTIFY_DONE; +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index eac4a901233f..40e230d8b712 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, switch (state->pf) { #ifdef CONFIG_NF_TABLES_IPV4 case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; #endif #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; #endif default: @@ -137,6 +137,7 @@ module_init(nft_chain_nat_init); module_exit(nft_chain_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nftables network address translation support"); #ifdef CONFIG_NF_TABLES_IPV4 
MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); #endif diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index 8826bbe71136..925db0dce48d 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv, u8 tos; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); mark = skb->mark; iph = ip_hdr(skb); @@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv, int err; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); @@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv, skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u32 *)ipv6_hdr(skb)))) { - err = nf_ip6_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 8a28c127effc..2605f43737bc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -18,7 +18,7 @@ struct nft_cmp_expr { struct nft_data data; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_cmp_ops op:8; }; @@ -43,7 +43,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_LT: if (d == 0) goto mismatch; - /* fall through */ + fallthrough; case NFT_CMP_LTE: if (d > 0) goto mismatch; @@ -51,7 +51,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_GT: if (d == 0) goto mismatch; - /* fall through */ + fallthrough; case NFT_CMP_GTE: if (d < 0) goto mismatch; @@ -73,22 +73,17 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; int err; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - nft_data_release(&priv->data, desc.type); - return err; - } - - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -97,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } -static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); @@ -115,21 +111,58 @@ nla_put_failure: return -1; } +union nft_cmp_offload_data { + u16 val16; + u32 val32; + u64 val64; +}; + +static void nft_payload_n2h(union nft_cmp_offload_data *data, + const u8 *val, u32 len) +{ + switch (len) { + case 2: + data->val16 = ntohs(*((__be16 *)val)); + break; + case 4: + data->val32 = 
ntohl(*((__be32 *)val)); + break; + case 8: + data->val64 = be64_to_cpu(*((__be64 *)val)); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + static int __nft_cmp_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_cmp_expr *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + union nft_cmp_offload_data _data, _datamask; u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; + u8 *data, *datamask; - if (priv->op != NFT_CMP_EQ || reg->len != priv->len) + if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; - memcpy(key + reg->offset, &priv->data, priv->len); - memcpy(mask + reg->offset, ®->mask, priv->len); + if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) { + nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len); + nft_payload_n2h(&_datamask, (u8 *)®->mask, reg->len); + data = (u8 *)&_data; + datamask = (u8 *)&_datamask; + } else { + data = (u8 *)&priv->data; + datamask = (u8 *)®->mask; + } + + memcpy(key + reg->offset, data, reg->len); + memcpy(mask + reg->offset, datamask, reg->len); - flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && @@ -137,7 +170,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, nft_reg_load16(priv->data.data) != ARPHRD_ETHER) return -EOPNOTSUPP; - nft_offload_update_dependency(ctx, &priv->data, priv->len); + nft_offload_update_dependency(ctx, &priv->data, reg->len); return 0; } @@ -157,34 +190,48 @@ static const struct nft_expr_ops nft_cmp_ops = { .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, + .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_offload, }; +/* Calculate the mask for the nft_cmp_fast expression. On big endian the + * mask needs to include the *upper* bytes when interpreting that data as + * something smaller than the full u32, therefore a cpu_to_le32 is done. + */ +static u32 nft_cmp_fast_mask(unsigned int len) +{ + __le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr, + data) * BITS_PER_BYTE - len)); + + return (__force u32)mask; +} + static int nft_cmp_fast_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; struct nft_data data; - u32 mask; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; desc.len *= BITS_PER_BYTE; - mask = nft_cmp_fast_mask(desc.len); - priv->data = data.data[0] & mask; + priv->mask = nft_cmp_fast_mask(desc.len); + priv->data = data.data[0] & priv->mask; priv->len = desc.len; + priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ; return 0; } @@ -201,20 +248,22 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, }, .sreg = priv->sreg, .len = priv->len / BITS_PER_BYTE, - .op = NFT_CMP_EQ, + .op = priv->inv ? 
NFT_CMP_NEQ : NFT_CMP_EQ, }; return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; struct nft_data data; if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op))) goto nla_put_failure; data.data[0] = priv->data; @@ -233,15 +282,114 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, + .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_fast_offload, }; +static u32 nft_cmp_mask(u32 bitlen) +{ + return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen)); +} + +static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen) +{ + int len = bitlen / BITS_PER_BYTE; + int i, words = len / sizeof(u32); + + for (i = 0; i < words; i++) { + data->data[i] = 0xffffffff; + bitlen -= sizeof(u32) * BITS_PER_BYTE; + } + + if (len % sizeof(u32)) + data->data[i++] = nft_cmp_mask(bitlen); + + for (; i < 4; i++) + data->data[i] = 0; +} + +static int nft_cmp16_fast_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; + int err; + + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); + if (err < 0) + return err; + + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + if (err < 0) + return err; + + nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE); + priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ; + priv->len = desc.len; + + return 0; +} + +static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + struct nft_cmp_expr cmp = { + .data = priv->data, + .sreg = priv->sreg, + .len = priv->len, + .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ, + }; + + return __nft_cmp_offload(ctx, flow, &cmp); +} + +static int nft_cmp16_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + enum nft_cmp_ops op = priv->inv ? 
NFT_CMP_NEQ : NFT_CMP_EQ; + + if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + + +const struct nft_expr_ops nft_cmp16_fast_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)), + .eval = NULL, /* inlined */ + .init = nft_cmp16_fast_init, + .dump = nft_cmp16_fast_dump, + .reduce = NFT_REDUCE_READONLY, + .offload = nft_cmp16_fast_offload, +}; + static const struct nft_expr_ops * nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; enum nft_cmp_ops op; + u8 sreg; int err; if (tb[NFTA_CMP_SREG] == NULL || @@ -262,23 +410,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return ERR_PTR(-EINVAL); } - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) - return &nft_cmp_fast_ops; + sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) { + if (desc.len <= sizeof(u32)) + return &nft_cmp_fast_ops; + else if (desc.len <= sizeof(data) && + ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) || + (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0))) + return &nft_cmp16_fast_ops; + } return &nft_cmp_ops; -err1: - nft_data_release(&data, desc.type); - return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f9adca62ccb3..72711d62fddf 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 @@ -57,8 +58,13 @@ union nft_entry { }; static inline void -nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +nft_compat_set_par(struct xt_action_param *par, + const struct nft_pktinfo *pkt, + const void *xt, const void *xt_info) { + par->state = pkt->state; + par->thoff = nft_thoff(pkt); + par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; @@ -71,13 +77,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -97,13 +104,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param 
*)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -127,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -192,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 l4proto; u32 flags; int err; @@ -204,15 +213,32 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); - if (flags & ~NFT_RULE_COMPAT_F_MASK) + if (flags & NFT_RULE_COMPAT_F_UNUSED || + flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; - *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + if (l4proto > U16_MAX) + return -EINVAL; + + *proto = l4proto; + return 0; } +static void nft_compat_wait_for_destructors(struct net *net) +{ + /* xtables matches or targets can have side effects, e.g. + * creation/destruction of /proc files. + * The xt ->destroy functions are run asynchronously from + * work queue. If we have pending invocations we thus + * need to wait for those to finish. + */ + nf_tables_trans_destroy_flush_work(net); +} + static int nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -236,9 +262,25 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + nft_compat_wait_for_destructors(ctx->net); + ret = xt_check_target(&par, size, proto, inv); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { + const char *modname = NULL; + + if (strcmp(target->name, "LOG") == 0) + modname = "nf_log_syslog"; + else if (strcmp(target->name, "NFLOG") == 0) + modname = "nfnetlink_log"; + + if (modname && + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) + return -EAGAIN; + } + return ret; + } /* The standard target cannot be used */ if (!target->target) @@ -247,6 +289,12 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } +static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) +{ + module_put(me); + kfree(expr->ops); +} + static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -262,8 +310,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.target->destroy != NULL) par.target->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, @@ -284,7 +331,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr, return 0; } -static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_target_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); @@ -302,13 +350,28 @@ nla_put_failure: } static 
int nft_target_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -332,13 +395,14 @@ static void __nft_match_eval(const struct nft_expr *expr, { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; bool ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + nft_compat_set_par(&xt, pkt, match, info); - ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + ret = match->match(skb, &xt); - if (pkt->xt.hotdrop) { + if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } @@ -371,7 +435,7 @@ static void nft_match_eval(const struct nft_expr *expr, static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -451,6 +515,8 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + nft_compat_wait_for_destructors(ctx->net); + return xt_check_match(&par, size, proto, inv); } @@ -469,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *m = expr->ops->data; int ret; - priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; @@ -494,8 +560,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, if (par.match->destroy != NULL) par.match->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static void @@ -530,12 +595,14 @@ nla_put_failure: return -1; } -static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_match_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } -static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e) +static int nft_match_large_dump(struct sk_buff *skb, + const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); @@ -543,13 +610,28 @@ static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e) } static int nft_match_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << 
NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -572,19 +654,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int rev, int target) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) @@ -599,17 +676,15 @@ nla_put_failure: return -1; } -static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_compat_get_rcu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const tb[]) { + u8 family = info->nfmsg->nfgen_family; + const char *name, *fmt; + struct sk_buff *skb2; int ret = 0, target; - struct nfgenmsg *nfmsg; - const char *fmt; - const char *name; u32 rev; - struct sk_buff *skb2; if (tb[NFTA_COMPAT_NAME] == NULL || tb[NFTA_COMPAT_REV] == NULL || @@ -620,9 +695,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(nlh); - - switch(nfmsg->nfgen_family) { + switch(family) { case AF_INET: fmt = "ipt_%s"; break; @@ -636,8 +709,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, fmt = "arpt_%s"; break; default: - pr_err("nft_compat: unsupported protocol %d\n", - nfmsg->nfgen_family); + pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } @@ -645,9 +717,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, return -EINVAL; rcu_read_unlock(); - try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, - rev, target, &ret), - fmt, name); + try_then_request_module(xt_find_revision(family, name, rev, target, &ret), + fmt, name); if (ret < 0) goto out_put; @@ -659,36 +730,36 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, - nfmsg->nfgen_family, - name, ret, target) <= 0) { + family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); - return ret == -EAGAIN ? 
-ENOBUFS : ret; + + return ret; } static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, - [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { - [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu, - .attr_count = NFTA_COMPAT_MAX, - .policy = nfnl_compat_policy_get }, + [NFNL_MSG_COMPAT_GET] = { + .call = nfnl_compat_get_rcu, + .type = NFNL_CB_RCU, + .attr_count = NFTA_COMPAT_MAX, + .policy = nfnl_compat_policy_get + }, }; static const struct nfnetlink_subsystem nfnl_compat_subsys = { @@ -700,6 +771,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static struct nft_expr_type nft_match_type; +static bool nft_match_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct xt_match *match = expr->ops->data; + + return strcmp(match->name, "comment") == 0; +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -729,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -742,6 +821,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, ops->dump = nft_match_dump; ops->validate = nft_match_validate; ops->data = match; + ops->reduce = nft_match_reduce; matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); if (matchsize > NFT_MATCH_LARGE_THRESH) { @@ -818,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -831,6 +911,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; + ops->reduce = NFT_REDUCE_READONLY; if (family == NFPROTO_BRIDGE) ops->eval = nft_target_eval_bridge; @@ -902,3 +983,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("match"); MODULE_ALIAS_NFT_EXPR("target"); +MODULE_DESCRIPTION("x_tables over nftables support"); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 69d6173f91e2..657764774a2d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,7 +14,7 @@ #include <net/netfilter/nf_conntrack_zones.h> struct nft_connlimit { - struct nf_conncount_list list; + struct nf_conncount_list *list; u32 limit; bool invert; }; @@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if 
(nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { - regs->verdict.code = NF_DROP; - return; + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } - count = priv->list.count; + count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -62,6 +56,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, { bool invert = false; u32 flags, limit; + int err; if (!tb[NFTA_CONNLIMIT_COUNT]) return -EINVAL; @@ -76,18 +71,31 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - nf_conncount_list_init(&priv->list); + priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT); + if (!priv->list) + return -ENOMEM; + + nf_conncount_list_init(priv->list); priv->limit = limit; priv->invert = invert; - return nf_ct_netns_get(ctx->net, ctx->family); + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err < 0) + goto err_netns; + + return 0; +err_netns: + kfree(priv->list); + + return err; } static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, struct nft_connlimit *priv) { nf_ct_netns_put(ctx->net, ctx->family); - nf_conncount_cache_free(&priv->list); + nf_conncount_cache_free(priv->list); + kfree(priv->list); } static int nft_connlimit_do_dump(struct sk_buff *skb, @@ -123,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + WRITE_ONCE(priv->limit, newpriv->limit); + WRITE_ONCE(priv->invert, newpriv->invert); +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -152,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { @@ -171,7 +190,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr, nft_connlimit_do_eval(priv, regs, pkt, NULL); } -static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_connlimit_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_connlimit *priv = nft_expr_priv(expr); @@ -195,12 +215,16 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx, nft_connlimit_do_destroy(ctx, priv); } -static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - nf_conncount_list_init(&priv_dst->list); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); + if (!priv_dst->list) + return -ENOMEM; + + nf_conncount_list_init(priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = 
priv_src->invert; @@ -212,19 +236,15 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, { struct nft_connlimit *priv = nft_expr_priv(expr); - nf_conncount_cache_free(&priv->list); + nf_conncount_cache_free(priv->list); + kfree(priv->list); } static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - - local_bh_disable(); - ret = nf_conncount_gc_list(net, &priv->list); - local_bh_enable(); - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; @@ -238,6 +258,7 @@ static const struct nft_expr_ops nft_connlimit_ops = { .destroy_clone = nft_connlimit_destroy_clone, .dump = nft_connlimit_dump, .gc = nft_connlimit_gc, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_connlimit_type __read_mostly = { @@ -280,3 +301,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso"); MODULE_ALIAS_NFT_EXPR("connlimit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT); +MODULE_DESCRIPTION("nftables connlimit rule support"); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index f6d4d0fa23a6..cc7325329496 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -8,13 +8,20 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables_offload.h> struct nft_counter { + u64_stats_t bytes; + u64_stats_t packets; +}; + +struct nft_counter_tot { s64 bytes; s64 packets; }; @@ -23,25 +30,24 @@ struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; -static DEFINE_PER_CPU(seqcount_t, nft_counter_seq); +static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); - - write_seqcount_begin(myseq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - this_cpu->bytes += pkt->skb->len; - this_cpu->packets++; + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->bytes, pkt->skb->len); + u64_stats_inc(&this_cpu->packets); + u64_stats_update_end(nft_sync); - write_seqcount_end(myseq); local_bh_enable(); } @@ -60,21 +66,20 @@ static int nft_counter_do_init(const struct nlattr * const tb[], struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - cpu_stats = alloc_percpu(struct nft_counter); + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT); if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); + this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + u64_stats_set(&this_cpu->packets, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + u64_stats_set(&this_cpu->bytes, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } - preempt_enable(); + priv->counter = cpu_stats; return 0; } @@ -102,35 +107,41 @@ static void 
nft_counter_obj_destroy(const struct nft_ctx *ctx, } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - this_cpu->packets -= total->packets; - this_cpu->bytes -= total->bytes; + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, -total->packets); + u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { struct nft_counter *this_cpu; - const seqcount_t *myseq; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - myseq = per_cpu_ptr(&nft_counter_seq, cpu); + struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); + this_cpu = per_cpu_ptr(priv->counter, cpu); do { - seq = read_seqcount_begin(myseq); - bytes = this_cpu->bytes; - packets = this_cpu->packets; - } while (read_seqcount_retry(myseq, seq)); + seq = u64_stats_fetch_begin(nft_sync); + bytes = u64_stats_read(&this_cpu->bytes); + packets = u64_stats_read(&this_cpu->packets); + } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; @@ -141,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -173,7 +184,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static struct nft_object_type nft_counter_obj_type; +struct nft_object_type nft_counter_obj_type; static const struct nft_object_ops nft_counter_obj_ops = { .type = &nft_counter_obj_type, .size = sizeof(struct nft_counter_percpu_priv), @@ -183,7 +194,7 @@ static const struct nft_object_ops nft_counter_obj_ops = { .dump = nft_counter_obj_dump, }; -static struct nft_object_type nft_counter_obj_type __read_mostly = { +struct nft_object_type nft_counter_obj_type __read_mostly = { .type = NFT_OBJECT_COUNTER, .ops = &nft_counter_obj_ops, .maxattr = NFTA_COUNTER_MAX, @@ -191,20 +202,20 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = { .owner = THIS_MODULE, }; -static void nft_counter_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); nft_counter_do_eval(priv, regs, pkt); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_counter_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv, false); + return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, @@ -224,31 +235,63 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = 
nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC); + cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - this_cpu->packets = total.packets; - this_cpu->bytes = total.bytes; - preempt_enable(); + this_cpu = raw_cpu_ptr(cpu_stats); + u64_stats_set(&this_cpu->packets, total.packets); + u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; } -static struct nft_expr_type nft_counter_type; +static int nft_counter_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + /* No specific offload action is needed, but report success. */ + return 0; +} + +static void nft_counter_offload_stats(struct nft_expr *expr, + const struct flow_stats *stats) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct u64_stats_sync *nft_sync; + struct nft_counter *this_cpu; + + local_bh_disable(); + this_cpu = this_cpu_ptr(priv->counter); + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, stats->pkts); + u64_stats_add(&this_cpu->bytes, stats->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); +} + +void nft_counter_init_seqcount(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); +} + +struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), @@ -258,9 +301,12 @@ static const struct nft_expr_ops nft_counter_ops = { .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, + .reduce = NFT_REDUCE_READONLY, + .offload = nft_counter_offload, + .offload_stats = nft_counter_offload_stats, }; -static struct nft_expr_type nft_counter_type __read_mostly = { +struct nft_expr_type nft_counter_type __read_mostly = { .name = "counter", .ops = &nft_counter_ops, .policy = nft_counter_policy, @@ -268,38 +314,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = { .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; - -static int __init nft_counter_module_init(void) -{ - int cpu, err; - - for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); - - err = nft_register_obj(&nft_counter_obj_type); - if (err < 0) - return err; - - err = nft_register_expr(&nft_counter_type); - if (err < 0) - goto err1; - - return 0; -err1: - nft_unregister_obj(&nft_counter_obj_type); - return err; -} - -static void __exit nft_counter_module_exit(void) -{ - nft_unregister_expr(&nft_counter_type); - nft_unregister_obj(&nft_counter_obj_type); -} - -module_init(nft_counter_module_init); -module_exit(nft_counter_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("counter"); -MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index faea72c2df32..6f2ae7cad731 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -12,7 +12,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include 
<net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -22,15 +22,7 @@ #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> - -struct nft_ct { - enum nft_ct_keys key:8; - enum ip_conntrack_dir dir:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; +#include <net/netfilter/nf_conntrack_seqadj.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; @@ -41,6 +33,7 @@ struct nft_ct_helper_obj { #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, @@ -96,7 +89,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - *dest = ct->mark; + *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK @@ -116,7 +109,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, helper = rcu_dereference(help->helper); if (helper == NULL) goto err; - strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); + strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { @@ -129,7 +122,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } #endif - case NFT_CT_BYTES: /* fallthrough */ + case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; @@ -177,8 +170,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, } #endif case NFT_CT_ID: - if (!nf_ct_is_confirmed(ct)) - goto err; *dest = nf_ct_get_id(ct); return; default: @@ -204,12 +195,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_SRC_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; - *dest = tuple->src.u3.ip; + *dest = (__force __u32)tuple->src.u3.ip; return; case NFT_CT_DST_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; - *dest = tuple->dst.u3.ip; + *dest = (__force __u32)tuple->dst.u3.ip; return; case NFT_CT_SRC_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) @@ -240,6 +231,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; + int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ @@ -260,18 +252,22 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(atomic_read(&ct->ct_general.use) == 1)) { + __refcount_inc(&ct->ct_general.use, &oldcnt); + if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { - /* previous skb got queued to userspace */ + refcount_dec(&ct->ct_general.use); + /* previous skb got queued to userspace, allocate temporary + * one until percpu template can be reused. 
+ */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } - atomic_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } #endif @@ -295,8 +291,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - if (ct->mark != value) { - ct->mark = value; + if (READ_ONCE(ct->mark) != value) { + WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; @@ -340,7 +336,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, - [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; @@ -376,7 +372,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } - atomic_set(&tmp->ct_general.use, 1); + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } @@ -384,6 +380,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void) } #endif +static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (priv->key == NFT_CT_LABELS) + nf_connlabels_put(ctx->net); +#endif +} + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -418,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; + + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; break; #endif case NFT_CT_HELPER: @@ -483,6 +491,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; #endif case NFT_CT_ID: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); break; default: @@ -496,19 +507,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err; } } - priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + priv->len = len; + err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, + NFT_DATA_VALUE, len); if (err < 0) - return err; + goto err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) - return err; + goto err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || @@ -516,6 +528,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, nf_ct_set_acct(ctx->net, true); return 0; +err: + __nft_ct_get_destroy(ctx, priv); + return err; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) @@ -528,8 +543,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + mutex_unlock(&nft_ct_pcpu_mutex); + break; #endif default: break; @@ -566,9 +584,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: - if (!nft_ct_tmpl_alloc_pcpu()) + mutex_lock(&nft_ct_pcpu_mutex); + if (!nft_ct_tmpl_alloc_pcpu()) { + mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; + } nft_ct_pcpu_template_refcnt++; + mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif @@ -602,8 +624,8 @@ 
static int nft_ct_set_init(const struct nft_ctx *ctx, } } - priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); - err = nft_validate_register_load(priv->sreg, len); + priv->len = len; + err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; @@ -621,6 +643,9 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + __nft_ct_get_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } @@ -633,7 +658,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_get_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -672,7 +698,31 @@ nla_put_failure: return -1; } -static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static bool nft_ct_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + const struct nft_ct *ct; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + ct = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != ct->key) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} + +static int nft_ct_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -705,7 +755,38 @@ static const struct nft_expr_ops nft_ct_get_ops = { .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, + .reduce = nft_ct_get_reduce, +}; + +static bool nft_ct_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_ct_get_ops) + continue; + + __nft_reg_track_cancel(track, i); + } + + return false; +} + +#ifdef CONFIG_MITIGATION_RETPOLINE +static const struct nft_expr_ops nft_ct_get_fast_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_fast_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_get_destroy, + .dump = nft_ct_get_dump, + .reduce = nft_ct_set_reduce, }; +#endif static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, @@ -714,6 +795,7 @@ static const struct nft_expr_ops nft_ct_set_ops = { .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, + .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -724,6 +806,7 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = { .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, + .reduce = nft_ct_set_reduce, }; #endif @@ -737,8 +820,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); - if (tb[NFTA_CT_DREG]) + if (tb[NFTA_CT_DREG]) { +#ifdef CONFIG_MITIGATION_RETPOLINE + u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + + switch (k) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: + case NFT_CT_MARK: + case NFT_CT_SECMARK: + return &nft_ct_get_fast_ops; + } +#endif return &nft_ct_get_ops; + } if (tb[NFTA_CT_SREG]) { #ifdef 
CONFIG_NF_CONNTRACK_ZONES @@ -780,6 +876,7 @@ static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { @@ -854,7 +951,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, */ values = nf_ct_timeout_data(timeout); if (values) - nf_ct_refresh(ct, pkt->skb, values[0]); + nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, @@ -990,7 +1087,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (!priv->l4proto) return -ENOENT; - nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); @@ -1013,8 +1110,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; - case NFPROTO_NETDEV: /* fallthrough */ - case NFPROTO_BRIDGE: /* same */ + case NFPROTO_NETDEV: + case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); @@ -1096,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); + + if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct)) + if (!nfct_seqadj_ext_add(ct)) + regs->verdict.code = NF_DROP; } } @@ -1178,7 +1279,30 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + switch (priv->l3num) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) + break; + + return -EINVAL; + case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ + default: + return -EAFNOSUPPORT; + } + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + switch (priv->l4proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_DCCP: + case IPPROTO_SCTP: + break; + default: + return -EOPNOTSUPP; + } + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); @@ -1220,7 +1344,7 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); - if (!ct || ctinfo == IP_CT_UNTRACKED) { + if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } @@ -1345,3 +1469,4 @@ MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); +MODULE_DESCRIPTION("Netfilter nf_tables conntrack module"); diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c new file mode 100644 index 000000000000..e684c8a91848 --- /dev/null +++ b/net/netfilter/nft_ct_fast.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +#if IS_ENABLED(CONFIG_NFT_CT) +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack.h> + +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + enum 
ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + + switch (priv->key) { + case NFT_CT_STATE: + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_INVALID_BIT; + *dest = state; + return; + default: + break; + } + + if (!ct) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_CT_DIRECTION: + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); + return; + case NFT_CT_STATUS: + *dest = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + *dest = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + *dest = ct->secmark; + return; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} +EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval); +#endif diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index c2e78c160fd7..0573f96ce079 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -14,7 +14,7 @@ #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_dup_netdev_eval(const struct nft_expr *expr, @@ -40,11 +40,12 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } -static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_netdev *priv = nft_expr_priv(expr); @@ -67,6 +68,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -74,7 +80,9 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .eval = nft_dup_netdev_eval, .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, + .reduce = NFT_REDUCE_READONLY, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { @@ -102,3 +110,4 @@ module_exit(nft_dup_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_AF_EXPR(5, "dup"); +MODULE_DESCRIPTION("nftables netdev packet duplication support"); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 64ca13a1885b..7807d8129664 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -16,42 +16,62 @@ struct nft_dynset { struct nft_set *set; struct nft_set_ext_tmpl tmpl; enum nft_dynset_ops op:8; - enum nft_registers sreg_key:8; - enum nft_registers sreg_data:8; + u8 sreg_key; + u8 sreg_data; bool invert; + bool expr; + u8 num_exprs; u64 timeout; - struct nft_expr *expr; + struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; struct nft_set_binding binding; }; -static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, - struct nft_regs *regs) +static int 
nft_dynset_expr_setup(const struct nft_dynset *priv, + const struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + int i; + + for (i = 0; i < priv->num_exprs; i++) { + expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) + return -1; + + elem_expr->size += priv->expr_array[i]->ops->size; + } + + return 0; +} + +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; + void *elem_priv; u64 timeout; - void *elem; if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; - timeout = priv->timeout ? : set->timeout; - elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], NULL, - ®s->data[priv->sreg_data], - timeout, 0, GFP_ATOMIC); - if (elem == NULL) + timeout = priv->timeout ? : READ_ONCE(set->timeout); + elem_priv = nft_set_elem_init(set, &priv->tmpl, + ®s->data[priv->sreg_key], NULL, + ®s->data[priv->sreg_data], + timeout, 0, GFP_ATOMIC); + if (IS_ERR(elem_priv)) goto err1; - ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL && - nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + ext = nft_set_elem_ext(set, elem_priv); + if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; - return elem; + return elem_priv; err2: - nft_set_elem_destroy(set, elem, false); + nft_set_elem_destroy(set, elem_priv, false); err1: if (set->size) atomic_dec(&set->nelems); @@ -71,12 +91,13 @@ void nft_dynset_eval(const struct nft_expr *expr, return; } - if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, - expr, regs, &ext)) { + ext = set->ops->update(set, ®s->data[priv->sreg_key], expr, regs); + if (ext) { if (priv->op == NFT_DYNSET_OP_UPDATE && - nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - timeout = priv->timeout ? : set->timeout; - *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; + nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { + timeout = priv->timeout ? 
: READ_ONCE(set->timeout); + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout); } nft_set_elem_update_expr(ext, regs, pkt); @@ -90,29 +111,66 @@ void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static void nft_dynset_ext_add_expr(struct nft_dynset *priv) +{ + u8 size = 0; + int i; + + for (i = 0; i < priv->num_exprs; i++) + size += priv->expr_array[i]->ops->size; + + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + size); +} + +static struct nft_expr * +nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, + const struct nlattr *attr, int pos) +{ + struct nft_expr *expr; + int err; + + expr = nft_set_elem_expr_alloc(ctx, set, attr); + if (IS_ERR(expr)) + return expr; + + if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) { + err = -EOPNOTSUPP; + goto err_dynset_expr; + } + + return expr; + +err_dynset_expr: + nft_expr_destroy(ctx, expr); + return ERR_PTR(err); +} + static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, - [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, + [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED }, }; static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; - int err; + int err, i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || @@ -121,11 +179,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -134,6 +193,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_OBJECT) + return -EOPNOTSUPP; + if (set->ops->update == NULL) return -EOPNOTSUPP; @@ -141,70 +203,119 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return -EBUSY; priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); - switch (priv->op) { - case NFT_DYNSET_OP_ADD: - case NFT_DYNSET_OP_DELETE: - break; - case NFT_DYNSET_OP_UPDATE: - if (!(set->flags & NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; - break; - default: + if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; - } timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - tb[NFTA_DYNSET_TIMEOUT]))); + return -EOPNOTSUPP; + + err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); + if (err) + return err; } - priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = 
nft_validate_register_load(priv->sreg_key, set->klen); + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + set->klen); if (err < 0) return err; if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]); - err = nft_validate_register_load(priv->sreg_data, set->dlen); + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA], + &priv->sreg_data, set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) return -EINVAL; - if (tb[NFTA_DYNSET_EXPR] != NULL) { - if (!(set->flags & NFT_SET_EVAL)) - return -EINVAL; + if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) && + !(set->flags & NFT_SET_EVAL)) + return -EINVAL; + + if (tb[NFTA_DYNSET_EXPR]) { + struct nft_expr *dynset_expr; + + dynset_expr = nft_dynset_expr_alloc(ctx, set, + tb[NFTA_DYNSET_EXPR], 0); + if (IS_ERR(dynset_expr)) + return PTR_ERR(dynset_expr); + + priv->num_exprs++; + priv->expr_array[0] = dynset_expr; + + if (set->num_exprs > 1 || + (set->num_exprs == 1 && + dynset_expr->ops != set->exprs[0]->ops)) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + } else if (tb[NFTA_DYNSET_EXPRESSIONS]) { + struct nft_expr *dynset_expr; + struct nlattr *tmp; + int left; - priv->expr = nft_set_elem_expr_alloc(ctx, set, - tb[NFTA_DYNSET_EXPR]); - if (IS_ERR(priv->expr)) - return PTR_ERR(priv->expr); + if (!priv->expr) + return -EINVAL; - if (set->expr && set->expr->ops != priv->expr->ops) { + i = 0; + nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_expr_free; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_expr_free; + } + dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i); + if (IS_ERR(dynset_expr)) { + err = PTR_ERR(dynset_expr); + goto err_expr_free; + } + priv->expr_array[i] = dynset_expr; + priv->num_exprs++; + + if (set->num_exprs) { + if (i >= set->num_exprs) { + err = -EINVAL; + goto err_expr_free; + } + if (dynset_expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + } + i++; + } + if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_expr_free; } + } else if (set->num_exprs > 0) { + err = nft_set_elem_expr_clone(ctx, set, priv->expr_array); + if (err < 0) + return err; + + priv->num_exprs = set->num_exprs; } nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); - if (priv->expr != NULL) - nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, - priv->expr->ops->size); - if (set->flags & NFT_SET_TIMEOUT) { - if (timeout || set->timeout) - nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); - } + + if (priv->num_exprs) + nft_dynset_ext_add_expr(priv); + + if (set->flags & NFT_SET_TIMEOUT && + (timeout || READ_ONCE(set->timeout))) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); priv->timeout = timeout; @@ -219,8 +330,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return 0; err_expr_free: - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); return err; } @@ -238,24 +349,27 @@ static void nft_dynset_activate(const struct nft_ctx *ctx, { struct nft_dynset *priv = nft_expr_priv(expr); - priv->set->use++; + 
nf_tables_activate_set(ctx, priv->set); } static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); + int i; - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); nf_tables_destroy_set(ctx, priv->set); } -static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dynset_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; + int i; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -267,11 +381,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, - cpu_to_be64(jiffies_to_msecs(priv->timeout)), + nf_jiffies64_to_msecs(priv->timeout), NFTA_DYNSET_PAD)) goto nla_put_failure; - if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) - goto nla_put_failure; + if (priv->set->num_exprs == 0) { + if (priv->num_exprs == 1) { + if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, + priv->expr_array[0], reset)) + goto nla_put_failure; + } else if (priv->num_exprs > 1) { + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < priv->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + priv->expr_array[i], reset)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + } if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) goto nla_put_failure; return 0; @@ -289,6 +421,7 @@ static const struct nft_expr_ops nft_dynset_ops = { .activate = nft_dynset_activate, .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, + .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_dynset_type __read_mostly = { diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 07782836fad6..7eedf4e3ae9c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -5,11 +5,13 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/dccp.h> +#include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/tcp.h> @@ -19,8 +21,8 @@ struct nft_exthdr { u8 offset; u8 len; u8 op; - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; u8 flags; }; @@ -33,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset) return opt[offset + 1]; } +static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len) +{ + if (len % NFT_REG32_SIZE) + dest[len / NFT_REG32_SIZE] = 0; + + return skb_copy_bits(skb, offset, dest, len); +} + static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -42,17 +52,19 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, unsigned int offset = 0; int err; + if (pkt->skb->protocol != htons(ETH_P_IPV6)) + goto err; + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = (err >= 0); + 
nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -73,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; - unsigned int start; bool found = false; __be32 info; int optlen; @@ -81,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; - start = sizeof(struct iphdr); optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) @@ -91,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, /* Copy the options since __ip_options_compile() modifies * the options. */ - if (skb_copy_bits(skb, start, opt->__data, optlen)) + if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; @@ -106,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, found = target == IPOPT_SSRR ? opt->is_strictroute : !opt->is_strictroute; if (found) - *offset = opt->srr + start; + *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; - *offset = opt->rr + start; + *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; - *offset = opt->router_alert + start; + *offset = opt->router_alert; found = true; break; default: @@ -141,15 +151,14 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = (err >= 0); + nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -162,10 +171,10 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, { struct tcphdr *tcph; - if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) return NULL; - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; @@ -173,7 +182,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; - return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); + return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, @@ -203,9 +212,10 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, offset = i + priv->offset; if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = 1; + nft_reg_store8(dest, 1); } else { - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, priv->len); } @@ -231,9 +241,14 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) - return; + goto err; + if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) + goto err; + + tcph = (struct tcphdr 
*)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { __be16 v16; @@ -246,22 +261,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) - return; - - if (skb_ensure_writable(pkt->skb, - pkt->xt.thoff + i + priv->len)) - return; - - tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, - &tcphdr_len); - if (!tcph) - return; + goto err; offset = i + priv->offset; switch (priv->len) { case 2: - old.v16 = get_unaligned((u16 *)(opt + offset)); + old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset)); new.v16 = (__force __be16)nft_reg_load16( ®s->data[priv->sreg]); @@ -276,18 +282,18 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (old.v16 == new.v16) return; - put_unaligned(new.v16, (u16*)(opt + offset)); + put_unaligned(new.v16, (__be16*)(opt + offset)); inet_proto_csum_replace2(&tcph->check, pkt->skb, old.v16, new.v16, false); break; case 4: - new.v32 = regs->data[priv->sreg]; - old.v32 = get_unaligned((u32 *)(opt + offset)); + new.v32 = nft_reg_load_be32(®s->data[priv->sreg]); + old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset)); if (old.v32 == new.v32) return; - put_unaligned(new.v32, (u32*)(opt + offset)); + put_unaligned(new.v32, (__be32*)(opt + offset)); inet_proto_csum_replace4(&tcph->check, pkt->skb, old.v32, new.v32, false); break; @@ -298,15 +304,194 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; } + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, tcphdr_len, optl; + struct tcphdr *tcph; + u8 *opt; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!tcph) + goto err; + + if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) + goto drop; + + tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); + opt = (u8 *)tcph; + + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + unsigned int j; + + optl = optlen(opt, i); + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len) + goto drop; + + for (j = 0; j < optl; ++j) { + u16 n = TCPOPT_NOP; + u16 o = opt[i+j]; + + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o), + htons(n), false); + } + memset(opt + i, TCPOPT_NOP, optl); + return; + } + + /* option not found, continue. This allows to do multiple + * option removals per rule. 
+ */ + return; +err: + regs->verdict.code = NFT_BREAK; + return; +drop: + /* can't remove, no choice but to drop */ + regs->verdict.code = NF_DROP; +} + +static void nft_exthdr_sctp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + + if (pkt->tprot != IPPROTO_SCTP) + goto err; + + do { + sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); + if (!sch || !sch->length) + break; + + if (sch->type == priv->type) { + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + nft_reg_store8(dest, true); + return; + } + if (priv->offset + priv->len > ntohs(sch->length) || + offset + ntohs(sch->length) > pkt->skb->len) + break; + + if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) + break; + return; + } + offset += SCTP_PAD4(ntohs(sch->length)); + } while (offset < pkt->skb->len); +err: + if (priv->flags & NFT_EXTHDR_F_PRESENT) + nft_reg_store8(dest, false); + else + regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NFT_EXTHDR_DCCP +static void nft_exthdr_dccp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int thoff, dataoff, optoff, optlen, i; + u32 *dest = ®s->data[priv->dreg]; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) + goto err; + + thoff = nft_thoff(pkt); + + dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); + if (!dh) + goto err; + + dataoff = dh->dccph_doff * sizeof(u32); + optoff = __dccp_hdr_len(dh); + if (dataoff <= optoff) + goto err; + + optlen = dataoff - optoff; + + for (i = 0; i < optlen; ) { + /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in + * the length; the remaining options are at least 2B long. In + * all cases, the first byte contains the option type. In + * multi-byte options, the second byte contains the option + * length, which must be at least two: 1 for the type plus 1 for + * the length plus 0-253 for any following option data. We + * aren't interested in the option data, only the type and the + * length, so we don't need to read more than two bytes at a + * time. 
+ */ + unsigned int buflen = optlen - i; + u8 buf[2], *bufp; + u8 type, len; + + if (buflen > sizeof(buf)) + buflen = sizeof(buf); + + bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, + &buf); + if (!bufp) + goto err; + + type = bufp[0]; + + if (type == priv->type) { + nft_reg_store8(dest, 1); + return; + } + + if (type <= DCCPO_MAX_RESERVED) { + i++; + continue; + } + + if (buflen < 2) + goto err; + + len = bufp[1]; + + if (len < 2) + goto err; + + i += len; + } + +err: + *dest = 0; +} +#endif + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, - [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, - [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; @@ -350,12 +535,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, @@ -400,11 +585,33 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg, + priv->len); +} + +static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + + if (tb[NFTA_EXTHDR_SREG] || + tb[NFTA_EXTHDR_DREG] || + tb[NFTA_EXTHDR_FLAGS] || + tb[NFTA_EXTHDR_OFFSET] || + tb[NFTA_EXTHDR_LEN]) + return -EINVAL; + + if (!tb[NFTA_EXTHDR_TYPE]) + return -EINVAL; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->op = NFT_EXTHDR_OP_TCPOPT; + + return 0; } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, @@ -429,6 +636,24 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +#ifdef CONFIG_NFT_EXTHDR_DCCP +static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) + return -EOPNOTSUPP; + + return 0; +} +#endif + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -447,7 +672,8 @@ nla_put_failure: return -1; } -static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -457,7 +683,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_set(struct 
sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_set(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -467,12 +694,48 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } +static int nft_exthdr_dump_strip(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + return nft_exthdr_dump_common(skb, priv); +} + +static bool nft_exthdr_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + const struct nft_exthdr *exthdr; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + exthdr = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->type != exthdr->type || + priv->op != exthdr->op || + priv->flags != exthdr->flags || + priv->offset != exthdr->offset || + priv->len != exthdr->len) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} + static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_ipv4_ops = { @@ -481,6 +744,7 @@ static const struct nft_expr_ops nft_exthdr_ipv4_ops = { .eval = nft_exthdr_ipv4_eval, .init = nft_exthdr_ipv4_init, .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_ops = { @@ -489,6 +753,7 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = { .eval = nft_exthdr_tcp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { @@ -497,8 +762,38 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .eval = nft_exthdr_tcp_set_eval, .init = nft_exthdr_tcp_set_init, .dump = nft_exthdr_dump_set, + .reduce = NFT_REDUCE_READONLY, }; +static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_strip_eval, + .init = nft_exthdr_tcp_strip_init, + .dump = nft_exthdr_dump_strip, + .reduce = NFT_REDUCE_READONLY, +}; + +static const struct nft_expr_ops nft_exthdr_sctp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_sctp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; + +#ifdef CONFIG_NFT_EXTHDR_DCCP +static const struct nft_expr_ops nft_exthdr_dccp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_dccp_eval, + .init = nft_exthdr_dccp_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; +#endif + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -518,7 +813,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_tcp_set_ops; if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_tcp_ops; - break; + return &nft_exthdr_tcp_strip_ops; case NFT_EXTHDR_OP_IPV6: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; 
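For reference, the nft_exthdr_tcp_strip_eval() hunk above removes a TCP option by overwriting it with TCPOPT_NOP bytes and patching tcph->check incrementally via inet_proto_csum_replace2(), folding each replaced byte into the high or low half of a 16-bit word depending on whether its offset is even or odd. A minimal userspace sketch of that incremental update (plain RFC 1624 arithmetic, not kernel code; csum() and csum_update_byte() are illustrative helpers, not kernel API):

    #include <stdint.h>
    #include <stdio.h>

    /* Internet checksum over an even-length buffer (big-endian words). */
    static uint16_t csum(const uint8_t *data, size_t len)
    {
            uint32_t sum = 0;

            for (size_t i = 0; i < len; i += 2)
                    sum += (uint32_t)(data[i] << 8 | data[i + 1]);
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    /* RFC 1624 eq. 3: HC' = ~(~HC + ~m + m').  The changed byte sits in
     * the high half of the 16-bit word when its offset is even and in the
     * low half when it is odd -- the same trick as the strip loop above.
     */
    static uint16_t csum_update_byte(uint16_t hc, size_t off,
                                     uint8_t oldb, uint8_t newb)
    {
            uint16_t m  = (off % 2 == 0) ? (uint16_t)(oldb << 8) : oldb;
            uint16_t m2 = (off % 2 == 0) ? (uint16_t)(newb << 8) : newb;
            uint32_t sum = (uint16_t)~hc;

            sum += (uint16_t)~m;
            sum += m2;
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    int main(void)
    {
            uint8_t buf[8] = { 0x45, 0x00, 0x00, 0x28, 0x02, 0x04, 0x05, 0xb4 };
            uint16_t hc = csum(buf, sizeof(buf));
            size_t off = 5;          /* byte being overwritten */
            uint8_t nop = 0x01;      /* TCPOPT_NOP */
            uint16_t inc = csum_update_byte(hc, off, buf[off], nop);

            buf[off] = nop;
            printf("incremental %#06x, recomputed %#06x\n",
                   inc, csum(buf, sizeof(buf)));   /* both print 0xb322 */
            return 0;
    }

Recomputing the checksum from scratch after the byte swap yields the same value as the incremental update, which is what makes the per-byte NOP replacement in the strip loop safe.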
@@ -529,6 +824,16 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_ipv4_ops; } break; + case NFT_EXTHDR_OP_SCTP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_sctp_ops; + break; +#ifdef CONFIG_NFT_EXTHDR_DCCP + case NFT_EXTHDR_OP_DCCP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_dccp_ops; + break; +#endif } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index cfac0964f48d..96e02a83c045 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -14,27 +14,29 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) + const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { [NFTA_FIB_DREG] = { .type = NLA_U32 }, [NFTA_FIB_RESULT] = { .type = NLA_U32 }, - [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL), }; EXPORT_SYMBOL(nft_fib_policy); -#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ - NFTA_FIB_F_PRESENT) - -int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_fib *priv = nft_expr_priv(expr); unsigned int hooks; switch (priv->result) { - case NFT_FIB_RESULT_OIF: /* fallthrough */ + case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) @@ -73,7 +75,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); - if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + if (priv->flags == 0) return -EINVAL; if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == @@ -86,7 +88,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); - priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); switch (priv->result) { case NFT_FIB_RESULT_OIF: @@ -106,8 +107,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); if (err < 0) return err; @@ -115,7 +116,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } EXPORT_SYMBOL_GPL(nft_fib_init); -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_fib *priv = nft_expr_priv(expr); @@ -141,13 +142,17 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, switch (priv->result) { case NFT_FIB_RESULT_OIF: index = dev ? dev->ifindex : 0; - *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; + if (priv->flags & NFTA_FIB_F_PRESENT) + nft_reg_store8(dreg, !!index); + else + *dreg = index; + break; case NFT_FIB_RESULT_OIFNAME: if (priv->flags & NFTA_FIB_F_PRESENT) - *dreg = !!dev; + nft_reg_store8(dreg, !!dev); else - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad(reg, dev ? 
dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); @@ -157,5 +162,48 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, } EXPORT_SYMBOL_GPL(nft_fib_store_result); +bool nft_fib_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + unsigned int len = NFT_REG32_SIZE; + const struct nft_fib *fib; + + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + break; + case NFT_FIB_RESULT_OIFNAME: + if (priv->flags & NFTA_FIB_F_PRESENT) + len = NFT_REG32_SIZE; + else + len = IFNAMSIZ; + break; + case NFT_FIB_RESULT_ADDRTYPE: + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, len); + return false; + } + + fib = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->result != fib->result || + priv->flags != fib->flags) { + nft_reg_track_update(track, expr, priv->dreg, len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(nft_fib_reduce); + MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Query routing table from nftables"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c index 465432e0531b..666a3741d20b 100644 --- a/net/netfilter/nft_fib_inet.c +++ b/net/netfilter/nft_fib_inet.c @@ -49,6 +49,7 @@ static const struct nft_expr_ops nft_fib_inet_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, + .reduce = nft_fib_reduce, }; static struct nft_expr_type nft_fib_inet_type __read_mostly = { @@ -76,3 +77,4 @@ module_exit(nft_fib_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_ALIAS_NFT_AF_EXPR(1, "fib"); +MODULE_DESCRIPTION("nftables fib inet support"); diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c index a2e726ae7f07..9121ec64e918 100644 --- a/net/netfilter/nft_fib_netdev.c +++ b/net/netfilter/nft_fib_netdev.c @@ -58,6 +58,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, + .reduce = nft_fib_reduce, }; static struct nft_expr_type nft_fib_netdev_type __read_mostly = { @@ -85,3 +86,4 @@ module_exit(nft_fib_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo M. Bermudo Garay <pablombg@gmail.com>"); MODULE_ALIAS_NFT_AF_EXPR(5, "fib"); +MODULE_DESCRIPTION("nftables netdev fib lookups support"); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b70b48996801..b8f76c9057fd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -8,7 +8,8 @@ #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> -#include <net/ip.h> /* for ipv4 options. 
*/ +#include <net/ip.h> +#include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> @@ -19,37 +20,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - break; - } - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) - return -ENOENT; - - route->tuple[dir].dst = this_dst; - route->tuple[!dir].dst = other_dst; - - return 0; -} - static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) @@ -67,6 +37,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family) return false; } +static void flow_offload_ct_tcp(struct nf_conn *ct) +{ + /* conntrack will not see all packets, disable tcp window validation. */ + spin_lock_bh(&ct->lock); + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + spin_unlock_bh(&ct->lock); +} + static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -74,8 +53,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; + struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; - struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; @@ -90,19 +69,33 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); - if (unlikely(!tcph || tcph->fin || tcph->rst)) + if (unlikely(!tcph || tcph->fin || tcph->rst || + !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: { + struct nf_conntrack_tuple *tuple; + + if (ct->status & IPS_NAT_MASK) + goto out; + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + /* No support for GRE v1 */ + if (tuple->src.u.gre.key || tuple->dst.u.gre.key) + goto out; + break; + } +#endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || - ct->status & IPS_SEQ_ADJUST) + ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) @@ -112,31 +105,28 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; dir = CTINFO2DIR(ctinfo); - if (nft_flow_route(pkt, ct, &route, dir) < 0) + if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; - if (flow_offload_route_init(flow, &route) < 0) - goto err_flow_add; - - if (tcph) { - ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - } + 
flow_offload_route_init(flow, &route); + if (tcph) + flow_offload_ct_tcp(ct); + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; - dst_release(route.tuple[!dir].dst); return; err_flow_add: flow_offload_free(flow); err_flow_alloc: + dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); @@ -145,11 +135,15 @@ out: } static int nft_flow_offload_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, hook_mask); } @@ -169,13 +163,15 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); + if (!nft_use_inc(&flowtable->use)) + return -EMFILE; + priv->flowtable = flowtable; - flowtable->use++; return nf_ct_netns_get(ctx->net, ctx->family); } @@ -194,7 +190,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use++; + nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, @@ -203,7 +199,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_flow_offload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); @@ -227,6 +224,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { @@ -286,3 +284,4 @@ module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); +MODULE_DESCRIPTION("nftables hardware flow offload module"); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 3087e23297db..152a9fb4d23a 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -18,7 +18,7 @@ #include <net/ip.h> struct nft_fwd_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_fwd_netdev_eval(const struct nft_expr *expr, @@ -27,9 +27,11 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, { struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; + struct sk_buff *skb = pkt->skb; /* This is used by ifb only. 
*/ - skb_set_redirected(pkt->skb, true); + skb->skb_iif = skb->dev->ifindex; + skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS); nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; @@ -38,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, - [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, @@ -50,11 +52,12 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } -static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); @@ -77,9 +80,14 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { - enum nft_registers sreg_dev:8; - enum nft_registers sreg_addr:8; + u8 sreg_dev; + u8 sreg_addr; u8 nfproto; }; @@ -138,6 +146,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, return; skb->dev = dev; + skb_clear_tstamp(skb); neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; @@ -156,8 +165,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, !tb[NFTA_FWD_NFPROTO]) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); switch (priv->nfproto) { @@ -171,14 +178,17 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); if (err < 0) return err; - return nft_validate_register_load(priv->sreg_addr, addr_len); + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + addr_len); } -static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_neigh_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); @@ -194,10 +204,10 @@ nla_put_failure: } static int nft_fwd_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { - return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS)); } static struct nft_expr_type nft_fwd_netdev_type; @@ -208,6 +218,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, .validate = nft_fwd_validate, + .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -217,7 +228,9 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .init = 
nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, + .reduce = NFT_REDUCE_READONLY, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * @@ -256,4 +269,5 @@ module_exit(nft_fwd_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nftables netdev packet forwarding support"); MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index b836d550b919..5d034bbb6913 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -14,8 +14,8 @@ #include <linux/jhash.h> struct nft_jhash { - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; u8 len; bool autogen_seed:1; u32 modulus; @@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr, } struct nft_symhash { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; u32 h; - h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb), + priv->modulus); regs->data[priv->dreg] = h + priv->offset; } @@ -59,7 +60,7 @@ static void nft_symhash_eval(const struct nft_expr *expr, static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, - [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, @@ -83,9 +84,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); if (err < 0) return err; @@ -94,6 +92,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; + err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len); + if (err < 0) + return err; + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -108,9 +110,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx, get_random_bytes(&priv->seed, sizeof(priv->seed)); } - return nft_validate_register_load(priv->sreg, len) && - nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -126,8 +127,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -135,12 +134,13 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + sizeof(u32)); } static int nft_jhash_dump(struct sk_buff 
*skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_jhash *priv = nft_expr_priv(expr); @@ -166,8 +166,18 @@ nla_put_failure: return -1; } +static bool nft_jhash_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_jhash *priv = nft_expr_priv(expr); + + nft_reg_track_cancel(track, priv->dreg, sizeof(u32)); + + return false; +} + static int nft_symhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_symhash *priv = nft_expr_priv(expr); @@ -186,6 +196,30 @@ nla_put_failure: return -1; } +static bool nft_symhash_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + struct nft_symhash *symhash; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); + return false; + } + + symhash = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->offset != symhash->offset || + priv->modulus != symhash->modulus) { + nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; +} + static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, @@ -193,6 +227,7 @@ static const struct nft_expr_ops nft_jhash_ops = { .eval = nft_jhash_eval, .init = nft_jhash_init, .dump = nft_jhash_dump, + .reduce = nft_jhash_reduce, }; static const struct nft_expr_ops nft_symhash_ops = { @@ -201,6 +236,7 @@ static const struct nft_expr_ops nft_symhash_ops = { .eval = nft_symhash_eval, .init = nft_symhash_init, .dump = nft_symhash_dump, + .reduce = nft_symhash_reduce, }; static const struct nft_expr_ops * @@ -248,3 +284,4 @@ module_exit(nft_hash_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); MODULE_ALIAS_NFT_EXPR("hash"); +MODULE_DESCRIPTION("Netfilter nftables hash module"); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..02ee5fb69871 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,31 +29,62 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; +static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) +{ + enum nft_data_types type; + u8 reg; + + reg = ntohl(nla_get_be32(nla)); + if (reg == NFT_REG_VERDICT) + type = NFT_DATA_VERDICT; + else + type = NFT_DATA_VALUE; + + return type; +} + static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .size = sizeof(priv->data), + }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; - err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_IMMEDIATE_DATA]); + desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; priv->dlen = desc.len; - priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, &priv->data, - desc.type, desc.len); + err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], + &priv->dreg, &priv->data, desc.type, + desc.len); if 
(err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain *chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) + goto err1; + break; + default: + break; + } + } + return 0; err1: @@ -65,15 +96,86 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, + struct nft_chain *chain, + enum nft_trans_phase phase) +{ + struct nft_ctx chain_ctx; + struct nft_rule *rule; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); +} + static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_chain *chain; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + nft_deactivate_next(ctx->net, chain); + break; + case NFT_TRANS_PREPARE: + nft_immediate_chain_deactivate(ctx, chain, phase); + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_immediate_chain_deactivate(ctx, chain, phase); + nft_chain_del(chain); + chain->bound = false; + nft_use_dec(&chain->table->use); + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -81,7 +183,53 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } -static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_rule *rule, *n; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_binding(chain)) + break; + + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + nft_use_dec(&chain->use); + break; + } + + /* Rule has been deleted, release chain and its rules. 
*/ + chain_ctx = *ctx; + chain_ctx.chain = chain; + + nft_use_dec(&chain->use); + list_for_each_entry_safe(rule, n, &chain->rules, list) { + nft_use_dec(&chain->use); + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } + nf_tables_chain_destroy(chain); + break; + default: + break; + } +} + +static int nft_immediate_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -96,8 +244,7 @@ nla_put_failure: } static int nft_immediate_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; @@ -163,6 +310,27 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + +static bool nft_immediate_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg != NFT_REG_VERDICT) + nft_reg_track_cancel(track, priv->dreg, priv->dlen); + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -170,10 +338,12 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, + .reduce = nft_immediate_reduce, .offload = nft_immediate_offload, - .offload_flags = NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c new file mode 100644 index 000000000000..c4569d4b9228 --- /dev/null +++ b/net/netfilter/nft_inner.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org> + */ + +#include <linux/kernel.h> +#include <linux/if_vlan.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> +#include <net/netfilter/nf_tables_offload.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <net/gre.h> +#include <net/geneve.h> +#include <net/ip.h> +#include <linux/icmpv6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + +/* Same layout as nft_expr but it embeds the private expression data area. 
*/ +struct __nft_expr { + const struct nft_expr_ops *ops; + union { + struct nft_payload payload; + struct nft_meta meta; + } __attribute__((aligned(__alignof__(u64)))); +}; + +enum { + NFT_INNER_EXPR_PAYLOAD, + NFT_INNER_EXPR_META, +}; + +struct nft_inner { + u8 flags; + u8 hdrsize; + u8 type; + u8 expr_type; + + struct __nft_expr expr; +}; + +static int nft_inner_parse_l2l3(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 off) +{ + __be16 llproto, outer_llproto; + u32 nhoff, thoff; + + if (priv->flags & NFT_INNER_LL) { + struct vlan_ethhdr *veth, _veth; + struct ethhdr *eth, _eth; + u32 hdrsize; + + eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); + if (!eth) + return -1; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + llproto = eth->h_proto; + hdrsize = sizeof(_eth); + break; + case htons(ETH_P_8021Q): + veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); + if (!veth) + return -1; + + outer_llproto = veth->h_vlan_encapsulated_proto; + llproto = veth->h_vlan_proto; + hdrsize = sizeof(_veth); + break; + default: + return -1; + } + + ctx->inner_lloff = off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; + off += hdrsize; + } else { + struct iphdr *iph; + u32 _version; + + iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); + if (!iph) + return -1; + + switch (iph->version) { + case 4: + llproto = htons(ETH_P_IP); + break; + case 6: + llproto = htons(ETH_P_IPV6); + break; + default: + return -1; + } + } + + ctx->llproto = llproto; + if (llproto == htons(ETH_P_8021Q)) + llproto = outer_llproto; + + nhoff = off; + + switch (llproto) { + case htons(ETH_P_IP): { + struct iphdr *iph, _iph; + + iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + if (iph->ihl < 5 || iph->version != 4) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff + (iph->ihl * 4); + if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = iph->protocol; + } + } + break; + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h, _ip6h; + int fh_flags = IP6_FH_F_AUTH; + unsigned short fragoff; + int l4proto; + + ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff; + l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); + if (l4proto < 0 || thoff > U16_MAX) + return -1; + + if (fragoff == 0) { + thoff = nhoff + sizeof(_ip6h); + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = l4proto; + } + } + break; + default: + return -1; + } + + return 0; +} + +static int nft_inner_parse_tunhdr(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 *off) +{ + if (pkt->tprot == IPPROTO_GRE) { + ctx->inner_tunoff = pkt->thoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + return 0; + } + + if (pkt->tprot != IPPROTO_UDP) + return -1; + + ctx->inner_tunoff = *off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + *off += priv->hdrsize; + + switch (priv->type) { + case NFT_INNER_GENEVE: { + struct genevehdr *gnvh, _gnvh; + + gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, + sizeof(_gnvh), &_gnvh); + if (!gnvh) + return -1; + + *off += gnvh->opt_len * 4; + } + break; + default: + break; + } + + return 0; +} + 
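/* Editor's illustration, not part of the patch: for a UDP-encapsulated
 * GENEVE packet the offset bookkeeping above works out roughly as
 *
 *	off  = pkt->inneroff;         start of the GENEVE header
 *	off += priv->hdrsize;         user-supplied base header size (8 assumed)
 *	off += gnvh->opt_len * 4;     variable options, opt_len in 4-byte words
 *
 * leaving off at the first byte of the encapsulated link/network header that
 * nft_inner_parse_l2l3() goes on to parse. For GRE the transport offset
 * (pkt->thoff) is taken as the tunnel offset and nothing extra is skipped.
 */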
+static int nft_inner_parse(const struct nft_inner *priv, + struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + u32 off = pkt->inneroff; + + if (priv->flags & NFT_INNER_HDRSIZE && + nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) + return -1; + + if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { + if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) + return -1; + } else if (priv->flags & NFT_INNER_TH) { + tun_ctx->inner_thoff = off; + tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + } + + tun_ctx->type = priv->type; + tun_ctx->cookie = (unsigned long)pkt->skb; + pkt->flags |= NFT_PKTINFO_INNER_FULL; + + return 0; +} + +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { + local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + return false; + } + *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); + + return true; +} + +static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) + *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); +} + +static bool nft_inner_parse_needed(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) + return true; + + if (!nft_inner_restore_tun_ctx(pkt, tun_ctx)) + return true; + + if (priv->type != tun_ctx->type) + return true; + + return false; +} + +static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; + + if (nft_payload_inner_offset(pkt) < 0) + goto err; + + if (nft_inner_parse_needed(priv, pkt, &tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0) + goto err; + + switch (priv->expr_type) { + case NFT_INNER_EXPR_PAYLOAD: + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + break; + case NFT_INNER_EXPR_META: + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + nft_inner_save_tun_ctx(pkt, &tun_ctx); + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { + [NFTA_INNER_NUM] = { .type = NLA_U32 }, + [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, + [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, + [NFTA_INNER_TYPE] = { .type = NLA_U32 }, + [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + const struct nlattr *attr; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nft_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_inner *priv = nft_expr_priv(expr); + u32 flags, hdrsize, type, num; + struct nft_expr_info expr_info; + int err; + + if (!tb[NFTA_INNER_FLAGS] || + 
!tb[NFTA_INNER_NUM] || + !tb[NFTA_INNER_HDRSIZE] || + !tb[NFTA_INNER_TYPE] || + !tb[NFTA_INNER_EXPR]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); + if (flags & ~NFT_INNER_MASK) + return -EOPNOTSUPP; + + num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); + if (num != 0) + return -EOPNOTSUPP; + + hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); + type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); + + if (type > U8_MAX) + return -EINVAL; + + if (flags & NFT_INNER_HDRSIZE) { + if (hdrsize == 0 || hdrsize > 64) + return -EOPNOTSUPP; + } + + priv->flags = flags; + priv->hdrsize = hdrsize; + priv->type = type; + + err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); + if (err < 0) + return err; + + priv->expr.ops = expr_info.ops; + + if (!strcmp(expr_info.ops->type->name, "payload")) + priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else if (!strcmp(expr_info.ops->type->name, "meta")) + priv->expr_type = NFT_INNER_EXPR_META; + else + return -EINVAL; + + err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, + (const struct nlattr * const*)expr_info.tb); + if (err < 0) + return err; + + return 0; +} + +static int nft_inner_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || + nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || + nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || + nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) + goto nla_put_failure; + + if (nft_expr_dump(skb, NFTA_INNER_EXPR, + (struct nft_expr *)&priv->expr, reset) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_inner_ops = { + .type = &nft_inner_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), + .eval = nft_inner_eval, + .init = nft_inner_init, + .dump = nft_inner_dump, +}; + +struct nft_expr_type nft_inner_type __read_mostly = { + .name = "inner", + .ops = &nft_inner_ops, + .policy = nft_inner_policy, + .maxattr = NFTA_INNER_MAX, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c new file mode 100644 index 000000000000..de1b6066bfa8 --- /dev/null +++ b/net/netfilter/nft_last.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_last { + unsigned long jiffies; + unsigned int set; +}; + +struct nft_last_priv { + struct nft_last *last; +}; + +static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { + [NFTA_LAST_SET] = { .type = NLA_U32 }, + [NFTA_LAST_MSECS] = { .type = NLA_U64 }, +}; + +static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + struct nft_last *last; + u64 last_jiffies; + int err; + + last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT); + if (!last) + return -ENOMEM; + + if (tb[NFTA_LAST_SET]) + last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); + + if (last->set && tb[NFTA_LAST_MSECS]) { + err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); + if (err < 0) + goto err; + + last->jiffies = jiffies - (unsigned long)last_jiffies; + } + priv->last = last; + + return 0; +err: + 
kfree(last); + + return err; +} + +static void nft_last_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + struct nft_last *last = priv->last; + + if (READ_ONCE(last->jiffies) != jiffies) + WRITE_ONCE(last->jiffies, jiffies); + if (READ_ONCE(last->set) == 0) + WRITE_ONCE(last->set, 1); +} + +static int nft_last_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + struct nft_last *last = priv->last; + unsigned long last_jiffies = READ_ONCE(last->jiffies); + u32 last_set = READ_ONCE(last->set); + __be64 msecs; + + if (time_before(jiffies, last_jiffies)) { + WRITE_ONCE(last->set, 0); + last_set = 0; + } + + if (last_set) + msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies); + else + msecs = 0; + + if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) || + nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void nft_last_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + + kfree(priv->last); +} + +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) +{ + struct nft_last_priv *priv_dst = nft_expr_priv(dst); + struct nft_last_priv *priv_src = nft_expr_priv(src); + + priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); + if (!priv_dst->last) + return -ENOMEM; + + priv_dst->last->set = priv_src->last->set; + priv_dst->last->jiffies = priv_src->last->jiffies; + + return 0; +} + +static const struct nft_expr_ops nft_last_ops = { + .type = &nft_last_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), + .eval = nft_last_eval, + .init = nft_last_init, + .destroy = nft_last_destroy, + .clone = nft_last_clone, + .dump = nft_last_dump, + .reduce = NFT_REDUCE_READONLY, +}; + +struct nft_expr_type nft_last_type __read_mostly = { + .name = "last", + .ops = &nft_last_ops, + .policy = nft_last_policy, + .maxattr = NFTA_LAST_MAX, + .flags = NFT_EXPR_STATEFUL, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 35b67d7e3694..21d26b79b460 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -18,6 +18,10 @@ struct nft_limit { spinlock_t lock; u64 last; u64 tokens; +}; + +struct nft_limit_priv { + struct nft_limit *limit; u64 tokens_max; u64 rate; u64 nsecs; @@ -25,93 +29,111 @@ struct nft_limit { bool invert; }; -static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) +static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) { u64 now, tokens; s64 delta; - spin_lock_bh(&limit->lock); + spin_lock_bh(&priv->limit->lock); now = ktime_get_ns(); - tokens = limit->tokens + now - limit->last; - if (tokens > limit->tokens_max) - tokens = limit->tokens_max; + tokens = priv->limit->tokens + now - priv->limit->last; + if (tokens > priv->tokens_max) + tokens = priv->tokens_max; - limit->last = now; + priv->limit->last = now; delta = tokens - cost; if (delta >= 0) { - limit->tokens = delta; - spin_unlock_bh(&limit->lock); - return limit->invert; + priv->limit->tokens = delta; + spin_unlock_bh(&priv->limit->lock); + return priv->invert; } - limit->tokens = tokens; - spin_unlock_bh(&limit->lock); - return !limit->invert; + priv->limit->tokens = tokens; + spin_unlock_bh(&priv->limit->lock); + return !priv->invert; } /* Use same default as in iptables. 
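 * (editor's note: i.e. a packet burst of 5, matching the --limit-burst
 * default of the iptables "limit" match)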
*/ #define NFT_LIMIT_PKT_BURST_DEFAULT 5 -static int nft_limit_init(struct nft_limit *limit, +static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { - u64 unit, tokens; + u64 unit, tokens, rate_with_burst; + bool invert = false; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + if (priv->rate == 0) + return -EINVAL; + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - limit->nsecs = unit * NSEC_PER_SEC; - if (limit->rate == 0 || limit->nsecs < unit) + if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) - limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); - if (pkts && limit->burst == 0) - limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT; + if (pkts && priv->burst == 0) + priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; - if (limit->rate + limit->burst < limit->rate) + if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) return -EOVERFLOW; if (pkts) { - tokens = div_u64(limit->nsecs, limit->rate) * limit->burst; + u64 tmp = div64_u64(priv->nsecs, priv->rate); + + if (check_mul_overflow(tmp, priv->burst, &tokens)) + return -EOVERFLOW; } else { + u64 tmp; + /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ - tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), - limit->rate); - } + if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) + return -EOVERFLOW; - limit->tokens = tokens; - limit->tokens_max = limit->tokens; + tokens = div64_u64(tmp, priv->rate); + } if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + if (flags & ~NFT_LIMIT_F_INV) + return -EOPNOTSUPP; + if (flags & NFT_LIMIT_F_INV) - limit->invert = true; + invert = true; } - limit->last = ktime_get_ns(); - spin_lock_init(&limit->lock); + + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); + if (!priv->limit) + return -ENOMEM; + + priv->limit->tokens = tokens; + priv->tokens_max = priv->limit->tokens; + priv->invert = invert; + priv->limit->last = ktime_get_ns(); + spin_lock_init(&priv->limit->lock); return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv, enum nft_limit_type type) { - u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0; - u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u32 flags = priv->invert ? 
NFT_LIMIT_F_INV : 0; + u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate), + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || - nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; @@ -121,8 +143,34 @@ nla_put_failure: return -1; } -struct nft_limit_pkts { - struct nft_limit limit; +static void nft_limit_destroy(const struct nft_ctx *ctx, + const struct nft_limit_priv *priv) +{ + kfree(priv->limit); +} + +static int nft_limit_clone(struct nft_limit_priv *priv_dst, + const struct nft_limit_priv *priv_src, gfp_t gfp) +{ + priv_dst->tokens_max = priv_src->tokens_max; + priv_dst->rate = priv_src->rate; + priv_dst->nsecs = priv_src->nsecs; + priv_dst->burst = priv_src->burst; + priv_dst->invert = priv_src->invert; + + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); + if (!priv_dst->limit) + return -ENOMEM; + + spin_lock_init(&priv_dst->limit->lock); + priv_dst->limit->tokens = priv_src->tokens_max; + priv_dst->limit->last = ktime_get_ns(); + + return 0; +} + +struct nft_limit_priv_pkts { + struct nft_limit_priv limit; u64 cost; }; @@ -130,7 +178,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit_pkts *priv = nft_expr_priv(expr); + struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; @@ -148,7 +196,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_limit_pkts *priv = nft_expr_priv(expr); + struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); int err; err = nft_limit_init(&priv->limit, tb, true); @@ -159,27 +207,50 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, return 0; } -static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_pkts_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { - const struct nft_limit_pkts *priv = nft_expr_priv(expr); + const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } +static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); + + nft_limit_destroy(ctx, &priv->limit); +} + +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) +{ + struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); + struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); + + priv_dst->cost = priv_src->cost; + + return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); +} + static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, + .destroy = nft_limit_pkts_destroy, + .clone = nft_limit_pkts_clone, .dump = nft_limit_pkts_dump, + .reduce = NFT_REDUCE_READONLY, }; static void nft_limit_bytes_eval(const struct nft_expr *expr, struct 
nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_priv *priv = nft_expr_priv(expr); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) @@ -190,25 +261,45 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_init(priv, tb, false); } static int nft_limit_bytes_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { - const struct nft_limit *priv = nft_expr_priv(expr); + const struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } +static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_limit_priv *priv = nft_expr_priv(expr); + + nft_limit_destroy(ctx, priv); +} + +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) +{ + struct nft_limit_priv *priv_dst = nft_expr_priv(dst); + struct nft_limit_priv *priv_src = nft_expr_priv(src); + + return nft_limit_clone(priv_dst, priv_src, gfp); +} + static const struct nft_expr_ops nft_limit_bytes_ops = { .type = &nft_limit_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)), .eval = nft_limit_bytes_eval, .init = nft_limit_bytes_init, .dump = nft_limit_bytes_dump, + .clone = nft_limit_bytes_clone, + .destroy = nft_limit_bytes_destroy, + .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * @@ -240,7 +331,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit_pkts *priv = nft_obj_data(obj); + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; @@ -250,7 +341,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - struct nft_limit_pkts *priv = nft_obj_data(obj); + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); int err; err = nft_limit_init(&priv->limit, tb, true); @@ -265,16 +356,25 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { - const struct nft_limit_pkts *priv = nft_obj_data(obj); + const struct nft_limit_priv_pkts *priv = nft_obj_data(obj); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } +static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, &priv->limit); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, + .destroy = nft_limit_obj_pkts_destroy, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, }; @@ -283,7 +383,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit *priv = nft_obj_data(obj); + struct nft_limit_priv *priv = nft_obj_data(obj); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if 
(nft_limit_eval(priv, cost)) @@ -294,7 +394,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - struct nft_limit *priv = nft_obj_data(obj); + struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_init(priv, tb, false); } @@ -303,16 +403,25 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { - const struct nft_limit *priv = nft_obj_data(obj); + const struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } +static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, priv); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, - .size = sizeof(struct nft_limit), + .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, + .destroy = nft_limit_obj_bytes_destroy, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, }; @@ -372,3 +481,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("limit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); +MODULE_DESCRIPTION("nftables limit expression support"); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index fe4831f2258f..e35588137995 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -128,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; +static int nft_log_modprobe(struct net *net, enum nf_log_type t) +{ + switch (t) { + case NF_LOG_TYPE_LOG: + return nft_request_module(net, "%s", "nf_log_syslog"); + case NF_LOG_TYPE_ULOG: + return nft_request_module(net, "%s", "nfnetlink_log"); + case NF_LOG_TYPE_MAX: + break; + } + + return -ENOENT; +} + static int nft_log_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -149,10 +163,10 @@ static int nft_log_init(const struct nft_ctx *ctx, nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { - priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); if (priv->prefix == NULL) return -ENOMEM; - nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } @@ -197,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; err = nf_logger_find_get(ctx->family, li->type); - if (err < 0) + if (err < 0) { + if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN) + err = -EAGAIN; + goto err1; + } return 0; @@ -223,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx, nf_logger_put(ctx->family, li->type); } -static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_log_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_log *priv = nft_expr_priv(expr); const struct nf_loginfo *li = &priv->loginfo; @@ -272,6 +291,7 @@ static const struct nft_expr_ops nft_log_ops = { .init = nft_log_init, .destroy = nft_log_destroy, .dump = nft_log_dump, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_log_type __read_mostly = { @@ -298,3 +318,4 @@ module_exit(nft_log_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 
MODULE_ALIAS_NFT_EXPR("log"); +MODULE_DESCRIPTION("Netfilter nf_tables log module"); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index f1363b8aabba..fc2d7c5d83c8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -17,30 +17,103 @@ struct nft_lookup { struct nft_set *set; - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; + bool dreg_set; bool invert; struct nft_set_binding binding; }; +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ +#ifdef CONFIG_MITIGATION_RETPOLINE + if (set->ops == &nft_set_hash_fast_type.ops) + return nft_hash_lookup_fast(net, set, key); + if (set->ops == &nft_set_hash_type.ops) + return nft_hash_lookup(net, set, key); + + if (set->ops == &nft_set_rhash_type.ops) + return nft_rhash_lookup(net, set, key); + + if (set->ops == &nft_set_bitmap_type.ops) + return nft_bitmap_lookup(net, set, key); + + if (set->ops == &nft_set_pipapo_type.ops) + return nft_pipapo_lookup(net, set, key); +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (set->ops == &nft_set_pipapo_avx2_type.ops) + return nft_pipapo_avx2_lookup(net, set, key); +#endif + + if (set->ops == &nft_set_rbtree_type.ops) + return nft_rbtree_lookup(net, set, key); + + WARN_ON_ONCE(1); +#endif + return set->ops->lookup(net, set, key); +} + +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. 
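 *
 * (Editor's illustration, not part of the original comment: CPU A looks up
 * key K while CPU B runs nf_tables_commit(), unlinks the old-generation
 * element for K and then publishes the new base_seq with smp_store_release().
 * CPU A's lookup returns NULL, but the acquire re-read of base_seq in
 * nft_lookup_should_retry() observes the change, so the lookup is retried
 * against the new generation, where K matches again.)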
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; +} +EXPORT_SYMBOL_GPL(nft_set_do_lookup); + void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; - const struct nft_set_ext *ext = NULL; + const struct net *net = nft_net(pkt); + const struct nft_set_ext *ext; bool found; - found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], - &ext) ^ priv->invert; + ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]); + found = !!ext ^ priv->invert; if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext = nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } if (ext) { - if (set->flags & NFT_SET_MAP) + if (priv->dreg_set) nft_data_copy(&regs->data[priv->dreg], nft_set_ext_data(ext), set->dlen); @@ -54,7 +127,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, - [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -76,22 +150,16 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, + set->klen); if (err < 0) return err; if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); - if (flags & ~NFT_LOOKUP_F_INV) - return -EINVAL; - - if (flags & NFT_LOOKUP_F_INV) { - if (set->flags & NFT_SET_MAP) - return -EINVAL; + if (flags & NFT_LOOKUP_F_INV) priv->invert = true; - } } if (tb[NFTA_LOOKUP_DREG] != NULL) { @@ -100,13 +168,23 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_MAP)) return -EINVAL; - priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - set->dtype, set->dlen); + err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], + &priv->dreg, NULL, + nft_set_datatype(set), + set->dlen); if (err < 0) return err; - } else if (set->flags & NFT_SET_MAP) - return -EINVAL; + priv->dreg_set = true; + } else if (set->flags & NFT_SET_MAP) { + /* Map given, but user asks for lookup only (i.e. to + * ignore value associated with key). + * + * This makes no sense for anonymous maps since they are + * scoped to the rule, but for named sets this can be useful. + */ + if (set->flags & NFT_SET_ANONYMOUS) + return -EINVAL; + } priv->binding.flags = set->flags & NFT_SET_MAP; @@ -132,7 +210,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx, { struct nft_lookup *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, @@ -143,7 +221,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_lookup_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ?
NFT_LOOKUP_F_INV : 0; @@ -152,7 +231,7 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; - if (priv->set->flags & NFT_SET_MAP) + if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) @@ -163,61 +242,41 @@ nla_put_failure: return -1; } -static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - struct nft_ctx *pctx = (struct nft_ctx *)ctx; - const struct nft_data *data; - int err; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - pctx->level++; - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - pctx->level--; - break; - default: - break; - } - - return 0; -} - static int nft_lookup_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_setelem_validate, + }; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nft_lookup_validate_setelem; - priv->set->ops->walk(ctx, priv->set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_validate(ctx, priv->set); + if (iter.err < 0) return iter.err; return 0; } +static bool nft_lookup_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + + if (priv->set->flags & NFT_SET_MAP) + nft_reg_track_cancel(track, priv->dreg, priv->set->dlen); + + return false; +} + static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), @@ -228,6 +287,7 @@ static const struct nft_expr_ops nft_lookup_ops = { .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, + .reduce = nft_lookup_reduce, }; struct nft_expr_type nft_lookup_type __read_mostly = { diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index bc9fd98c5d6d..868bd4d73555 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -15,19 +15,19 @@ struct nft_masq { u32 flags; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; static int nft_masq_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -43,30 +43,23 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - u32 plen = sizeof_field(struct nf_nat_range, 
min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_proto.all); struct nft_masq *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_MASQ_FLAGS]) { + if (tb[NFTA_MASQ_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { @@ -77,7 +70,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_masq_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_masq *priv = nft_expr_priv(expr); @@ -99,23 +93,39 @@ nla_put_failure: return -1; } -static void nft_masq_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_masq_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_masq *priv = nft_expr_priv(expr); + const struct nft_masq *priv = nft_expr_priv(expr); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); + range.min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); + } + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, + nft_hook(pkt), + &range, + nft_out(pkt)); + break; +#ifdef CONFIG_NF_TABLES_IPV6 + case NFPROTO_IPV6: + regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, + nft_out(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + break; } - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), - &range, nft_out(pkt)); } static void @@ -128,11 +138,12 @@ static struct nft_expr_type nft_masq_ipv4_type; static const struct nft_expr_ops nft_masq_ipv4_ops = { .type = &nft_masq_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_ipv4_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { @@ -145,25 +156,6 @@ static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { }; #ifdef CONFIG_NF_TABLES_IPV6 -static void nft_masq_ipv6_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - range.flags = priv->flags; - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - 
®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - } - regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, - nft_out(pkt)); -} - static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -174,11 +166,12 @@ static struct nft_expr_type nft_masq_ipv6_type; static const struct nft_expr_ops nft_masq_ipv6_ops = { .type = &nft_masq_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_ipv6_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_ipv6_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { @@ -205,20 +198,6 @@ static inline void nft_masq_module_exit_ipv6(void) {} #endif #ifdef CONFIG_NF_TABLES_INET -static void nft_masq_inet_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - return nft_masq_ipv4_eval(expr, regs, pkt); - case NFPROTO_IPV6: - return nft_masq_ipv6_eval(expr, regs, pkt); - } - - WARN_ON_ONCE(1); -} - static void nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -229,11 +208,12 @@ static struct nft_expr_type nft_masq_inet_type; static const struct nft_expr_ops nft_masq_inet_ops = { .type = &nft_masq_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_inet_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_inet_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_inet_type __read_mostly = { @@ -305,3 +285,4 @@ module_exit(nft_masq_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_EXPR("masq"); +MODULE_DESCRIPTION("Netfilter nftables masquerade expression support"); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 951b6e87ed5d..05cd1e6e6a2f 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/random.h> #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> @@ -32,8 +33,6 @@ #define NFT_META_SECS_PER_DAY 86400 #define NFT_META_DAYS_PER_WEEK 7 -static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); - static u8 nft_meta_weekday(void) { time64_t secs = ktime_get_real_seconds(); @@ -64,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); @@ -147,11 +146,11 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key, switch (key) { case NFT_META_SKUID: - *dest = from_kuid_munged(&init_user_ns, + *dest = from_kuid_munged(sock_net(sk)->user_ns, sock->file->f_cred->fsuid); break; case NFT_META_SKGID: - *dest = from_kgid_munged(&init_user_ns, + *dest = from_kgid_munged(sock_net(sk)->user_ns, sock->file->f_cred->fsgid); break; default: @@ -186,12 +185,12 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, case NFT_META_IIFKIND: if (!in || !in->rtnl_link_ops) return false; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); break; case 
NFT_META_OIFKIND: if (!out || !out->rtnl_link_ops) return false; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; default: return false; @@ -207,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) { - strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ); } static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) @@ -244,7 +243,11 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, case NFT_META_OIF: nft_meta_store_ifindex(dest, nft_out(pkt)); break; - case NFT_META_IIFTYPE: + case NFT_META_IFTYPE: + if (!nft_meta_store_iftype(dest, pkt->skb->dev)) + return false; + break; + case __NFT_META_IIFTYPE: if (!nft_meta_store_iftype(dest, nft_in(pkt))) return false; break; @@ -253,7 +256,7 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return false; break; case NFT_META_IIFGROUP: - if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + if (!nft_meta_store_ifgroup(dest, nft_in(pkt))) return false; break; case NFT_META_OIFGROUP: @@ -267,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return true; } -static noinline u32 nft_prandom_u32(void) -{ - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - - return prandom_u32_state(state); -} - #ifdef CONFIG_IP_ROUTE_CLASSID static noinline bool nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) @@ -329,7 +325,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) goto err; nft_reg_store8(dest, pkt->tprot); break; @@ -385,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #endif case NFT_META_PRANDOM: - *dest = nft_prandom_u32(); + *dest = get_random_u32(); break; #ifdef CONFIG_XFRM case NFT_META_SECPATH: @@ -462,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval); const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, - [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_META_SREG] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_meta_policy); @@ -514,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = IFNAMSIZ; break; case NFT_META_PRANDOM: - prandom_init_once(&nft_prandom_state); len = sizeof(u32); break; #ifdef CONFIG_XFRM @@ -535,9 +530,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + priv->len = len; + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); @@ -586,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) } static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -605,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, } int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { 
struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -661,8 +654,8 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); - err = nft_validate_register_load(priv->sreg, len); + priv->len = len; + err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; @@ -674,7 +667,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_meta_set_init); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -689,7 +682,8 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -724,22 +718,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, switch (priv->key) { case NFT_META_PROTOCOL: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, - sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case NFT_META_L4PROTO: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, - sizeof(__u8), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; case NFT_META_IIF: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_ifindex, sizeof(__u32), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); break; case NFT_META_IIFTYPE: - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, - ingress_iftype, sizeof(__u16), reg); + NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); break; default: return -EOPNOTSUPP; @@ -748,16 +742,60 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, return 0; } +bool nft_meta_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct nft_meta *meta; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + meta = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != meta->key || + priv->dreg != meta->dreg) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} +EXPORT_SYMBOL_GPL(nft_meta_get_reduce); + static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, + .reduce = nft_meta_get_reduce, .validate = nft_meta_get_validate, .offload = nft_meta_get_offload, }; +static bool nft_meta_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_meta_get_ops) + continue; + + __nft_reg_track_cancel(track, i); + } + + return false; +} + static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = 
NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -765,6 +803,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .reduce = nft_meta_set_reduce, .validate = nft_meta_set_validate, }; @@ -791,9 +830,74 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } +static int nft_meta_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) + return -EINVAL; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_PROTOCOL: + len = sizeof(u16); + break; + case NFT_META_L4PROTO: + len = sizeof(u32); + break; + default: + return -EOPNOTSUPP; + } + priv->len = len; + + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); +} + +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + nft_reg_store16(dest, (__force u16)tun_ctx->llproto); + break; + case NFT_META_L4PROTO: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + nft_reg_store8(dest, tun_ctx->l4proto); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; + +err: + regs->verdict.code = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_inner_eval); + +static const struct nft_expr_ops nft_meta_inner_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .init = nft_meta_inner_init, + .dump = nft_meta_get_dump, + /* direct call to nft_meta_inner_eval().
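 * (editor's note: ->eval is intentionally left unset; nft_inner_eval()
 * dispatches to nft_meta_inner_eval() with the parsed tunnel context instead
 * of going through the generic expression evaluation path)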
*/ +}; + struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, + .inner_ops = &nft_meta_inner_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, @@ -848,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_SECMARK_CTX] == NULL) return -EINVAL; - priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); if (!priv->ctx) return -ENOMEM; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 23a7bfd10521..6e21f72c5b57 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -21,10 +21,10 @@ #include <net/ip.h> struct nft_nat { - enum nft_registers sreg_addr_min:8; - enum nft_registers sreg_addr_max:8; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_addr_min; + u8 sreg_addr_max; + u8 sreg_proto_min; + u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; @@ -132,16 +132,21 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; @@ -201,23 +206,22 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: - return -EAFNOSUPPORT; + if (tb[NFTA_NAT_REG_ADDR_MIN]) + return -EAFNOSUPPORT; + break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - priv->sreg_addr_min = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]); - err = nft_validate_register_load(priv->sreg_addr_min, alen); + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], + &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - priv->sreg_addr_max = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]); - - err = nft_validate_register_load(priv->sreg_addr_max, - alen); + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], + &priv->sreg_addr_max, + alen); if (err < 0) return err; } else { @@ -227,21 +231,17 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_MAP_IPS; } - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { @@ -251,16 +251,14 
@@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_NAT_FLAGS]) { + if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EOPNOTSUPP; - } return nf_ct_netns_get(ctx->net, family); } -static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_nat_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); @@ -322,6 +320,7 @@ static const struct nft_expr_ops nft_nat_ops = { .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { @@ -339,7 +338,8 @@ static void nft_nat_inet_eval(const struct nft_expr *expr, { const struct nft_nat *priv = nft_expr_priv(expr); - if (priv->family == nft_pf(pkt)) + if (priv->family == nft_pf(pkt) || + priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } @@ -351,6 +351,7 @@ static const struct nft_expr_ops nft_nat_inet_ops = { .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { @@ -402,3 +403,4 @@ module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); +MODULE_DESCRIPTION("Network Address Translation support"); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 48edb9d5f012..bd058babfc82 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -9,16 +9,15 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/random.h> #include <linux/static_key.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); - struct nft_ng_inc { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; - atomic_t counter; + atomic_t *counter; u32 offset; }; @@ -27,9 +26,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) u32 nval, oval; do { - oval = atomic_read(&priv->counter); + oval = atomic_read(priv->counter); nval = (oval + 1 < priv->modulus) ? 
oval + 1 : 0; - } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); + } while (atomic_cmpxchg(priv->counter, oval, nval) != oval); return nval + priv->offset; } @@ -55,6 +54,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ng_inc *priv = nft_expr_priv(expr); + int err; if (tb[NFTA_NG_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); @@ -66,11 +66,32 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); - atomic_set(&priv->counter, priv->modulus - 1); + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); + if (!priv->counter) + return -ENOMEM; + + atomic_set(priv->counter, priv->modulus - 1); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); + if (err < 0) + goto err; + + return 0; +err: + kfree(priv->counter); + + return err; +} + +static bool nft_ng_inc_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); + + return false; } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, @@ -91,7 +112,8 @@ nla_put_failure: return -1; } -static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_inc_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_inc *priv = nft_expr_priv(expr); @@ -99,18 +121,23 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static void nft_ng_inc_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + kfree(priv->counter); +} + struct nft_ng_random { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; -static u32 nft_ng_random_gen(struct nft_ng_random *priv) +static u32 nft_ng_random_gen(const struct nft_ng_random *priv) { - struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); - - return reciprocal_scale(prandom_u32_state(state), priv->modulus) + - priv->offset; + return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset; } static void nft_ng_random_eval(const struct nft_expr *expr, @@ -138,15 +165,12 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - prandom_init_once(&nft_numgen_prandom_state); - - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); - - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_random_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -154,13 +178,25 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static bool nft_ng_random_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_ng_random *priv = nft_expr_priv(expr); + + nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); + + return false; +} + static 
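Note: the "inc" number generator above now keeps its state in a separately allocated atomic counter and advances it with a compare-and-swap loop that wraps at the configured modulus, then shifts by the offset. A minimal user-space analogue using C11 atomics; the structure and function names are illustrative, not the kernel types:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct numgen_inc {
	_Atomic uint32_t counter;
	uint32_t modulus;
	uint32_t offset;
};

static uint32_t numgen_inc_gen(struct numgen_inc *g)
{
	uint32_t oval, nval;

	do {
		oval = atomic_load(&g->counter);
		nval = (oval + 1 < g->modulus) ? oval + 1 : 0;
	} while (!atomic_compare_exchange_weak(&g->counter, &oval, nval));

	return nval + g->offset;
}

int main(void)
{
	struct numgen_inc g = { .modulus = 3, .offset = 100 };

	atomic_init(&g.counter, g.modulus - 1);   /* same seeding as the init hunk */
	for (int i = 0; i < 6; i++)
		printf("%u\n", numgen_inc_gen(&g)); /* 100 101 102 100 101 102 */
	return 0;
}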
struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), .eval = nft_ng_inc_eval, .init = nft_ng_inc_init, + .destroy = nft_ng_inc_destroy, .dump = nft_ng_inc_dump, + .reduce = nft_ng_inc_reduce, }; static const struct nft_expr_ops nft_ng_random_ops = { @@ -169,6 +205,7 @@ static const struct nft_expr_ops nft_ng_random_ops = { .eval = nft_ng_random_eval, .init = nft_ng_random_init, .dump = nft_ng_random_dump, + .reduce = nft_ng_random_reduce, }; static const struct nft_expr_ops * @@ -217,3 +254,4 @@ module_exit(nft_ng_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); MODULE_ALIAS_NFT_EXPR("numgen"); +MODULE_DESCRIPTION("nftables number generator module"); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index bfd18d2b65a2..1a62e384766a 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -9,19 +9,48 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) -static void nft_objref_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); obj->ops->eval(obj, regs, pkt); } +static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type) +{ + unsigned int hooks; + + switch (type) { + case NFT_OBJECT_SYNPROXY: + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hooks); + default: + break; + } + + return 0; +} + +static int nft_objref_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + return nft_objref_validate_obj_type(ctx, obj->ops->type->type); +} + static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -41,13 +70,16 @@ static int nft_objref_init(const struct nft_ctx *ctx, if (IS_ERR(obj)) return -ENOENT; + if (!nft_use_inc(&obj->use)) + return -EMFILE; + nft_objref_priv(expr) = obj; - obj->use++; return 0; } -static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); @@ -71,7 +103,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx, if (phase == NFT_TRANS_COMMIT) return; - obj->use--; + nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, @@ -79,10 +111,9 @@ static void nft_objref_activate(const struct nft_ctx *ctx, { struct nft_object *obj = nft_objref_priv(expr); - obj->use++; + nft_use_inc_restore(&obj->use); } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), @@ -91,29 +122,33 @@ static const struct nft_expr_ops nft_objref_ops = { .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, + .validate = nft_objref_validate, + .reduce = 
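Note: the random variant of the generator in the preceding numgen diff drops the per-CPU prandom state in favour of get_random_u32() and maps the value into [offset, offset + modulus) with reciprocal_scale(), which avoids a division by multiplying and keeping the upper 32 bits of the product. A small stand-alone sketch of that scaling step, assuming the caller supplies a uniformly distributed 32-bit value:

#include <stdint.h>

/* Map a uniformly distributed 32-bit value into [0, ep_ro) without a
 * division: take the upper 32 bits of the 64-bit product. */
static inline uint32_t reciprocal_scale32(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

/* numgen "random" style output: a value in [offset, offset + modulus). */
static inline uint32_t numgen_random(uint32_t rnd, uint32_t modulus,
				     uint32_t offset)
{
	return reciprocal_scale32(rnd, modulus) + offset;
}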
NFT_REDUCE_READONLY, }; struct nft_objref_map { struct nft_set *set; - enum nft_registers sreg:8; + u8 sreg; struct nft_set_binding binding; }; -static void nft_objref_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; + struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; - bool found; - found = set->ops->lookup(nft_net(pkt), set, ®s->data[priv->sreg], - &ext); - if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext = nft_set_do_lookup(net, set, ®s->data[priv->sreg]); + if (!ext) { + ext = nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); @@ -137,8 +172,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + set->klen); if (err < 0) return err; @@ -152,7 +187,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_map_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); @@ -180,7 +216,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx, { struct nft_objref_map *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, @@ -191,7 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static struct nft_expr_type nft_objref_type; +static int nft_objref_map_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + return nft_objref_validate_obj_type(ctx, priv->set->objtype); +} + static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -201,6 +244,8 @@ static const struct nft_expr_ops nft_objref_map_ops = { .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, + .validate = nft_objref_map_validate, + .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * @@ -228,27 +273,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; -static struct nft_expr_type nft_objref_type __read_mostly = { +struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, }; - -static int __init nft_objref_module_init(void) -{ - return nft_register_expr(&nft_objref_type); -} - -static void __exit nft_objref_module_exit(void) -{ - nft_unregister_expr(&nft_objref_type); -} - -module_init(nft_objref_module_init); -module_exit(nft_objref_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_EXPR("objref"); diff --git 
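Note: nft_objref_map_eval() above now falls back to a catch-all set element when the key lookup misses, instead of breaking the rule immediately, so a map of object references can carry a default action. The control flow, sketched with invented helper names over a simple array-backed "set":

#include <stddef.h>
#include <stdint.h>

struct elem {
	uint32_t key;
	int      is_catchall;   /* matches any key */
	void    *obj;
};

/* Return the object for `key`, the catch-all object if no exact element
 * exists, or NULL if neither is present. */
static void *map_lookup(const struct elem *set, size_t n, uint32_t key)
{
	const struct elem *catchall = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (set[i].is_catchall) {
			catchall = &set[i];
			continue;
		}
		if (set[i].key == key)
			return set[i].obj;
	}

	return catchall ? catchall->obj : NULL;
}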
a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index b42247aa48a9..1c0b493ef0a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,7 +6,7 @@ #include <linux/netfilter/nfnetlink_osf.h> struct nft_osf { - enum nft_registers dreg:8; + u8 dreg; u8 ttl; u32 flags; }; @@ -23,11 +23,16 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; - char os_match[NFT_OSF_MAXGENRELEN + 1]; + char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { @@ -40,15 +45,15 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { - strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", data.genre, data.version); else - strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); + strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); - strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } @@ -58,7 +63,6 @@ static int nft_osf_init(const struct nft_ctx *ctx, { struct nft_osf *priv = nft_expr_priv(expr); u32 flags; - int err; u8 ttl; if (!tb[NFTA_OSF_DREG]) @@ -78,23 +82,20 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); - if (err < 0) - return err; - - return 0; + return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); } -static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_osf_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags))) + if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) @@ -107,12 +108,47 @@ nla_put_failure: } static int nft_osf_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) +{ + unsigned int hooks; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + +static bool nft_osf_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) { - return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_FORWARD)); + struct nft_osf *priv = nft_expr_priv(expr); + struct nft_osf *osf; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); + return false; + } + + osf = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->flags != 
osf->flags || + priv->ttl != osf->ttl) { + nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; } static struct nft_expr_type nft_osf_type; @@ -123,6 +159,7 @@ static const struct nft_expr_ops nft_osf_op = { .dump = nft_osf_dump, .type = &nft_osf_type, .validate = nft_osf_validate, + .reduce = nft_osf_reduce, }; static struct nft_expr_type nft_osf_type __read_mostly = { @@ -149,3 +186,4 @@ module_exit(nft_osf_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("osf"); +MODULE_DESCRIPTION("nftables passive OS fingerprint support"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a7de3a58f553..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -19,9 +19,11 @@ /* For layer 4 checksum field offset. */ #include <linux/tcp.h> #include <linux/udp.h> +#include <net/gre.h> #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <net/sctp/checksum.h> static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, struct vlan_ethhdr *veth) @@ -38,46 +40,122 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } +static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) +{ + unsigned int thoff = nft_thoff(pkt); + + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) + return -1; + + switch (pkt->tprot) { + case IPPROTO_UDP: + pkt->inneroff = thoff + sizeof(struct udphdr); + break; + case IPPROTO_TCP: { + struct tcphdr *th, _tcph; + + th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph); + if (!th) + return -1; + + pkt->inneroff = thoff + __tcp_hdrlen(th); + } + break; + case IPPROTO_GRE: { + u32 offset = sizeof(struct gre_base_hdr); + struct gre_base_hdr *gre, _gre; + __be16 version; + + gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre); + if (!gre) + return -1; + + version = gre->flags & GRE_VERSION; + switch (version) { + case GRE_VERSION_0: + if (gre->flags & 
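Note: the reduce callbacks added throughout this series (osf above, payload below) let the rule compiler drop a load whose destination register already holds the same data: each register remembers which selector last wrote it, and a second load with identical parameters can be elided. A condensed sketch of that bookkeeping; the structures are invented and the comparison is deliberately simplified to a few descriptor fields:

#include <stdbool.h>

#define NREGS 16

/* What an expression loads: enough to tell two loads apart. */
struct selector_desc {
	int base;              /* e.g. link-layer / network / transport */
	unsigned int offset;
	unsigned int len;
};

struct reg_track {
	bool valid;
	struct selector_desc desc;   /* last producer of this register */
};

/* Returns true if the load described by `desc` into register `reg` can be
 * elided because the register already holds exactly that data; otherwise
 * records it as the register's new producer. */
static bool track_reduce(struct reg_track *regs, unsigned int reg,
			 const struct selector_desc *desc)
{
	if (reg >= NREGS)
		return false;

	if (regs[reg].valid &&
	    regs[reg].desc.base == desc->base &&
	    regs[reg].desc.offset == desc->offset &&
	    regs[reg].desc.len == desc->len)
		return true;            /* redundant load, skip it */

	regs[reg].valid = true;
	regs[reg].desc = *desc;
	return false;
}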
GRE_ROUTING) + return -1; + + if (gre->flags & GRE_CSUM) { + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); + } + if (gre->flags & GRE_KEY) + offset += sizeof_field(struct gre_full_hdr, key); + + if (gre->flags & GRE_SEQ) + offset += sizeof_field(struct gre_full_hdr, seq); + break; + default: + return -1; + } + + pkt->inneroff = thoff + offset; + } + break; + case IPPROTO_IPIP: + pkt->inneroff = thoff; + break; + default: + return -1; + } + + pkt->flags |= NFT_PKTINFO_INNER; + + return 0; +} + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER) && + __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) + return -1; + + return pkt->inneroff; +} + +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) +{ + unsigned int boundary = offset + len; + + /* data past ether src/dst requested, copy needed */ + if (boundary > offsetof(struct ethhdr, h_proto)) + return true; + + return false; +} + void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -87,13 +165,16 @@ void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: - if (!skb_mac_header_was_set(skb)) + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0) goto err; - if (skb_vlan_tag_present(skb)) { + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -105,12 +186,18 @@ void nft_payload_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) + goto err; + offset = nft_thoff(pkt); + break; + case NFT_PAYLOAD_INNER_HEADER: + offset = nft_payload_inner_offset(pkt); + if (offset < 0) goto err; - offset = pkt->xt.thoff; break; default: - BUG(); + WARN_ON_ONCE(1); + goto err; } offset += priv->offset; @@ -125,10 +212,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, + [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; @@ -141,13 +228,14 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } -static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct 
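Note: the inner-header support above has to locate where the encapsulated packet begins; for GRE the base header is followed by optional checksum, key and sequence fields, so the offset is accumulated from the flag bits (and version 0 packets carrying a routing header are simply not handled). A stand-alone sketch of that accumulation, with field sizes per RFC 2890; the flag constants are illustrative host-order values, not the kernel's big-endian GRE_* macros:

#include <stdint.h>

#define GREF_CSUM 0x8000u
#define GREF_KEY  0x2000u
#define GREF_SEQ  0x1000u

/* Size of the mandatory GRE base header (flags/version + protocol). */
#define GRE_BASE_HLEN 4u

/* Bytes from the start of the GRE header to the encapsulated payload,
 * given the (host-order) GRE flag bits. */
static unsigned int gre_inner_offset(uint16_t flags)
{
	unsigned int off = GRE_BASE_HLEN;

	if (flags & GREF_CSUM)
		off += 4;       /* 2 bytes checksum + 2 bytes reserved */
	if (flags & GREF_KEY)
		off += 4;       /* key */
	if (flags & GREF_SEQ)
		off += 4;       /* sequence number */

	return off;
}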
nft_payload *priv = nft_expr_priv(expr); @@ -162,6 +250,59 @@ nla_put_failure: return -1; } +static bool nft_payload_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct nft_payload *payload; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + payload = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->base != payload->base || + priv->offset != payload->offset || + priv->len != payload->len) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} + +static bool nft_payload_offload_mask(struct nft_offload_reg *reg, + u32 priv_len, u32 field_len) +{ + unsigned int remainder, delta, k; + struct nft_data mask = {}; + __be32 remainder_mask; + + if (priv_len == field_len) { + memset(®->mask, 0xff, priv_len); + return true; + } else if (priv_len > field_len) { + return false; + } + + memset(&mask, 0xff, field_len); + remainder = priv_len % sizeof(u32); + if (remainder) { + k = priv_len / sizeof(u32); + delta = field_len - priv_len; + remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1)); + mask.data[k] = (__force u32)remainder_mask; + } + + memcpy(®->mask, &mask, field_len); + + return true; +} + static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) @@ -170,21 +311,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): - if (priv->len != ETH_ALEN) + if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, @@ -192,14 +333,15 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, - vlan_tci, sizeof(__be16), reg); + NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, @@ -207,19 +349,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, - vlan_tci, sizeof(__be16), reg); + 
NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; default: return -EOPNOTSUPP; @@ -236,21 +380,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, daddr): - if (priv->len != sizeof(struct in_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, protocol): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -272,21 +420,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, daddr): - if (priv->len != sizeof(struct in6_addr)) + if (!nft_payload_offload_mask(reg, priv->len, + sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); + nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, nexthdr): - if (priv->len != sizeof(__u8)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, @@ -328,14 +480,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, @@ -356,14 +508,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): - if (priv->len != sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): - if (priv->len != 
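Note: the repeated nft_payload_offload_mask() calls above allow a payload match shorter than the flow-dissector field to be offloaded by building a byte mask: the bytes the rule actually covers are compared, the remainder of the field is don't-care. The sketch below captures that intent on a plain byte buffer; it is a semantic simplification, not a line-by-line port of the kernel's word-based mask construction:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Build a byte-wise match mask for offload: the first `match_len` bytes of
 * the `field_len`-byte field are compared, the rest is don't-care. Returns
 * false if the rule wants to match more bytes than the field holds. */
static bool build_offload_mask(uint8_t *mask, unsigned int field_len,
			       unsigned int match_len)
{
	if (match_len > field_len)
		return false;

	memset(mask, 0xff, match_len);
	memset(mask + match_len, 0x00, field_len - match_len);
	return true;
}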
sizeof(__be16)) + if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, @@ -426,6 +578,7 @@ static const struct nft_expr_ops nft_payload_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; @@ -435,12 +588,103 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = ®s->data[priv->dreg]; + int offset; + + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + + switch (priv->base) { + case NFT_PAYLOAD_TUN_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) + goto err; + + offset = tun_ctx->inner_tunoff; + break; + case NFT_PAYLOAD_LL_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) + goto err; + + offset = tun_ctx->inner_lloff; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) + goto err; + + offset = tun_ctx->inner_nhoff; + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + offset = tun_ctx->inner_thoff; + break; + default: + WARN_ON_ONCE(1); + goto err; + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + u32 base; + + if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || + !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) + return -EINVAL; + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_TUN_HEADER: + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return -EOPNOTSUPP; + } + + priv->base = base; + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); +} + +static const struct nft_expr_ops nft_payload_inner_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .init = nft_payload_inner_init, + .dump = nft_payload_dump, + /* direct call to nft_payload_inner_eval(). */ +}; + static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } @@ -460,14 +704,17 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, struct sk_buff *skb, unsigned int *l4csum_offset) { + if (pkt->fragoff) + return -1; + switch (pkt->tprot) { case IPPROTO_TCP: *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: - if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; - /* Fall through. 
*/ + fallthrough; case IPPROTO_UDPLITE: *l4csum_offset = offsetof(struct udphdr, check); break; @@ -478,7 +725,20 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return -1; } - *l4csum_offset += pkt->xt.thoff; + *l4csum_offset += nft_thoff(pkt); + return 0; +} + +static int nft_payload_csum_sctp(struct sk_buff *skb, int offset) +{ + struct sctphdr *sh; + + if (skb_ensure_writable(skb, offset + sizeof(*sh))) + return -1; + + sh = (struct sctphdr *)(skb->data + offset); + sh->checksum = sctp_compute_cksum(skb, offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } @@ -535,40 +795,118 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return 0; } +struct nft_payload_set { + enum nft_payload_bases base:8; + u16 offset; + u8 len; + u8 sreg; + u8 csum_type; + u8 csum_offset; + u8 csum_flags; +}; + +/* This is not struct vlan_hdr. */ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = ®s->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) + goto err; + offset = nft_thoff(pkt); + break; + case NFT_PAYLOAD_INNER_HEADER: + offset = nft_payload_inner_offset(pkt); + if (offset < 0) goto err; - offset = pkt->xt.thoff; break; default: - BUG(); + WARN_ON_ONCE(1); + goto err; } csum_offset = offset + priv->csum_offset; offset += priv->offset; if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && - (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || + ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER && + priv->base != NFT_PAYLOAD_INNER_HEADER) || skb->ip_summed != CHECKSUM_PARTIAL)) { + if (offset + priv->len > skb->len) + goto err; + fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); @@ -585,6 +923,14 @@ 
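Note: when payload bytes are rewritten, the Internet checksum is patched incrementally rather than recomputed: the one's-complement contribution of the old bytes is removed and that of the new bytes added, with end-around carry (RFC 1624). SCTP, added in the hunk above, is the exception and gets a full CRC32c recompute via sctp_compute_cksum(). A self-contained 16-bit sketch of the incremental update:

#include <stdint.h>

/* Fold a 32-bit accumulator into 16 bits with end-around carry. */
static uint16_t csum_fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* One's-complement sum of 16-bit words (len in bytes, assumed even). */
static uint32_t csum_words(uint32_t sum, const uint8_t *data, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	return sum;
}

/* RFC 1624 incremental update: given the stored checksum and the old and
 * new contents of the rewritten region, compute the new checksum without
 * re-reading the rest of the packet. */
static uint16_t csum_update(uint16_t check, const uint8_t *from,
			    const uint8_t *to, unsigned int len)
{
	uint32_t sum = (uint16_t)~check;        /* undo the final inversion */
	uint8_t inv[2];
	unsigned int i;

	sum = csum_words(sum, to, len);         /* add the new bytes */
	for (i = 0; i < len; i += 2) {          /* subtract the old bytes */
		inv[0] = (uint8_t)~from[i];
		inv[1] = (uint8_t)~from[i + 1];
		sum = csum_words(sum, inv, 2);
	}

	return (uint16_t)~csum_fold16(sum);
}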
static void nft_payload_set_eval(const struct nft_expr *expr, skb_store_bits(skb, offset, src, priv->len) < 0) goto err; + if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && + pkt->tprot == IPPROTO_SCTP && + skb->ip_summed != CHECKSUM_PARTIAL) { + if (pkt->fragoff == 0 && + nft_payload_csum_sctp(skb, nft_thoff(pkt))) + goto err; + } + return; err: regs->verdict.code = NFT_BREAK; @@ -594,19 +940,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); + int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); + + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; if (tb[NFTA_PAYLOAD_CSUM_TYPE]) - priv->csum_type = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); - if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) - priv->csum_offset = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX, + &csum_offset); + if (err < 0) + return err; + + priv->csum_offset = csum_offset; + } if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { u32 flags; @@ -617,18 +972,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->csum_flags = flags; } - switch (priv->csum_type) { + switch (csum_type) { case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; + case NFT_PAYLOAD_CSUM_SCTP: + if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER) + return -EINVAL; + + if (priv->csum_offset != offsetof(struct sctphdr, checksum)) + return -EINVAL; + break; default: return -EOPNOTSUPP; } + priv->csum_type = csum_type; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg, + priv->len); } -static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload_set *priv = nft_expr_priv(expr); @@ -647,12 +1012,32 @@ nla_put_failure: return -1; } +static bool nft_payload_set_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + int i; + + for (i = 0; i < NFT_REG32_NUM; i++) { + if (!track->regs[i].selector) + continue; + + if (track->regs[i].selector->ops != &nft_payload_ops && + track->regs[i].selector->ops != &nft_payload_fast_ops) + continue; + + __nft_reg_track_cancel(track, i); + } + + return false; +} + static const struct nft_expr_ops nft_payload_set_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), .eval = nft_payload_set_eval, .init = nft_payload_set_init, .dump = nft_payload_set_dump, + .reduce = nft_payload_set_reduce, }; static const struct nft_expr_ops * @@ -661,6 +1046,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, { enum nft_payload_bases base; unsigned int offset, len; + int err; if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || @@ -672,6 +1058,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, case NFT_PAYLOAD_LL_HEADER: case NFT_PAYLOAD_NETWORK_HEADER: case NFT_PAYLOAD_TRANSPORT_HEADER: + case NFT_PAYLOAD_INNER_HEADER: break; 
default: return ERR_PTR(-EOPNOTSUPP); @@ -686,11 +1073,16 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return ERR_PTR(err); + + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len); + if (err < 0) + return ERR_PTR(err); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && - base != NFT_PAYLOAD_LL_HEADER) + base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER) return &nft_payload_fast_ops; else return &nft_payload_ops; @@ -699,6 +1091,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, + .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 5ece0a6aa8c3..344fe311878f 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -19,10 +19,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - enum nft_registers sreg_qnum:8; - u16 queuenum; - u16 queues_total; - u16 flags; + u8 sreg_qnum; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -68,6 +68,30 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr, regs->verdict.code = ret; } +static int nft_queue_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + case NFPROTO_BRIDGE: + break; + case NFPROTO_NETDEV: /* lacks okfn */ + fallthrough; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, supported_hooks); +} + static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, @@ -111,8 +135,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); - err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM], + &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; @@ -127,7 +151,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, return 0; } -static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_queue_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); @@ -143,7 +168,8 @@ nla_put_failure: } static int -nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_queue_sreg_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); @@ -164,6 +190,8 @@ static const struct nft_expr_ops nft_queue_ops = { .eval = nft_queue_eval, .init = nft_queue_init, .dump = nft_queue_dump, + .validate = nft_queue_validate, + .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_queue_sreg_ops = { @@ -172,6 
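Note: nft_queue_validate() above restricts the expression to the five inet-family hooks (the netdev family lacks an okfn, so queueing there cannot reinject). The check is plain bit arithmetic over a hook mask; a tiny sketch with local hook numbers standing in for the NF_INET_* constants:

#include <stdbool.h>

enum {
	HOOK_PRE_ROUTING,
	HOOK_LOCAL_IN,
	HOOK_FORWARD,
	HOOK_LOCAL_OUT,
	HOOK_POST_ROUTING,
	HOOK_INGRESS,
};

/* Hooks where queueing is permitted in this sketch. */
#define QUEUE_HOOKS ((1u << HOOK_PRE_ROUTING) | (1u << HOOK_LOCAL_IN) | \
		     (1u << HOOK_FORWARD) | (1u << HOOK_LOCAL_OUT) |    \
		     (1u << HOOK_POST_ROUTING))

/* Return true if the chain's hook is one of the supported hooks. */
static bool hook_supported(unsigned int chain_hook, unsigned int supported)
{
	return (supported & (1u << chain_hook)) != 0;
}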
+200,8 @@ static const struct nft_expr_ops nft_queue_sreg_ops = { .eval = nft_queue_sreg_eval, .init = nft_queue_sreg_init, .dump = nft_queue_sreg_dump, + .validate = nft_queue_validate, + .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * @@ -216,3 +246,4 @@ module_exit(nft_queue_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); MODULE_ALIAS_NFT_EXPR("queue"); +MODULE_DESCRIPTION("Netfilter nftables queue module"); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 4413690591f2..df0798da2329 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -15,14 +15,20 @@ struct nft_quota { atomic64_t quota; unsigned long flags; - atomic64_t consumed; + atomic64_t *consumed; }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, &priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,16 +57,16 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, - NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); + NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], @@ -90,13 +96,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } + priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT); + if (!priv->consumed) + return -ENOMEM; + atomic64_set(&priv->quota, quota); priv->flags = flags; - atomic64_set(&priv->consumed, consumed); + atomic64_set(priv->consumed, consumed); return 0; } +static void nft_quota_do_destroy(const struct nft_ctx *ctx, + struct nft_quota *priv) +{ + kfree(priv->consumed); +} + static int nft_quota_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) @@ -128,7 +144,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, * that we see, don't go over the quota boundary in what we send to * userspace. 
*/ - consumed = atomic64_read(&priv->consumed); + consumed = atomic64_read(priv->consumed); quota = atomic64_read(&priv->quota); if (consumed >= quota) { consumed_cap = quota; @@ -145,7 +161,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, goto nla_put_failure; if (reset) { - atomic64_sub(consumed, &priv->consumed); + atomic64_sub(consumed, priv->consumed); clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); } return 0; @@ -162,11 +178,20 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj, return nft_quota_do_dump(skb, priv, reset); } +static void nft_quota_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_destroy(ctx, priv); +} + static struct nft_object_type nft_quota_obj_type; static const struct nft_object_ops nft_quota_obj_ops = { .type = &nft_quota_obj_type, .size = sizeof(struct nft_quota), .init = nft_quota_obj_init, + .destroy = nft_quota_obj_destroy, .eval = nft_quota_obj_eval, .dump = nft_quota_obj_dump, .update = nft_quota_obj_update, @@ -198,11 +223,37 @@ static int nft_quota_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv, false); + return nft_quota_do_dump(skb, priv, reset); +} + +static void nft_quota_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_destroy(ctx, priv); +} + +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) +{ + struct nft_quota *priv_dst = nft_expr_priv(dst); + struct nft_quota *priv_src = nft_expr_priv(src); + + priv_dst->quota = priv_src->quota; + priv_dst->flags = priv_src->flags; + + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); + if (!priv_dst->consumed) + return -ENOMEM; + + *priv_dst->consumed = *priv_src->consumed; + + return 0; } static struct nft_expr_type nft_quota_type; @@ -211,7 +262,10 @@ static const struct nft_expr_ops nft_quota_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)), .eval = nft_quota_eval, .init = nft_quota_init, + .destroy = nft_quota_destroy, + .clone = nft_quota_clone, .dump = nft_quota_dump, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_quota_type __read_mostly = { @@ -254,3 +308,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("quota"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA); +MODULE_DESCRIPTION("Netfilter nftables quota module"); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 89efcc5a533d..ea382f7bbd78 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -15,7 +15,7 @@ struct nft_range_expr { struct nft_data data_from; struct nft_data data_to; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_range_ops op:8; }; @@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr, static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { [NFTA_RANGE_SREG] = { .type = NLA_U32 }, - [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, }; @@ -51,7 +51,14 @@ static int nft_range_init(const 
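Note: the quota expression above now keeps the consumed byte count behind a separately allocated atomic (so the transaction clone path duplicates it explicitly) and distinguishes "quota reached, report once" (>=) from "this packet is over the limit" (>). A user-space analogue of the accounting step with C11 64-bit atomics; the names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct quota {
	uint64_t         limit;      /* configured quota in bytes */
	_Atomic uint64_t consumed;   /* bytes seen so far */
};

/* Charge `len` bytes. *report is set once the quota has been reached (the
 * caller latches this with a depleted bit so only one notification goes
 * out). Returns true if this packet is over the limit. */
static bool quota_charge(struct quota *q, uint64_t len, bool *report)
{
	uint64_t consumed = atomic_fetch_add(&q->consumed, len) + len;

	if (report)
		*report = consumed >= q->limit;

	return consumed > q->limit;
}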
struct nft_ctx *ctx, const struct nft_expr *expr const struct nlattr * const tb[]) { struct nft_range_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc_from, desc_to; + struct nft_data_desc desc_from = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_from), + }; + struct nft_data_desc desc_to = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_to), + }; int err; u32 op; @@ -61,33 +68,23 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr !tb[NFTA_RANGE_TO_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), - &desc_from, tb[NFTA_RANGE_FROM_DATA]); + err = nft_data_init(NULL, &priv->data_from, &desc_from, + tb[NFTA_RANGE_FROM_DATA]); if (err < 0) return err; - if (desc_from.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), - &desc_to, tb[NFTA_RANGE_TO_DATA]); + err = nft_data_init(NULL, &priv->data_to, &desc_to, + tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; - if (desc_to.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err2; - } - if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; } - priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); - err = nft_validate_register_load(priv->sreg, desc_from.len); + err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg, + desc_from.len); if (err < 0) goto err2; @@ -114,7 +111,8 @@ err1: return err; } -static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_range_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_range_expr *priv = nft_expr_priv(expr); @@ -140,6 +138,7 @@ static const struct nft_expr_ops nft_range_ops = { .eval = nft_range_eval, .init = nft_range_init, .dump = nft_range_dump, + .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_range_type __read_mostly = { diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5b779171565c..95eedad85c83 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -14,20 +14,20 @@ #include <net/netfilter/nf_tables.h> struct nft_redir { - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; u16 flags; }; static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -48,38 +48,34 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX], + &priv->sreg_proto_max, + 
plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_REDIR_FLAGS]) { + if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_redir *priv = nft_expr_priv(expr); @@ -102,25 +98,37 @@ nla_put_failure: return -1; } -static void nft_redir_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_redir_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_ipv4_multi_range_compat mr; + const struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_range2 range; - memset(&mr, 0, sizeof(mr)); + memset(&range, 0, sizeof(range)); + range.flags = priv->flags; if (priv->sreg_proto_min) { - mr.range[0].min.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - mr.range[0].max.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); } - mr.range[0].flags |= priv->flags; - - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range, + nft_hook(pkt)); + break; +#ifdef CONFIG_NF_TABLES_IPV6 + case NFPROTO_IPV6: + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, + nft_hook(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } } static void @@ -133,11 +141,12 @@ static struct nft_expr_type nft_redir_ipv4_type; static const struct nft_expr_ops nft_redir_ipv4_ops = { .type = &nft_redir_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv4_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { @@ -150,28 +159,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { }; #ifdef CONFIG_NF_TABLES_IPV6 -static void nft_redir_ipv6_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - - range.flags |= priv->flags; - - regs->verdict.code = - nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); -} - static void nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -182,11 +169,12 @@ static struct nft_expr_type nft_redir_ipv6_type; static const struct nft_expr_ops nft_redir_ipv6_ops = { .type = &nft_redir_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct 
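Note: the merged nft_redir_eval() in this hunk builds an nf_nat_range2 directly: when a source register for the minimum port was configured, the 16-bit min/max ports are read out of the registers, and the PROTO_SPECIFIED flag is now set once at init time rather than on every packet. A sketch of that register-to-range step over plain structs; the types and the load helper are stand-ins, not the kernel ones:

#include <stdint.h>
#include <string.h>

struct port_range {
	uint16_t min_port;   /* network byte order */
	uint16_t max_port;   /* network byte order */
	unsigned int flags;
#define RANGE_PROTO_SPECIFIED 0x1u
};

/* Registers are 32-bit slots; a 16-bit port occupies the first 16 bits of
 * the register, mirroring what nft_reg_load16() reads. */
static uint16_t reg_load16(const uint32_t *reg)
{
	uint16_t v;

	memcpy(&v, reg, sizeof(v));
	return v;
}

static void build_range(struct port_range *r, const uint32_t *regs,
			unsigned int sreg_min, unsigned int sreg_max)
{
	if (sreg_min) {
		r->min_port = reg_load16(&regs[sreg_min]);
		r->max_port = reg_load16(&regs[sreg_max]);
		r->flags |= RANGE_PROTO_SPECIFIED;
	}
}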
nft_redir)), - .eval = nft_redir_ipv6_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { @@ -200,20 +188,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { #endif #ifdef CONFIG_NF_TABLES_INET -static void nft_redir_inet_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - return nft_redir_ipv4_eval(expr, regs, pkt); - case NFPROTO_IPV6: - return nft_redir_ipv6_eval(expr, regs, pkt); - } - - WARN_ON_ONCE(1); -} - static void nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -224,11 +198,12 @@ static struct nft_expr_type nft_redir_inet_type; static const struct nft_expr_ops nft_redir_inet_ops = { .type = &nft_redir_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_inet_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_inet_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_inet_type __read_mostly = { @@ -236,7 +211,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, - .maxattr = NFTA_MASQ_MAX, + .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; @@ -292,3 +267,4 @@ module_exit(nft_redir_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_EXPR("redir"); +MODULE_DESCRIPTION("Netfilter nftables redirect support"); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 00f865fb80ca..196a92c7ea09 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -18,19 +18,19 @@ #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT)); + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); @@ -39,6 +39,7 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -46,9 +47,17 @@ int nft_reject_init(const struct nft_ctx *ctx, priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; case NFT_REJECT_TCP_RST: break; default: @@ -59,7 +68,8 @@ int nft_reject_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_reject_init); -int nft_reject_dump(struct sk_buff *skb, const struct 
nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); @@ -68,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; @@ -119,3 +130,4 @@ EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Netfilter x_tables over nftables module"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index f41f414b72d1..49020e67304a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), nft_sk(pkt), + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, @@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), nft_sk(pkt), + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach6(nft_net(pkt), pkt->skb, @@ -58,60 +60,15 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; } -static int nft_reject_inet_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_reject_inet_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) { - struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - - icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - if (priv->type == NFT_REJECT_ICMPX_UNREACH && - icmp_code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; - - priv->icmp_code = icmp_code; - break; - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - return 0; -} - -static int nft_reject_inet_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - case NFT_REJECT_ICMPX_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - default: - break; - } - - return 0; - -nla_put_failure: - return -1; + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_INGRESS)); } static struct nft_expr_type nft_reject_inet_type; @@ -119,9 +76,10 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_inet_init, - .dump = nft_reject_inet_dump, - .validate = nft_reject_validate, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_inet_validate, + 
.reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { @@ -149,3 +107,4 @@ module_exit(nft_reject_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); +MODULE_DESCRIPTION("Netfilter nftables reject inet support"); diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c new file mode 100644 index 000000000000..2558ce1505d9 --- /dev/null +++ b/net/netfilter/nft_reject_netdev.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com> + * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net> + */ + +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb) +{ + dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol), + eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest, + nskb->len); + dev_queue_xmit(nskb); +} + +static void nft_reject_netdev_send_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_send_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + + +static void nft_reject_netdev_send_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) +{ + struct sk_buff *nskb; + + nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code); + if (!nskb) + return; + + nft_reject_queue_xmit(nskb, oldskb); +} + +static void nft_reject_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct ethhdr *eth = eth_hdr(pkt->skb); + struct nft_reject *priv = nft_expr_priv(expr); + const unsigned char *dest = eth->h_dest; + + if (is_broadcast_ether_addr(dest) || + is_multicast_ether_addr(dest)) + goto out; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; + case htons(ETH_P_IPV6): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + 
nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt)); + break; + case NFT_REJECT_ICMPX_UNREACH: + nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb, + nft_in(pkt), + nft_hook(pkt), + nft_reject_icmpv6_code(priv->icmp_code)); + break; + } + break; + default: + /* No explicit way to reject this protocol, drop it. */ + break; + } +out: + regs->verdict.code = NF_DROP; +} + +static int nft_reject_netdev_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + +static struct nft_expr_type nft_reject_netdev_type; +static const struct nft_expr_ops nft_reject_netdev_ops = { + .type = &nft_reject_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_netdev_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, + .validate = nft_reject_netdev_validate, + .reduce = NFT_REDUCE_READONLY, +}; + +static struct nft_expr_type nft_reject_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "reject", + .ops = &nft_reject_netdev_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_netdev_module_init(void) +{ + return nft_register_expr(&nft_reject_netdev_type); +} + +static void __exit nft_reject_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_reject_netdev_type); +} + +module_init(nft_reject_netdev_module_init); +module_exit(nft_reject_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>"); +MODULE_AUTHOR("Jose M. 
Guisado <guigom@riseup.net>"); +MODULE_DESCRIPTION("Reject packets from netdev via nftables"); +MODULE_ALIAS_NFT_AF_EXPR(5, "reject"); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 7cfcb0e2f7ee..dc50b9a5bd68 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -15,7 +15,7 @@ struct nft_rt { enum nft_rt_keys key:8; - enum nft_registers dreg:8; + u8 dreg; }; static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) @@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; - memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; @@ -104,7 +104,7 @@ err: static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = { .type = NLA_U32 }, - [NFTA_RT_KEY] = { .type = NLA_U32 }, + [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_rt_get_init(const struct nft_ctx *ctx, @@ -141,13 +141,12 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_rt_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_rt *priv = nft_expr_priv(expr); @@ -161,12 +160,16 @@ nla_put_failure: return -1; } -static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->key) { case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: @@ -192,6 +195,7 @@ static const struct nft_expr_ops nft_rt_get_ops = { .init = nft_rt_get_init, .dump = nft_rt_get_dump, .validate = nft_rt_validate, + .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_rt_type __read_mostly = { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 2a81ea421819..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -13,6 +13,7 @@ #include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { + struct nft_elem_priv priv; struct list_head head; struct nft_set_ext ext; }; @@ -21,7 +22,7 @@ struct nft_bitmap_elem { * the element state in the current and the future generation. * * An element can be in three states. The generation cursor is represented using - * the ^ character, note that this cursor shifts on every succesful transaction. + * the ^ character, note that this cursor shifts on every successful transaction. * If no transaction is going on, we observe all elements are in the following * state: * @@ -39,7 +40,7 @@ struct nft_bitmap_elem { * 10 = this element is active in the current generation and it becomes inactive * ^ in the next one. This happens when the element is deactivated but commit * path has not yet been executed yet, so removal is still pending. 
On - * transation abortion, the next generation bit is reset to go back to + * transaction abortion, the next generation bit is reset to go back to * restore its previous state. */ struct nft_bitmap { @@ -73,26 +74,34 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) return (bitmap[idx] & (0x3 << off)) & (genmask << off); } -static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { const struct nft_bitmap *priv = nft_set_priv(set); + static const struct nft_set_ext found; u8 genmask = nft_genmask_cur(net); u32 idx, off; nft_bitmap_location(set, key, &idx, &off); - return nft_bitmap_active(priv->bitmap, idx, off, genmask); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return &found; + + return NULL; } static struct nft_bitmap_elem * -nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, +nft_bitmap_elem_find(const struct net *net, + const struct nft_set *set, struct nft_bitmap_elem *this, u8 genmask) { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(net)->commit_mutex)) { if (memcmp(nft_set_ext_key(&be->ext), nft_set_ext_key(&this->ext), set->klen) || !nft_set_elem_active(&be->ext, genmask)) @@ -103,8 +112,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } -static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -115,23 +125,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, !nft_set_elem_active(&be->ext, genmask)) continue; - return be; + return &be->priv; } return ERR_PTR(-ENOENT); } static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; - be = nft_bitmap_elem_find(set, new, genmask); + be = nft_bitmap_elem_find(net, set, new, genmask); if (be) { - *ext = &be->ext; + *elem_priv = &be->priv; return -EEXIST; } @@ -143,12 +153,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, return 0; } -static void nft_bitmap_remove(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_bitmap_remove(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -160,47 +169,46 @@ static void nft_bitmap_remove(const struct net *net, static void nft_bitmap_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct 
nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } -static bool nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *_be) +static void nft_bitmap_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_bitmap_elem *be = _be; u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - - return true; } -static void *nft_bitmap_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; nft_bitmap_location(set, elem->key.val.data, &idx, &off); - be = nft_bitmap_elem_find(set, this, genmask); + be = nft_bitmap_elem_find(net, set, this, genmask); if (!be) return NULL; @@ -208,7 +216,7 @@ static void *nft_bitmap_deactivate(const struct net *net, priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - return be; + return &be->priv; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -217,17 +225,13 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - struct nft_set_elem elem; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; - elem.priv = be; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &be->priv); if (iter->err < 0) return; @@ -264,19 +268,22 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, &be->priv); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 4d3f147e8d8d..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -24,10 +24,14 @@ struct nft_rhash { struct rhashtable ht; struct 
delayed_work gc_work; + u32 wq_gc_seq; }; struct nft_rhash_elem { + struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; + u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -35,6 +39,7 @@ struct nft_rhash_cmp_arg { const struct nft_set *set; const u32 *key; u8 genmask; + u64 tstamp; }; static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed) @@ -59,7 +64,9 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; - if (nft_set_elem_expired(&he->ext)) + if (nft_set_elem_is_dead(&he->ext)) + return 1; + if (__nft_set_elem_expired(&he->ext, x->tstamp)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) return 1; @@ -74,8 +81,10 @@ static const struct rhashtable_params nft_rhash_params = { .automatic_shrinking = true, }; -static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -83,17 +92,19 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - *ext = &he->ext; + return &he->ext; - return !!he; + return NULL; } -static void *nft_rhash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -101,39 +112,40 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = elem->key.val.data, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - return he; + return &he->priv; return ERR_PTR(-ENOENT); } -static bool nft_rhash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) +static const struct nft_set_ext * +nft_rhash_update(struct nft_set *set, const u32 *key, + const struct nft_expr *expr, struct nft_regs *regs) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; + struct nft_elem_priv *elem_priv; struct nft_rhash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; - he = new(set, expr, regs); - if (he == NULL) + elem_priv = nft_dynset_new(set, expr, regs); + if (!elem_priv) goto err1; + he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -141,69 +153,67 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); + atomic_dec(&set->nelems); he = prev; } out: - *ext = &he->ext; 
- return true; + return &he->ext; err2: - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); + atomic_dec(&set->nelems); err1: - return false; + return NULL; } static int nft_rhash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) return PTR_ERR(prev); if (prev) { - *ext = &prev->ext; + *elem_priv = &prev->priv; return -EEXIST; } return 0; } static void nft_rhash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = elem->priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); + nft_clear(net, &he->ext); } -static bool nft_rhash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rhash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); } -static void *nft_rhash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rhash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -211,25 +221,25 @@ static void *nft_rhash_deactivate(const struct net *net, .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); - return he; + return &he->priv; } static void nft_rhash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } @@ -249,16 +259,17 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; - struct nft_set_elem 
elem; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -275,14 +286,8 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; - - elem.priv = he; - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) break; @@ -293,51 +298,199 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. 
+ */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + +static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, + struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + u32 size; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + if (expr->ops->gc && + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) + return true; + } + + return false; +} + static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; + + /* Elements never collected use a zero gc worker sequence number. */ + if (unlikely(++priv->wq_gc_seq == 0)) + priv->wq_gc_seq++; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } - if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { - struct nft_expr *expr = nft_set_ext_expr(&he->ext); - - if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } - if (!nft_set_elem_expired(&he->ext)) - continue; -gc: - if (nft_set_elem_mark_busy(&he->ext)) + + /* rhashtable walk is unstable, already seen in this gc run? + * Then, skip this element. In case of (unlikely) sequence + * wraparound and stale element wq_gc_seq, next gc run will + * just find this expired element. + */ + if (he->wq_gc_seq == priv->wq_gc_seq) continue; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && + nft_rhash_expr_needs_gc_run(set, &he->ext)) + goto needs_gc_run; + + if (!nft_set_elem_expired(&he->ext)) + continue; +needs_gc_run: + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + /* annotate gc sequence for this attempt. */ + he->wq_gc_seq = priv->wq_gc_seq; + nft_trans_gc_elem_add(gc, he); } + + gc = nft_trans_gc_catchall_async(gc, gc_seq); + +try_later: + /* catchall list iteration requires rcu read side lock. 
*/ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -364,6 +517,8 @@ static int nft_rhash_init(const struct nft_set *set, struct rhashtable_params params = nft_rhash_params; int err; + BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0); + params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT; params.key_len = set->klen; @@ -372,30 +527,50 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + struct nft_rhash_elem *he = ptr; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } +/* Number of buckets is stored in u32, so cap our result to 1U<<31 */ +#define NFT_MAX_BUCKETS (1U << 31) + static u32 nft_hash_buckets(u32 size) { - return roundup_pow_of_two(size * 4 / 3); + u64 val = div_u64((u64)size * 4, 3); + + if (val >= NFT_MAX_BUCKETS) + return NFT_MAX_BUCKETS; + + return roundup_pow_of_two(val); } static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, @@ -415,12 +590,15 @@ struct nft_hash { }; struct nft_hash_elem { + struct nft_elem_priv priv; struct hlist_node node; struct nft_set_ext ext; }; -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -431,16 +609,15 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } -static void *nft_hash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -452,14 +629,15 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && nft_set_elem_active(&he->ext, genmask)) - return he; + return &he->priv; } 
return ERR_PTR(-ENOENT); } -static bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -472,12 +650,10 @@ static bool nft_hash_lookup_fast(const struct net *net, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { k2 = *(u32 *)nft_set_ext_key(&he->ext)->data; if (k1 == k2 && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, @@ -499,9 +675,9 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { - struct nft_hash_elem *this = elem->priv, *he; + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); u32 hash; @@ -511,7 +687,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, if (!memcmp(nft_set_ext_key(&this->ext), nft_set_ext_key(&he->ext), set->klen) && nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; + *elem_priv = &he->priv; return -EEXIST; } } @@ -520,28 +696,28 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, } static void nft_hash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } -static bool nft_hash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_hash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); - return true; } -static void *nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_hash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *this = elem->priv, *he; u8 genmask = nft_genmask_next(net); u32 hash; @@ -551,7 +727,7 @@ static void *nft_hash_deactivate(const struct net *net, set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); - return he; + return &he->priv; } } return NULL; @@ -559,9 +735,9 @@ static void *nft_hash_deactivate(const struct net *net, static void nft_hash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); hlist_del_rcu(&he->node); } @@ -571,19 +747,15 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, { 
struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; - struct nft_set_elem elem; int i; for (i = 0; i < priv->buckets; i++) { - hlist_for_each_entry_rcu(he, &priv->table[i], node) { + hlist_for_each_entry_rcu(he, &priv->table[i], node, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; - - elem.priv = he; - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) return; cont: @@ -596,7 +768,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head); } static int nft_hash_init(const struct nft_set *set, @@ -611,7 +783,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -621,7 +794,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, &he->priv); } } } @@ -636,8 +809,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return false; est->size = sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + - desc->size * sizeof(struct nft_hash_elem); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + (u64)desc->size * sizeof(struct nft_hash_elem); est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; @@ -654,8 +827,8 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return false; est->size = sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + - desc->size * sizeof(struct nft_hash_elem); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + (u64)desc->size * sizeof(struct nft_hash_elem); est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 8b5acc6910fd..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. 
- * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include <linux/kernel.h> @@ -342,9 +342,6 @@ #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); - /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits @@ -362,11 +359,13 @@ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only) +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, + const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; - int k, ret = -1; + unsigned int k; + int ret = -1; for (k = 0; k < len; k++) { bitset = map[k]; @@ -398,41 +397,47 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, } /** - * nft_pipapo_lookup() - Lookup function - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference + * pipapo_get_slow() - Get matching element reference given key data + * @m: storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: timestamp to check for expired elements * * For more details, see DOC: Theory of Operation. * - * Return: true on match, false otherwise. + * This is the main lookup function. It matches key data against either + * the working match set or the uncommitted copy, depending on what the + * caller passed to us. + * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup + * (datapath lookup) pass the active copy. + * The insertion path will pass the uncommitted working copy. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res_map, *fill_map; - u8 genmask = nft_genmask_cur(net); - const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; + unsigned long *res_map, *fill_map, *map; + struct nft_pipapo_scratch *scratch; + const struct nft_pipapo_field *f; bool map_index; int i; local_bh_disable(); - map_index = raw_cpu_read(nft_pipapo_scratch_index); - - m = rcu_dereference(priv->match); - - if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) goto out; + __local_lock_nested_bh(&scratch->bh_lock); + + map_index = scratch->map_index; - res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); - fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res_map = map + (map_index ? m->bsize_max : 0); + fill_map = map + (map_index ? 
0 : m->bsize_max); - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -442,12 +447,12 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, * packet bytes value, then AND bucket value */ if (likely(f->bb == 8)) - pipapo_and_field_buckets_8bit(f, res_map, rp); + pipapo_and_field_buckets_8bit(f, res_map, data); else - pipapo_and_field_buckets_4bit(f, res_map, rp); + pipapo_and_field_buckets_4bit(f, res_map, data); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; - rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); + data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -460,16 +465,19 @@ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); - return false; + return NULL; } if (last) { - *ext = &f->mt[b].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) + struct nft_pipapo_elem *e; + + e = f->mt[b].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) goto next_match; /* Last field: we're just returning the key without @@ -477,10 +485,10 @@ next_match: * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); - - return true; + return e; } /* Swap bitmap indices: res_map is the initial bitmap for the @@ -490,119 +498,202 @@ next_match: map_index = !map_index; swap(res_map, fill_map); - rp += NFT_PIPAPO_GROUPS_PADDING(f); + data += NFT_PIPAPO_GROUPS_PADDING(f); } + __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); - return false; + return NULL; } /** * pipapo_get() - Get matching element reference given key data - * @net: Network namespace - * @set: nftables API set representation + * @m: Storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * - * This is essentially the same as the lookup function, except that it matches - * key data against the uncommitted copy and doesn't use preallocated maps for - * bitmap results. + * This is a dispatcher function, either calling out the generic C + * implementation or, if available, the AVX2 one. + * This helper is only called from the control plane, with either RCU + * read lock or transaction mutex held. * - * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
*/ -static struct nft_pipapo_elem *pipapo_get(const struct net *net, - const struct nft_set *set, - const u8 *data, u8 genmask) +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; - unsigned long *res_map, *fill_map = NULL; - struct nft_pipapo_field *f; - int i; + struct nft_pipapo_elem *e; - res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); - if (!res_map) { - ret = ERR_PTR(-ENOMEM); - goto out; - } + local_bh_disable(); - fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); - if (!fill_map) { - ret = ERR_PTR(-ENOMEM); - goto out; +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { + e = pipapo_get_avx2(m, data, genmask, tstamp); + local_bh_enable(); + return e; } +#endif + e = pipapo_get_slow(m, data, genmask, tstamp); + local_bh_enable(); + return e; +} - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); +/** + * nft_pipapo_lookup() - Dataplane fronted for main lookup function + * @net: Network namespace + * @set: nftables API set representation + * @key: pointer to nft registers containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy. + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but thew + * newly matching entries are unreachable. + * + * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. + * + * Return: ntables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const struct nft_pipapo_elem *e; - nft_pipapo_for_each_field(f, i, m) { - bool last = i == m->field_count - 1; - int b; + m = rcu_dereference(priv->match); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); - /* For each bit group: select lookup table bucket depending on - * packet bytes value, then AND bucket value - */ - if (f->bb == 8) - pipapo_and_field_buckets_8bit(f, res_map, data); - else if (f->bb == 4) - pipapo_and_field_buckets_4bit(f, res_map, data); - else - BUG(); + return e ? &e->ext : NULL; +} - data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + * + * This function is called from the control plane path under + * RCU read lock. + * + * Return: set element private pointer or ERR_PTR(-ENOENT). 
+ */ +static struct nft_elem_priv * +nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = rcu_dereference(priv->match); + struct nft_pipapo_elem *e; - /* Now populate the bitmap for the next field, unless this is - * the last field, in which case return the matched 'ext' - * pointer if any. - * - * Now res_map contains the matching bitmap, and fill_map is the - * bitmap for the next field. - */ -next_match: - b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, - last); - if (b < 0) - goto out; + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_cur(net), get_jiffies_64()); + if (!e) + return ERR_PTR(-ENOENT); - if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && - !nft_set_elem_active(&f->mt[b].e->ext, genmask))) - goto next_match; + return &e->priv; +} - ret = f->mt[b].e; - goto out; - } +/** + * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize + * @f: Field containing mapping table + * @old_rules: Amount of existing mapped rules + * @rules: Amount of new rules to map + * + * Return: 0 on success, negative error code on failure. + */ +static int pipapo_realloc_mt(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) +{ + union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; + const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); + unsigned int rules_alloc = rules; - data += NFT_PIPAPO_GROUPS_PADDING(f); + might_sleep(); - /* Swap bitmap indices: fill_map will be the initial bitmap for - * the next field (i.e. the new res_map), and res_map is - * guaranteed to be all-zeroes at this point, ready to be filled - * according to the next mapping table. - */ - swap(res_map, fill_map); + if (unlikely(rules == 0)) + goto out_free; + + /* growing and enough space left, no action needed */ + if (rules > old_rules && f->rules_alloc > rules) + return 0; + + /* downsize and extra slack has not grown too large */ + if (rules < old_rules) { + unsigned int remove = f->rules_alloc - rules; + + if (remove < (2u * extra)) + return 0; } -out: - kfree(fill_map); - kfree(res_map); - return ret; + /* If set needs more than one page of memory for rules then + * allocate another extra page to avoid frequent reallocation. 
+ */ + if (rules > extra && + check_add_overflow(rules, extra, &rules_alloc)) + return -EOVERFLOW; + + if (rules_alloc > (INT_MAX / sizeof(*new_mt))) + return -ENOMEM; + + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); + if (!new_mt) + return -ENOMEM; + + if (old_mt) + memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); + + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } +out_free: + f->rules_alloc = rules_alloc; + f->mt = new_mt; + + kvfree(old_mt); + + return 0; } + /** - * nft_pipapo_get() - Get matching element reference given key data - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * @flags: Unused + * lt_calculate_size() - Get storage size for lookup table with overflow check + * @groups: Amount of bit groups + * @bb: Number of bits grouped together in lookup table buckets + * @bsize: Size of each bucket in lookup table, in longs + * + * Return: allocation size including alignment overhead, negative on overflow */ -static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, + unsigned int bsize) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); + + if (check_mul_overflow(ret, bsize, &ret)) + return -1; + if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) + return -1; + if (ret > INT_MAX) + return -1; + + return ret; } /** @@ -617,12 +708,16 @@ static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, * * Return: 0 on success, -ENOMEM on allocation failure. 
*/ -static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +static int pipapo_resize(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) { long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; - union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; - size_t new_bucket_size, copy; - int group, bucket; + unsigned int new_bucket_size, copy; + int group, bucket, err; + ssize_t lt_size; + + if (rules >= NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); #ifdef NFT_PIPAPO_ALIGN @@ -638,10 +733,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * - new_bucket_size * sizeof(*new_lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); + if (lt_size < 0) + return -ENOMEM; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; @@ -662,27 +758,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) } mt: - new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); - if (!new_mt) { + err = pipapo_realloc_mt(f, old_rules, rules); + if (err) { kvfree(new_lt); - return -ENOMEM; - } - - memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); - if (rules > old_rules) { - memset(new_mt + old_rules, 0, - (rules - old_rules) * sizeof(*new_mt)); + return err; } if (new_lt) { f->bsize = new_bucket_size; - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; kvfree(old_lt); } - f->mt = new_mt; - kvfree(old_mt); - return 0; } @@ -833,9 +920,9 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize, */ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { + unsigned int groups, bb; unsigned long *new_lt; - int groups, bb; - size_t lt_size; + ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); @@ -845,15 +932,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. 
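pipapo_resize() and pipapo_lt_bits_adjust() above now size the lookup table through lt_calculate_size(), which rejects any result that overflows or exceeds INT_MAX instead of letting the multiplication wrap. A compilable sketch of the same computation, using the compiler overflow builtins in place of check_mul_overflow()/check_add_overflow() and assuming a 32-byte alignment headroom (the kernel's value depends on NFT_PIPAPO_ALIGN):

    #include <limits.h>
    #include <stdio.h>
    #include <sys/types.h>      /* ssize_t */

    #define ALIGN_HEADROOM 32   /* assumed value for the sketch */

    /* groups rows of (1 << bb) buckets, each bucket bsize longs wide */
    static ssize_t lt_size(unsigned int groups, unsigned int bb, unsigned int bsize)
    {
            ssize_t ret;

            if (__builtin_mul_overflow((ssize_t)groups * (1 << bb),
                                       (ssize_t)sizeof(long), &ret) ||
                __builtin_mul_overflow(ret, (ssize_t)bsize, &ret) ||
                __builtin_add_overflow(ret, (ssize_t)ALIGN_HEADROOM, &ret) ||
                ret > INT_MAX)
                    return -1;  /* overflow or unreasonably large table */

            return ret;
    }

    int main(void)
    {
            printf("%zd\n", lt_size(32, 4, 4));          /* 128-bit field: prints 16416 */
            printf("%zd\n", lt_size(255, 8, 1u << 24));  /* rejected: prints -1 */
            return 0;
    }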
@@ -864,7 +953,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -884,7 +973,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) f->groups = groups; f->bb = bb; kvfree(f->lt); - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; } /** @@ -901,12 +990,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret, bit_offset = 0; + unsigned int rule = f->rules, group, ret, bit_offset = 0; - ret = pipapo_resize(f, f->rules - 1, f->rules); + ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; + f->rules++; + for (group = 0; group < f->groups; group++) { int i, v; u8 mask; @@ -1051,7 +1142,9 @@ static int pipapo_expand(struct nft_pipapo_field *f, step++; if (step >= len) { if (!masks) { - pipapo_insert(f, base, 0); + err = pipapo_insert(f, base, 0); + if (err < 0) + return err; masks = 1; } goto out; @@ -1075,7 +1168,7 @@ out: * @m: Matching data, including mapping table * @map: Table of rule maps: array of first rule and amount of rules * in next field a given rule maps to, for each field - * @ext: For last field, nft_set_ext pointer matching rules map to + * @e: For last field, nft_set_ext pointer matching rules map to */ static void pipapo_map(struct nft_pipapo_match *m, union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], @@ -1097,9 +1190,23 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** + * pipapo_free_scratch() - Free per-CPU map at original address + * @m: Matching data + * @cpu: CPU number + */ +static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) +{ + struct nft_pipapo_scratch *s; + + s = *per_cpu_ptr(m->scratch, cpu); + + kvfree(s); +} + +/** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions - * @bsize_max Maximum bucket size, scratch maps cover two buckets + * @bsize_max: Maximum bucket size, scratch maps cover two buckets * * Return: 0 on success, -ENOMEM on failure. 
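The reworked pipapo_realloc_scratch() in the next hunk allocates, for each CPU, a single struct nft_pipapo_scratch (defined in the header changes further down) whose flexible array holds two bucket-sized bitmaps: the current result map and the fill map for the next field, selected by map_index. A small userspace model of that layout, where scratch_alloc()/scratch_res()/scratch_fill() are illustrative names and the kernel's local_lock and alignment headroom are omitted:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct scratch {
            bool map_index;         /* which half currently holds results */
            unsigned long map[];    /* 2 * bsize_max longs */
    };

    static struct scratch *scratch_alloc(size_t bsize_max)
    {
            return calloc(1, sizeof(struct scratch) +
                             2 * bsize_max * sizeof(unsigned long));
    }

    /* result map: the half selected by map_index */
    static unsigned long *scratch_res(struct scratch *s, size_t bsize_max)
    {
            return s->map + (s->map_index ? bsize_max : 0);
    }

    /* fill map for the next field: the other half */
    static unsigned long *scratch_fill(struct scratch *s, size_t bsize_max)
    {
            return s->map + (s->map_index ? 0 : bsize_max);
    }

    int main(void)
    {
            struct scratch *s = scratch_alloc(4);

            if (!s)
                    return 1;
            printf("res at offset %td, fill at offset %td\n",
                   scratch_res(s, 4) - s->map, scratch_fill(s, 4) - s->map);
            free(s);
            return 0;               /* res at offset 0, fill at offset 4 */
    }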
*/ @@ -1109,14 +1216,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, int i; for_each_possible_cpu(i) { - unsigned long *scratch; -#ifdef NFT_PIPAPO_ALIGN - unsigned long *scratch_aligned; -#endif + struct nft_pipapo_scratch *scratch; - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL, cpu_to_node(i)); + scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have @@ -1128,49 +1232,82 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return -ENOMEM; } - kfree(*per_cpu_ptr(clone->scratch, i)); - + pipapo_free_scratch(clone, i); + local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; - -#ifdef NFT_PIPAPO_ALIGN - scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); - *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; -#endif } return 0; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); + +/** + * pipapo_maybe_clone() - Build clone for pending data changes, if not existing + * @set: nftables API set representation + * + * Return: newly created or existing clone, if any. NULL on allocation failure + */ +static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + + if (priv->clone) + return priv->clone; + + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = pipapo_clone(m); + + return priv->clone; +} + /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data - * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element * * Return: 0 on success, error pointer on failure. 
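pipapo_maybe_clone() above creates the working copy lazily: the first modification in a transaction clones the active matching data, later changes in the same transaction reuse that clone, and the datapath keeps reading the untouched original until commit (nft_pipapo_commit() further down swaps the clone in and frees the old copy via call_rcu(); nft_pipapo_abort() simply discards it). A stripped-down, single-threaded model of the pattern, where struct match and match_clone() are placeholders and the kernel's transaction-mutex assertion and RCU publication are left out:

    #include <stdlib.h>
    #include <string.h>

    struct match { int nelems; };   /* placeholder for the real matching data */

    struct set_priv {
            struct match *active;   /* what lookups read */
            struct match *clone;    /* pending changes, NULL outside a transaction */
    };

    static struct match *match_clone(const struct match *old)
    {
            struct match *new = malloc(sizeof(*new));

            if (new)
                    memcpy(new, old, sizeof(*new));
            return new;
    }

    static struct match *maybe_clone(struct set_priv *p)
    {
            if (p->clone)                   /* already cloned for this transaction */
                    return p->clone;
            p->clone = match_clone(p->active);
            return p->clone;                /* NULL on allocation failure */
    }

    static void commit(struct set_priv *p)
    {
            if (!p->clone)
                    return;
            free(p->active);                /* the kernel defers this via call_rcu() */
            p->active = p->clone;
            p->clone = NULL;
    }

    static void abort_transaction(struct set_priv *p)
    {
            free(p->clone);
            p->clone = NULL;
    }

    int main(void)
    {
            struct match *m = calloc(1, sizeof(*m));
            struct set_priv p = { .active = m, .clone = NULL };

            if (!m || !maybe_clone(&p))
                    return 1;
            p.clone->nelems++;              /* mutate the copy, not the active data */
            commit(&p);
            abort_transaction(&p);          /* no-op: nothing pending */
            free(p.active);
            return 0;
    }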
*/ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext2) + struct nft_elem_priv **elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo_elem *e = elem->priv, *dup; - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); + struct nft_pipapo_elem *e, *dup; + u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; + const u8 *start_p, *end_p; int i, bsize_max, err = 0; + if (!m) + return -ENOMEM; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; - dup = pipapo_get(net, set, start, genmask); - if (!IS_ERR(dup)) { + dup = pipapo_get(m, start, genmask, tstamp); + if (dup) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1182,30 +1319,31 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { - *ext2 = &dup->ext; + *elem_priv = &dup->priv; return -EEXIST; } return -ENOTEMPTY; } - if (PTR_ERR(dup) == -ENOENT) { - /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net)); - } - - if (PTR_ERR(dup) != -ENOENT) { - if (IS_ERR(dup)) - return PTR_ERR(dup); - *ext2 = &dup->ext; + /* Look for partially overlapping entries */ + dup = pipapo_get(m, end, nft_genmask_next(net), tstamp); + if (dup) { + *elem_priv = &dup->priv; return -ENOTEMPTY; } /* Validate */ - nft_pipapo_for_each_field(f, i, m) { - const u8 *start_p = start, *end_p = end; + start_p = start; + end_p = end; + + /* some helpers return -1, or 0 >= for valid rule pos, + * so we cannot support more than INT_MAX rules at this time. 
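Before touching any table, nft_pipapo_insert() above probes the clone twice: a hit on the new start key either means the exact same element already exists (-EEXIST, reporting the existing element) or that the start falls inside another range (-ENOTEMPTY); if the start is free, a hit on the new end key still means a partial overlap (-ENOTEMPTY). A compilable sketch of that decision, with range_lookup() as a hypothetical stand-in for pipapo_get() and 2-byte big-endian keys:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define KLEN 2

    struct range { unsigned char start[KLEN], end[KLEN]; };

    /* one existing element: the range 0x1000 - 0x1fff */
    static const struct range existing[] = {
            { { 0x10, 0x00 }, { 0x1f, 0xff } },
    };

    /* hypothetical stand-in for pipapo_get(): match 'key' against known ranges */
    static const struct range *range_lookup(const unsigned char *key)
    {
            for (size_t i = 0; i < sizeof(existing) / sizeof(existing[0]); i++) {
                    if (memcmp(key, existing[i].start, KLEN) >= 0 &&
                        memcmp(key, existing[i].end, KLEN) <= 0)
                            return &existing[i];
            }
            return NULL;
    }

    static int check_insert(const unsigned char *start, const unsigned char *end)
    {
            const struct range *dup = range_lookup(start);

            if (dup) {
                    if (!memcmp(start, dup->start, KLEN) &&
                        !memcmp(end, dup->end, KLEN))
                            return -EEXIST;  /* identical element already in the set */
                    return -ENOTEMPTY;       /* new start lies inside an existing range */
            }

            if (range_lookup(end))
                    return -ENOTEMPTY;       /* ranges overlap partially */

            return 0;
    }

    int main(void)
    {
            const unsigned char a[KLEN] = { 0x20, 0x00 }, b[KLEN] = { 0x2f, 0xff };
            const unsigned char c[KLEN] = { 0x18, 0x00 }, d[KLEN] = { 0x28, 0x00 };

            printf("%d %d\n", check_insert(a, b), check_insert(c, d));
            return 0;       /* prints 0 and -ENOTEMPTY */
    }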
+ */ + BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); - if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + nft_pipapo_for_each_field(f, i, m) { + if (f->rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; if (memcmp(start_p, end_p, @@ -1217,8 +1355,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Insert */ - priv->dirty = true; - bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { @@ -1233,6 +1369,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else ret = pipapo_expand(f, start, end, f->groups * f->bb); + if (ret < 0) + return ret; + if (f->bsize > bsize_max) bsize_max = f->bsize; @@ -1242,17 +1381,20 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } - if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + put_cpu_ptr(m->scratch); + err = pipapo_realloc_scratch(m, bsize_max); if (err) return err; - this_cpu_write(nft_pipapo_scratch_index, false); - m->bsize_max = bsize_max; + } else { + put_cpu_ptr(m->scratch); } - *ext2 = &e->ext; + e = nft_elem_priv_cast(elem->priv); + *elem_priv = &e->priv; pipapo_map(m, rulemap, e); @@ -1263,7 +1405,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * - * Return: copy of matching data passed as 'old', error pointer on failure + * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { @@ -1271,10 +1413,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, - GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) - return ERR_PTR(-ENOMEM); + return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; @@ -1283,11 +1424,11 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; -#ifdef NFT_PIPAPO_ALIGN - new->scratch_aligned = alloc_percpu(*new->scratch_aligned); - if (!new->scratch_aligned) - goto out_scratch; -#endif + for_each_possible_cpu(i) + *per_cpu_ptr(new->scratch, i) = NULL; + + if (pipapo_realloc_scratch(new, old->bsize_max)) + goto out_scratch_realloc; rcu_head_init(&new->rcu); @@ -1296,28 +1437,41 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; + ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * - src->bsize * sizeof(*dst->lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); + if (lt_size < 0) + goto out_lt; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; - NFT_PIPAPO_LT_ASSIGN(dst, new_lt); + dst->lt = new_lt; memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * src->groups * NFT_PIPAPO_BUCKETS(src->bb)); - dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); - if (!dst->mt) - goto out_mt; + if (src->rules > 0) { + if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) + goto out_mt; + + dst->mt = kvmalloc_array(src->rules_alloc, + 
sizeof(*src->mt), + GFP_KERNEL_ACCOUNT); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + } else { + dst->mt = NULL; + dst->rules_alloc = 0; + } - memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); src++; dst++; } @@ -1332,14 +1486,14 @@ out_lt: kvfree(dst->lt); dst--; } -#ifdef NFT_PIPAPO_ALIGN - free_percpu(new->scratch_aligned); -#endif +out_scratch_realloc: + for_each_possible_cpu(i) + pipapo_free_scratch(new, i); out_scratch: free_percpu(new->scratch); kfree(new); - return ERR_PTR(-ENOMEM); + return NULL; } /** @@ -1371,10 +1525,10 @@ out_scratch: * * Return: Number of rules that originated from the same entry as @first. */ -static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) { struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ - int r; + unsigned int r; for (r = first; r < f->rules; r++) { if (r != first && e != f->mt[r].e) @@ -1427,8 +1581,9 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) * 0 1 2 * element pointers: 0x42 0x42 0x44 */ -static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, - int start, int n, int to_offset, bool is_last) +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, + unsigned int start, unsigned int n, + unsigned int to_offset, bool is_last) { int i; @@ -1445,7 +1600,7 @@ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, /** * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map * @m: Matching data - * @rulemap Table of rule maps, arrays of first rule and amount of rules + * @rulemap: Table of rule maps, arrays of first rule and amount of rules * in next field a given entry maps to, for each field * * For each rule in lookup table buckets mapping to this set of rules, drop @@ -1518,21 +1673,35 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) + +{ + nft_setelem_data_deactivate(net, set, &e->priv); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); - int rules_f0, first_rule = 0; + struct net *net = read_pnet(&set->net); + unsigned int rules_f0, first_rule = 0; + u64 tstamp = nft_net_tstamp(net); + struct nft_pipapo_elem *e; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; - struct nft_pipapo_field *f; - struct nft_pipapo_elem *e; - int i, start, rules_fx; + const struct nft_pipapo_field *f; + unsigned int i, start, rules_fx; start = first_rule; rules_fx = rules_f0; @@ -1551,13 +1720,18 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { - priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. 
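Both pipapo_gc() and nft_pipapo_remove() walk the first field in runs returned by pipapo_rules_same_key() above: the number of consecutive rules, starting at a given position, that still belong to the element owning that position. A compilable sketch of the run counting over a plain array standing in for the field's mapping table:

    #include <stdio.h>

    struct elem { int id; };

    static unsigned int rules_same_key(const struct elem *const *mt,
                                       unsigned int rules, unsigned int first)
    {
            const struct elem *e = NULL;
            unsigned int r;

            for (r = first; r < rules; r++) {
                    if (r != first && e != mt[r])
                            break;          /* next element starts here */
                    e = mt[r];
            }
            return r - first;               /* 0 once 'first' is past the last rule */
    }

    int main(void)
    {
            struct elem a = { 1 }, b = { 2 };
            const struct elem *mt[] = { &a, &a, &b, &b, &b };

            printf("%u %u %u\n",
                   rules_same_key(mt, 5, 0),    /* 2: rules 0-1 belong to 'a' */
                   rules_same_key(mt, 5, 2),    /* 3: rules 2-4 belong to 'b' */
                   rules_same_key(mt, 5, 5));   /* 0: no rules left */
            return 0;
    }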
+ */ + if (__nft_set_elem_expired(&e->ext, tstamp)) { + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. @@ -1567,7 +1741,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - priv->last_gc = jiffies; + gc = nft_trans_gc_catchall_sync(gc); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1585,32 +1763,33 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) - kfree(*per_cpu_ptr(m->scratch, i)); + pipapo_free_scratch(m, i); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); } /** - * pipapo_commit() - Replace lookup data with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1620,108 +1799,91 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *old; - - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + struct nft_pipapo_match *old; - if (!priv->dirty) + if (!priv->clone) return; - new_clone = pipapo_clone(priv->clone); - if (IS_ERR(new_clone)) - return; + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); - priv->dirty = false; + old = rcu_replace_pointer(priv->match, priv->clone, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = NULL; - old = rcu_access_pointer(priv->match); - rcu_assign_pointer(priv->match, priv->clone); if (old) call_rcu(&old->rcu, pipapo_reclaim_match); +} + +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); - priv->clone = new_clone; + if (!priv->clone) + return; + pipapo_free_match(priv->clone); + priv->clone = NULL; } /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. 
Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; - - nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); + nft_clear(net, &e->ext); } /** - * pipapo_deactivate() - Check that element is in set, mark as inactive + * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation - * @data: Input key data - * @ext: nftables API extension pointer, used to check for end element - * - * This is a convenience function that can be called from both - * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same - * operation. + * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ -static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, - const u8 *data, const struct nft_set_ext *ext) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net)); - if (IS_ERR(e)) + /* removal must occur on priv->clone, if we are low on memory + * we have no choice and must fail the removal request. + */ + if (!m) return NULL; - nft_set_elem_change_active(net, set, &e->ext); - - return e; -} + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net)); + if (!e) + return NULL; -/** - * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * - * Return: deactivated element if found, NULL otherwise. - */ -static void *nft_pipapo_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + nft_set_elem_change_active(net, set, &e->ext); - return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); + return &e->priv; } /** - * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * This is functionally the same as nft_pipapo_deactivate(), with a slightly * different interface, and it's also called once for each element in a set @@ -1735,13 +1897,12 @@ static void *nft_pipapo_deactivate(const struct net *net, * * Return: true if element was found and deactivated. 
*/ -static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, - void *elem) +static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e = elem; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), - &e->ext); + nft_set_elem_change_active(net, set, &e->ext); } /** @@ -1864,7 +2025,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * nft_pipapo_remove() - Remove element given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * Similarly to nft_pipapo_activate(), this is used as commit operation by the * API, but it's called once per element in the pending transaction, so we can't @@ -1872,20 +2033,17 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * the matched element here, if any, and commit the updated matching data. */ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; - struct nft_pipapo_elem *e = elem->priv; - int rules_f0, first_rule = 0; + unsigned int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; const u8 *data; + e = nft_elem_priv_cast(elem_priv); data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; @@ -1893,12 +2051,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, int i, start, rules_fx; match_start = data; - match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + else + match_end = data; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -1911,49 +2075,42 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - pipapo_commit(set); - return; + if (last && f->mt[rulemap[i].to].e == e) { + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** - * nft_pipapo_walk() - Walk over elements + * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation + * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first - * field. + * field. @m is protected either by RCU read lock or by transaction mutex. 
*/ -static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_pipapo_match *m, + struct nft_set_iter *iter) { - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; - int i, r; - - rcu_read_lock(); - m = rcu_dereference(priv->match); - - if (unlikely(!m)) - goto out; + const struct nft_pipapo_field *f; + unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; - struct nft_set_elem elem; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; @@ -1962,21 +2119,52 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, goto cont; e = f->mt[r].e; - if (nft_set_elem_expired(&e->ext)) - goto cont; - - elem.priv = e; - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) - goto out; + return; cont: iter->count++; } +} -out: - rcu_read_unlock(); +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * Test if destructive action is needed or not, clone active backend if needed + * and call the real function to work on the data. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + + switch (iter->type) { + case NFT_ITER_UPDATE: + m = pipapo_maybe_clone(set); + if (!m) { + iter->err = -ENOMEM; + return; + } + + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_READ: + rcu_read_lock(); + m = rcu_dereference(priv->match); + nft_pipapo_do_walk(ctx, set, m, iter); + rcu_read_unlock(); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } /** @@ -2039,20 +2227,24 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo_field *f; int err, i, field_count; + BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); + field_count = desc->field_count ? : 1; + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); + if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, - GFP_KERNEL); + m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; m->field_count = field_count; m->bsize_max = 0; - m->scratch = alloc_percpu(unsigned long *); + m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; @@ -2060,20 +2252,14 @@ static int nft_pipapo_init(const struct nft_set *set, for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; -#ifdef NFT_PIPAPO_ALIGN - m->scratch_aligned = alloc_percpu(unsigned long *); - if (!m->scratch_aligned) { - err = -ENOMEM; - goto out_free; - } - for_each_possible_cpu(i) - *per_cpu_ptr(m->scratch_aligned, i) = NULL; -#endif - rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { - int len = desc->field_len[i] ? : set->klen; + unsigned int len = desc->field_len[i] ? 
: set->klen; + + /* f->groups is u8 */ + BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * + BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); f->bb = NFT_PIPAPO_GROUP_BITS_INIT; f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); @@ -2082,28 +2268,15 @@ static int nft_pipapo_init(const struct nft_set *set, f->bsize = 0; f->rules = 0; - NFT_PIPAPO_LT_ASSIGN(f, NULL); + f->rules_alloc = 0; + f->lt = NULL; f->mt = NULL; } - /* Create an initial clone of matching data for next insertion */ - priv->clone = pipapo_clone(m); - if (IS_ERR(priv->clone)) { - err = PTR_ERR(priv->clone); - goto out_free; - } - - priv->dirty = false; - rcu_assign_pointer(priv->match, m); return 0; -out_free: -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif - free_percpu(m->scratch); out_scratch: kfree(m); @@ -2111,57 +2284,55 @@ out_scratch: } /** - * nft_pipapo_destroy() - Free private data for set and all committed elements + * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation + * @m: matching data pointing to key mapping array */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + struct nft_pipapo_match *m) { - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m; struct nft_pipapo_field *f; - int i, r, cpu; + unsigned int i, r; - m = rcu_dereference_protected(priv->match, true); - if (m) { - rcu_barrier(); + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; - for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) - ; + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; - for (r = 0; r < f->rules; r++) { - struct nft_pipapo_elem *e; + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; - if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) - continue; + e = f->mt[r].e; - e = f->mt[r].e; + nf_tables_set_elem_destroy(ctx, set, &e->priv); + } +} - nft_set_elem_destroy(set, e, true); - } +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(m->scratch, cpu)); - free_percpu(m->scratch); - pipapo_free_fields(m); - kfree(m); - priv->match = NULL; - } + m = rcu_dereference_protected(priv->match, true); if (priv->clone) { -#ifdef NFT_PIPAPO_ALIGN - free_percpu(priv->clone->scratch_aligned); -#endif - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); - free_percpu(priv->clone->scratch); - - pipapo_free_fields(priv->clone); - kfree(priv->clone); + nft_set_pipapo_match_destroy(ctx, set, priv->clone); + pipapo_free_match(priv->clone); priv->clone = NULL; + } else { + nft_set_pipapo_match_destroy(ctx, set, m); } + + pipapo_free_match(m); } /** @@ -2197,6 +2368,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2219,6 +2392,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, 
+ .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 25a75591583e..eaab422aa56a 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -70,15 +70,9 @@ #define NFT_PIPAPO_ALIGN_HEADROOM \ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) #define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) -#define NFT_PIPAPO_LT_ASSIGN(field, x) \ - do { \ - (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \ - (field)->lt = (x); \ - } while (0) #else #define NFT_PIPAPO_ALIGN_HEADROOM 0 #define NFT_PIPAPO_LT_ALIGN(lt) (lt) -#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x)) #endif /* NFT_PIPAPO_ALIGN */ #define nft_pipapo_for_each_field(field, index, match) \ @@ -110,44 +104,50 @@ union nft_pipapo_map_bucket { /** * struct nft_pipapo_field - Lookup, mapping tables and related data for a field - * @groups: Amount of bit groups * @rules: Number of inserted rules * @bsize: Size of each bucket in lookup table, in longs + * @rules_alloc: Number of allocated rules, always >= rules + * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @lt: Lookup table: 'groups' rows of buckets - * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes * @mt: Mapping table: one bucket per rule */ struct nft_pipapo_field { - int groups; - unsigned long rules; - size_t bsize; - int bb; -#ifdef NFT_PIPAPO_ALIGN - unsigned long *lt_aligned; -#endif + unsigned int rules; + unsigned int bsize; + unsigned int rules_alloc; + u8 groups; + u8 bb; unsigned long *lt; union nft_pipapo_map_bucket *mt; }; /** + * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @bh_lock: PREEMPT_RT local spinlock + * @map_index: Current working bitmap index, toggled between field matches + * @__map: store partial matching results during lookup + */ +struct nft_pipapo_scratch { + local_lock_t bh_lock; + u8 map_index; + unsigned long __map[]; +}; + +/** * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set - * @scratch: Preallocated per-CPU maps for partial matching results - * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes + * @field_count: Amount of fields in set * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits + * @scratch: Preallocated per-CPU maps for partial matching results + * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { - int field_count; -#ifdef NFT_PIPAPO_ALIGN - unsigned long * __percpu *scratch_aligned; -#endif - unsigned long * __percpu *scratch; - size_t bsize_max; + u8 field_count; + unsigned int bsize_max; + struct nft_pipapo_scratch * __percpu *scratch; struct rcu_head rcu; - struct nft_pipapo_field f[]; + struct nft_pipapo_field f[] __counted_by(field_count); }; /** @@ -155,14 +155,12 @@ struct nft_pipapo_match { * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; - bool dirty; unsigned long last_gc; }; @@ -170,14 +168,17 @@ struct 
nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element + * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { - struct nft_set_ext ext; + struct nft_elem_priv priv; + struct nft_set_ext ext; }; -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only); +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, + const union nft_pipapo_map_bucket *mt, bool match_only); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets @@ -185,7 +186,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { @@ -213,7 +214,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { @@ -277,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) return size; } +/** + * pipapo_resmap_init() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Initialize all bits covered by the first field to one, so that after + * the first step, only the matching bits of the first bit group remain. + * + * If other fields have a large bitmap, set remainder of res_map to 0. + */ +static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + for (i = 0; i < f->bsize; i++) + res_map[i] = ULONG_MAX; + + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} #endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index d65ae0e23028..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load @@ -71,9 +71,6 @@ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); - /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * @@ -142,7 +139,6 @@ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers - * @len: Length of bitmap in longs * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage @@ -216,8 +212,9 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map, * word index to be checked next (i.e. first filled word). 
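The pipapo_resmap_init() helper added above prepares the scratch result map before the first field is matched: every long covered by the first field starts as all-ones, so ANDing in the first bucket leaves exactly its matching bits, and any longs beyond that field's bucket size, up to the widest field, are cleared. A tiny compilable sketch of the same initialisation:

    #include <limits.h>
    #include <stdio.h>

    static void resmap_init(unsigned long *res, unsigned int first_bsize,
                            unsigned int bsize_max)
    {
            unsigned int i;

            for (i = 0; i < first_bsize; i++)
                    res[i] = ULONG_MAX;
            for (i = first_bsize; i < bsize_max; i++)
                    res[i] = 0;
    }

    int main(void)
    {
            unsigned long res[4];

            resmap_init(res, 2, 4);
            printf("%lx %lx %lx %lx\n", res[0], res[1], res[2], res[3]);
            return 0;       /* all-ones, all-ones, 0, 0 */
    }

The AVX2 counterpart added further below (pipapo_resmap_init_avx2()) keeps only the second loop; per the comment in that hunk the all-ones pass is not needed there, presumably because the first-field AVX2 helpers store their bucket intersection without reading the previous map contents.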
*/ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; @@ -278,8 +275,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; @@ -354,8 +352,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -449,8 +448,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -538,8 +538,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -673,8 +674,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -730,8 +732,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -794,8 +797,9 @@ nothing: * word index to be checked next (i.e. first filled word). 
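The ..._4b_... helpers above split each packet byte into two 4-bit groups (the pg[] arrays) and use each nibble as a bucket index into the field's lookup table; the ..._8b_... variants index with whole bytes. A small sketch of the nibble split and the resulting bucket intersection, with a toy lookup table of one unsigned long per bucket, whereas the kernel uses bsize longs per bucket and AND's them long by long:

    #include <stdio.h>

    #define BUCKETS_4BIT 16

    int main(void)
    {
            /* toy lookup table: one row of 16 buckets per 4-bit group */
            unsigned long lt[2][BUCKETS_4BIT] = { { 0 } };
            unsigned char pkt0 = 0xc3;
            unsigned char pg[2] = { pkt0 >> 4, pkt0 & 0xf };
            unsigned long res;

            lt[0][0xc] = 0x5;       /* rules matching high nibble 0xc */
            lt[1][0x3] = 0x6;       /* rules matching low nibble 0x3 */

            /* intersect the buckets selected by each group, as
             * pipapo_and_field_buckets_4bit() does for real tables
             */
            res = lt[0][pg[0]] & lt[1][pg[1]];
            printf("matching rules bitmap: 0x%lx\n", res);  /* 0x4: rule 2 */
            return 0;
    }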
*/ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -869,8 +873,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -887,7 +892,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); - NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(7, 2, 3); /* Stall */ @@ -954,8 +959,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -988,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); @@ -1030,6 +1037,7 @@ nothing: /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables @@ -1045,17 +1053,17 @@ nothing: * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). 
*/ -static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) +static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, + unsigned long *map, unsigned long *fill, + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { - unsigned long *lt = f->lt, bsize = f->bsize; + unsigned long bsize = f->bsize; int i, ret = -1, b; - lt += offset * NFT_PIPAPO_LONGS_PER_M256; - if (first) - memset(map, 0xff, bsize * sizeof(*map)); + pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) @@ -1091,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + if (!boot_cpu_has(X86_FEATURE_AVX2)) return false; est->size = pipapo_estimate_size(desc); @@ -1106,57 +1114,78 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** + * pipapo_get_avx2() - Lookup function for AVX2 implementation + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * - * Return: true on match, false otherwise. + * The caller must check that the FPU is usable. + * This function must be called with BH disabled. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
*/ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res, *fill, *scratch; - u8 genmask = nft_genmask_cur(net); - const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; + struct nft_pipapo_scratch *scratch; + const struct nft_pipapo_field *f; + unsigned long *res, *fill, *map; bool map_index; - int i, ret = 0; + int i; - m = rcu_dereference(priv->match); + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + return NULL; - /* This also protects access to all data related to scratch maps */ - kernel_fpu_begin(); + __local_lock_nested_bh(&scratch->bh_lock); + map_index = scratch->map_index; + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res = map + (map_index ? m->bsize_max : 0); + fill = map + (map_index ? 0 : m->bsize_max); - scratch = *raw_cpu_ptr(m->scratch_aligned); - if (unlikely(!scratch)) { - kernel_fpu_end(); - return false; - } - map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); - - res = scratch + (map_index ? m->bsize_max : 0); - fill = scratch + (map_index ? 0 : m->bsize_max); + pipapo_resmap_init_avx2(m, res); - /* Starting map doesn't need to be set for this implementation */ + /* Note that we don't need a valid MXCSR state for any of the + * operations we use here, so pass 0 as mask and spare a LDMXCSR + * instruction. + */ + kernel_fpu_begin_mask(0); nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; + int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ - ret, rp, \ + ret, data, \ first, last)) if (likely(f->bb == 8)) { @@ -1171,8 +1200,8 @@ next_match: } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, - ret, rp, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, + ret, data, first, last); } } else { @@ -1187,8 +1216,8 @@ next_match: } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, - ret, rp, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, + ret, data, first, last); } } @@ -1196,28 +1225,78 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP - if (ret < 0) - goto out; +next_match: + if (ret < 0) { + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; + } if (last) { - *ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) { - ret = 0; + struct nft_pipapo_elem *e; + + e = f->mt[ret].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; } - goto out; + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return e; } + map_index = !map_index; swap(res, fill); - rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } -out: - if (i % 2) - raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index); kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; +} + +/** + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 
implementation + * @net: Network namespace + * @set: nftables API set representation + * @key: nftables API element representation containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy using + * the AVX2 routines if the FPU is usable or fall back to the generic + * implementation of the algorithm otherwise. + * + * Return: nftables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const u8 *rp = (const u8 *)key; + const struct nft_pipapo_elem *e; + + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + const struct nft_set_ext *ext; + + ext = nft_pipapo_lookup(net, set, key); + + local_bh_enable(); + return ext; + } + + m = rcu_dereference(priv->match); + + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); + local_bh_enable(); - return ret >= 0; + return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index 394bcb704db7..c2999b63da3f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,10 +5,12 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +struct nft_pipapo_match; bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ #endif /* _NFT_SET_PIPAPO_AVX2_H */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 62f416bc0579..ca594161b840 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -18,11 +18,12 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_t count; - struct delayed_work gc_work; + seqcount_rwlock_t count; + unsigned long last_gc; }; struct nft_rbtree_elem { + struct nft_elem_priv priv; struct rb_node node; struct nft_set_ext ext; }; @@ -38,40 +39,47 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); } -static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext, - unsigned int seq) +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext); +} + +static const struct nft_set_ext * +__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = 
rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) - return false; + return NULL; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -80,50 +88,47 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) - return false; + if (nft_rbtree_elem_expired(rbe)) + return NULL; if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) - return false; + return NULL; parent = rcu_dereference_raw(parent->rb_left); interval = NULL; continue; } - *ext = &rbe->ext; - return true; + return &rbe->ext; } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && - nft_rbtree_interval_start(interval)) { - *ext = &interval->ext; - return true; - } + nft_rbtree_interval_start(interval)) + return &interval->ext; - return false; + return NULL; } -static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); - bool ret; + const struct nft_set_ext *ext; - ret = __nft_rbtree_lookup(net, set, key, ext, seq); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return ret; + ext = __nft_rbtree_lookup(net, set, key, seq); + if (ext || !read_seqcount_retry(&priv->count, seq)) + return ext; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_lookup(net, set, key, ext, seq); + ext = __nft_rbtree_lookup(net, set, key, seq); read_unlock_bh(&priv->lock); - return ret; + return ext; } static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, @@ -190,8 +195,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, return false; } -static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); @@ -202,118 +208,274 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); if (ret || !read_seqcount_retry(&priv->count, seq)) - return rbe; + return &rbe->priv; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (!ret) - rbe = ERR_PTR(-ENOENT); read_unlock_bh(&priv->lock); - return rbe; + if (!ret) + return ERR_PTR(-ENOENT); + + return &rbe->priv; +} + +static 
void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + lockdep_assert_held_write(&priv->lock); + nft_setelem_data_deactivate(net, set, &rbe->priv); + rb_erase(&rbe->node, &priv->root); +} + +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); + struct nft_rbtree_elem *rbe_prev; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) + return ERR_PTR(-ENOMEM); + + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. + */ + while (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) + break; + + prev = rb_prev(prev); + } + + rbe_prev = NULL; + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); + + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); + + nft_trans_gc_elem_add(gc, rbe_prev); + } + + nft_rbtree_gc_elem_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); + + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); + + return rbe_prev; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. + */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - bool overlap = false; + u64 tstamp = nft_net_tstamp(net); int d; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! 
>|_ _ __| (insert start after existing end) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found. Note that end elements are - * always inserted after corresponding start elements. - * - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., - * in that order. - * - * The flag is also cleared in two special cases: - * - * b4. |__ _ _!|<_ _ _ (insert start right before existing end) - * b5. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. */ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask)) - overlap = false; - } else { - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask); - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask); - } else if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask)) { - overlap = true; - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } - if (nft_set_elem_active(&rbe->ext, genmask)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - p = &parent->rb_left; + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. + */ + for (node = first; node != NULL; node = next) { + next = rb_next(node); + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (__nft_set_elem_expired(&rbe->ext, tstamp) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. 
*/ + if (!rbe_ge) { + rbe_ge = rbe; + continue; } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; + } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; + } + } else if (d > 0) { + /* annotate element greater than the new element. */ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } } - if (overlap) + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *elem_priv = &rbe_ge->priv; + return -EEXIST; + } + + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *elem_priv = &rbe_le->priv; + return -EEXIST; + } + + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. 
+ */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -321,66 +483,74 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; int err; + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, elem_priv); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); + + return err; +} + +static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) +{ write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); + rb_erase(&rbe->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - - return err; } static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + nft_rbtree_erase(priv, rbe); } static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = elem->priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); + nft_clear(net, &rbe->ext); } -static bool nft_rbtree_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rbtree_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * 
+nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; while (parent != NULL) { @@ -401,106 +571,132 @@ static void *nft_rbtree_deactivate(const struct net *net, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; + } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { + break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; } - nft_rbtree_flush(net, set, rbe); - return rbe; + nft_rbtree_flush(net, set, &rbe->priv); + return &rbe->priv; } } return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; - struct nft_set_elem elem; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; - elem.priv = rbe; - - iter->err = iter->fn(ctx, set, iter, &elem); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + iter->err = iter->fn(ctx, set, iter, &rbe->priv); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); } -static void nft_rbtree_gc(struct work_struct *work) +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; - struct nft_rbtree *priv; - struct rb_node *node; - struct nft_set *set; + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + nft_setelem_data_deactivate(net, set, &rbe->priv); + nft_rbtree_erase(priv, rbe); +} + +static void nft_rbtree_gc(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); + struct rb_node *node, *next; + struct nft_trans_gc *gc; - priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + for (node = rb_first(&priv->root); node ; node = next) { + next = rb_next(node); - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); + /* elements are 
reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } - if (!nft_set_elem_expired(&rbe->ext)) - continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; - - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + goto try_later; + /* end element needs to be removed first, it has + * no timeout extension. + */ if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); + nft_rbtree_gc_remove(net, set, priv, rbe_end); + nft_trans_gc_elem_add(gc, rbe_end); rbe_end = NULL; } - node = rb_next(node); - if (!node) - break; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + goto try_later; + + nft_rbtree_gc_remove(net, set, priv, rbe); + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); - nft_set_gc_batch_complete(gcb); +try_later: - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -515,30 +711,26 @@ static int nft_rbtree_init(const struct nft_set *set, { struct nft_rbtree *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); + rwlock_init(&priv->lock); - seqcount_init(&priv->count); + seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); - if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); - return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } } @@ -560,6 +752,61 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_rbtree_commit(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + nft_rbtree_gc(set); +} + +static void nft_rbtree_gc_init(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +/* rbtree stores ranges as singleton elements, each range is composed of two + * elements ... + */ +static u32 nft_rbtree_ksize(u32 size) +{ + return size * 2; +} + +/* ... hide this detail to userspace. 
*/ +static u32 nft_rbtree_usize(u32 size) +{ + if (!size) + return 0; + + return size / 2; +} + +static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + const void *key; + + node = rb_last(&priv->root); + if (!node) + return 0; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_rbtree_interval_end(rbe)) + return 0; + + key = nft_set_ext_key(&rbe->ext); + if (memchr(key, 1, set->klen)) + return 0; + + /* this is the all-zero no-match element. */ + return 1; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -573,8 +820,13 @@ const struct nft_set_type nft_set_rbtree_type = { .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, + .commit = nft_rbtree_commit, + .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, + .ksize = nft_rbtree_ksize, + .usize = nft_rbtree_usize, + .adjust_maxsize = nft_rbtree_adjust_maxsize, }, }; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 637ce3e8c575..36affbb697c2 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,11 +9,101 @@ struct nft_socket { enum nft_socket_keys key:8; + u8 level; /* cgroupv2 level to extract */ + u8 level_user; /* cgroupv2 level provided by userspace */ + u8 len; union { - enum nft_registers dreg:8; + u8 dreg; }; }; +static void nft_socket_wildcard(const struct nft_pktinfo *pkt, + struct nft_regs *regs, struct sock *sk, + u32 *dest) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr)); + break; +#endif + default: + regs->verdict.code = NFT_BREAK; + return; + } +} + +#ifdef CONFIG_SOCK_CGROUP_DATA +static noinline bool +nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level) +{ + struct cgroup *cgrp; + u64 cgid; + + if (!sk_fullsock(sk)) + return false; + + cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level); + if (!cgrp) + return false; + + cgid = cgroup_id(cgrp); + memcpy(dest, &cgid, sizeof(u64)); + return true; +} + +/* process context only, uses current->nsproxy. 
*/ +static noinline int nft_socket_cgroup_subtree_level(void) +{ + struct cgroup *cgrp = cgroup_get_from_path("/"); + int level; + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + level = cgrp->level; + + cgroup_put(cgrp); + + if (level > 255) + return -ERANGE; + + if (WARN_ON_ONCE(level < 0)) + return -EINVAL; + + return level; +} +#endif + +static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sk_buff *skb = pkt->skb; + struct sock *sk = NULL; + + if (!indev) + return NULL; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } + + return sk; +} + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -27,20 +117,7 @@ static void nft_socket_eval(const struct nft_expr *expr, sk = NULL; if (!sk) - switch(nft_pf(pkt)) { - case NFPROTO_IPV4: - sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); - break; -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - case NFPROTO_IPV6: - sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); - break; -#endif - default: - WARN_ON_ONCE(1); - regs->verdict.code = NFT_BREAK; - return; - } + sk = nft_socket_do_lookup(pkt); if (!sk) { regs->verdict.code = NFT_BREAK; @@ -53,24 +130,41 @@ static void nft_socket_eval(const struct nft_expr *expr, break; case NFT_SOCKET_MARK: if (sk_fullsock(sk)) { - *dest = sk->sk_mark; + *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; + } + break; + case NFT_SOCKET_WILDCARD: + if (!sk_fullsock(sk)) { + regs->verdict.code = NFT_BREAK; + goto out_put_sk; } + nft_socket_wildcard(pkt, regs, sk, dest); break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case NFT_SOCKET_CGROUPV2: + if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) { + regs->verdict.code = NFT_BREAK; + goto out_put_sk; + } + break; +#endif default: WARN_ON(1); regs->verdict.code = NFT_BREAK; } +out_put_sk: if (sk != skb->sk) sock_gen_put(sk); } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { - [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_socket_init(const struct nft_ctx *ctx, @@ -94,35 +188,106 @@ static int nft_socket_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); + priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY])); switch(priv->key) { case NFT_SOCKET_TRANSPARENT: + case NFT_SOCKET_WILDCARD: len = sizeof(u8); break; case NFT_SOCKET_MARK: len = sizeof(u32); break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case NFT_SOCKET_CGROUPV2: { + unsigned int level; + int err; + + if (!tb[NFTA_SOCKET_LEVEL]) + return -EINVAL; + + level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL])); + if (level > 255) + return -EOPNOTSUPP; + + err = nft_socket_cgroup_subtree_level(); + if (err < 0) + return err; + + priv->level_user = level; + + level += err; + /* Implies a giant cgroup tree */ + if (level > 255) + return -EOPNOTSUPP; + + priv->level = level; + len = sizeof(u64); + break; + } +#endif default: return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); - return nft_validate_register_store(ctx, 
priv->dreg, NULL, - NFT_DATA_VALUE, len); + priv->len = len; + return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_socket_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_socket *priv = nft_expr_priv(expr); - if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key))) + if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key))) return -1; if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; + if (priv->key == NFT_SOCKET_CGROUPV2 && + nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user))) + return -1; return 0; } +static bool nft_socket_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_socket *priv = nft_expr_priv(expr); + const struct nft_socket *socket; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + socket = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != socket->key || + priv->dreg != socket->dreg || + priv->level != socket->level) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} + +static int nft_socket_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT)); +} + static struct nft_expr_type nft_socket_type; static const struct nft_expr_ops nft_socket_ops = { .type = &nft_socket_type, @@ -130,6 +295,8 @@ static const struct nft_expr_ops nft_socket_ops = { .eval = nft_socket_eval, .init = nft_socket_init, .dump = nft_socket_dump, + .validate = nft_socket_validate, + .reduce = nft_socket_reduce, }; static struct nft_expr_type nft_socket_type __read_mostly = { diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index e2c1fc608841..5d3e51825985 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; @@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, return; } - tcp = skb_header_pointer(skb, pkt->xt.thoff, + tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { @@ -186,13 +186,14 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; err = nf_synproxy_ipv6_init(snet, ctx->net); - if (err) + if (err) { + nf_synproxy_ipv4_fini(snet, ctx->net); goto nf_ct_failure; + } break; } @@ -217,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; @@ -248,9 +248,13 @@ static void nft_synproxy_eval(const struct nft_expr *expr, } static int nft_synproxy_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data 
**data) + const struct nft_expr *expr) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } @@ -270,7 +274,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nft_synproxy_do_destroy(ctx); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); @@ -286,6 +291,7 @@ static const struct nft_expr_ops nft_synproxy_ops = { .dump = nft_synproxy_dump, .type = &nft_synproxy_type, .validate = nft_synproxy_validate, + .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_synproxy_type __read_mostly = { @@ -388,3 +394,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); +MODULE_DESCRIPTION("nftables SYNPROXY expression support"); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index d67f83a0958d..50481280abd2 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -13,9 +13,9 @@ #endif struct nft_tproxy { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_port:8; - u8 family; + u8 sreg_addr; + u8 sreg_port; + u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, @@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { + regs->verdict.code = NFT_BREAK; + return; + } + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; @@ -46,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); if (priv->sreg_addr) - taddr = regs->data[priv->sreg_addr]; + taddr = nft_reg_load_be32(®s->data[priv->sreg_addr]); taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = nft_reg_load16(®s->data[priv->sreg_port]); + tport = nft_reg_load_be16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -82,16 +88,17 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; + struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; memset(&taddr, 0, sizeof(taddr)); - if (!pkt->tprot_set) { + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } @@ -117,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = nft_reg_load16(®s->data[priv->sreg_port]); + tport = nft_reg_load_be16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -176,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { - [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 }, + [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; @@ -247,15 +254,15 @@ static int 
nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, alen); + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR], + &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]); - err = nft_validate_register_load(priv->sreg_port, sizeof(u16)); + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT], + &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } @@ -263,8 +270,31 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, return 0; } +static void nft_tproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + + switch (priv->family) { + case NFPROTO_IPV4: + nf_defrag_ipv4_disable(ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_defrag_ipv6_disable(ctx->net); + break; +#endif + case NFPROTO_UNSPEC: + nf_defrag_ipv4_disable(ctx->net); +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + nf_defrag_ipv6_disable(ctx->net); +#endif + break; + } +} + static int nft_tproxy_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); @@ -282,13 +312,27 @@ static int nft_tproxy_dump(struct sk_buff *skb, return 0; } +static int nft_tproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); +} + static struct nft_expr_type nft_tproxy_type; static const struct nft_expr_ops nft_tproxy_ops = { .type = &nft_tproxy_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), .eval = nft_tproxy_eval, .init = nft_tproxy_init, + .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, + .reduce = NFT_REDUCE_READONLY, + .validate = nft_tproxy_validate, }; static struct nft_expr_type nft_tproxy_type __read_mostly = { diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 30be5787fbde..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -15,8 +15,9 @@ struct nft_tunnel { enum nft_tunnel_keys key:8; - enum nft_registers dreg:8; + u8 dreg; enum nft_tunnel_mode mode:8; + u8 len; }; static void nft_tunnel_get_eval(const struct nft_expr *expr, @@ -65,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { - [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, - [NFTA_TUNNEL_MODE] = { .type = NLA_U32 }, + [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, @@ -93,8 +94,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); - if (tb[NFTA_TUNNEL_MODE]) { priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); if (priv->mode > NFT_TUNNEL_MODE_MAX) @@ -103,12 +102,13 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, priv->mode = NFT_TUNNEL_MODE_NONE; } - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + priv->len = len; + return nft_parse_register_store(ctx, 
tb[NFTA_TUNNEL_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_tunnel_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tunnel *priv = nft_expr_priv(expr); @@ -124,6 +124,31 @@ nla_put_failure: return -1; } +static bool nft_tunnel_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_tunnel *priv = nft_expr_priv(expr); + const struct nft_tunnel *tunnel; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + tunnel = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != tunnel->key || + priv->dreg != tunnel->dreg || + priv->mode != tunnel->mode) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return false; +} + static struct nft_expr_type nft_tunnel_type; static const struct nft_expr_ops nft_tunnel_get_ops = { .type = &nft_tunnel_type, @@ -131,10 +156,12 @@ static const struct nft_expr_ops nft_tunnel_get_ops = { .eval = nft_tunnel_get_eval, .init = nft_tunnel_get_init, .dump = nft_tunnel_get_dump, + .reduce = nft_tunnel_get_reduce, }; static struct nft_expr_type nft_tunnel_type __read_mostly = { .name = "tunnel", + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_get_ops, .policy = nft_tunnel_policy, .maxattr = NFTA_TUNNEL_MAX, @@ -147,8 +174,8 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; + IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; - __be16 flags; }; struct nft_tunnel_obj { @@ -244,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); - opts->flags = TUNNEL_VXLAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } @@ -298,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); - opts->flags = TUNNEL_ERSPAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } @@ -306,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -339,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); - opts->flags = TUNNEL_GENEVE_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } @@ -357,8 +387,9 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct ip_tunnel_info *info, struct nft_tunnel_opts 
*opts) { - int err, rem, type = 0; struct nlattr *nla; + int err, rem; + u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); @@ -373,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) @@ -381,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; @@ -426,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); - info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); @@ -455,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) - info.key.tun_flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) - info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) - info.key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); - if (tb[NFTA_TUNNEL_KEY_TTL]) - info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]); - else - info.key.ttl = U8_MAX; + info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX); if (tb[NFTA_TUNNEL_KEY_OPTS]) { err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], @@ -475,13 +506,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return err; } - md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, + GFP_KERNEL_ACCOUNT); if (!md) return -ENOMEM; memcpy(&md->u.tun_info, &info, sizeof(info)); #ifdef CONFIG_DST_CACHE - err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL); + err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); if (err < 0) { metadata_dst_free(md); return err; @@ -555,7 +587,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!nest) return -1; - if (opts->flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; @@ -563,7 +595,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto 
failure; @@ -585,15 +617,15 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_GENEVE_OPT) { + } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, @@ -602,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; @@ -630,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb, { u32 flags = 0; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; - if (!(info->key.tun_flags & TUNNEL_CSUM)) + if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; - if (info->key.tun_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) @@ -685,6 +717,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = { static struct nft_object_type nft_tunnel_obj_type __read_mostly = { .type = NFT_OBJECT_TUNNEL, + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_obj_ops, .maxattr = NFTA_TUNNEL_KEY_MAX, .policy = nft_tunnel_key_policy, @@ -719,3 +752,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("tunnel"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL); +MODULE_DESCRIPTION("nftables tunnel expression support"); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 06d5cabf1d7c..3210cfc966ab 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -16,17 +16,18 @@ #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { - [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, - [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; - enum nft_registers dreg:8; + u8 dreg; u8 dir; u8 spnum; + u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, @@ -50,7 +51,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY])); + priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: @@ -86,9 +87,9 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx, priv->spnum = spnum; - priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + priv->len = len; + return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current @@ -111,7 +112,8 @@ static bool xfrm_state_addr_ok(enum 
nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, @@ -133,13 +135,13 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: - *dest = state->id.daddr.a4; + *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: - *dest = state->props.saddr.a4; + *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); @@ -148,7 +150,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: - *dest = state->id.spi; + *dest = (__force __u32)state->id.spi; return; } @@ -211,7 +213,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr, } static int nft_xfrm_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); @@ -228,12 +230,16 @@ static int nft_xfrm_get_dump(struct sk_buff *skb, return 0; } -static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | @@ -253,6 +259,31 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e return nft_chain_validate_hooks(ctx->chain, hooks); } +static bool nft_xfrm_reduce(struct nft_regs_track *track, + const struct nft_expr *expr) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + const struct nft_xfrm *xfrm; + + if (!nft_reg_track_cmp(track, expr, priv->dreg)) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + xfrm = nft_expr_priv(track->regs[priv->dreg].selector); + if (priv->key != xfrm->key || + priv->dreg != xfrm->dreg || + priv->dir != xfrm->dir || + priv->spnum != xfrm->spnum) { + nft_reg_track_update(track, expr, priv->dreg, priv->len); + return false; + } + + if (!track->regs[priv->dreg].bitwise) + return true; + + return nft_expr_reduce_bitwise(track, expr); +} static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { @@ -262,6 +293,7 @@ static const struct nft_expr_ops nft_xfrm_get_ops = { .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, + .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 51b454d8fa9c..008419db815a 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -25,7 +25,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; @@ -51,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: 
if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); @@ -79,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -106,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, @@ -179,39 +179,54 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, } EXPORT_SYMBOL_GPL(nf_route); -static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +/* Only get and check the lengths, not do any hop-by-hop stuff. */ +int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { -#ifdef CONFIG_INET - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); + int len, off = sizeof(struct ipv6hdr); + unsigned char *nh; + + if (!pskb_may_pull(skb, off + 8)) + return -ENOMEM; + nh = (unsigned char *)(ipv6_hdr(skb) + 1); + len = (nh[1] + 1) << 3; + + if (!pskb_may_pull(skb, off + len)) + return -ENOMEM; + nh = skb_network_header(skb); + + off += 2; + len -= 2; + while (len > 0) { + int optlen; + + if (nh[off] == IPV6_TLV_PAD1) { + off++; + len--; + continue; + } + if (len < 2) + return -EBADMSG; + optlen = nh[off + 1] + 2; + if (optlen > len) + return -EBADMSG; + + if (nh[off] == IPV6_TLV_JUMBO) { + u32 pkt_len; + + if (nh[off + 1] != 4 || (off & 3) != 2) + return -EBADMSG; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + return -EBADMSG; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + return -EBADMSG; + *plen = pkt_len; + } + off += optlen; + len -= optlen; } -#endif - return 0; -} - -int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) -{ - const struct nf_ipv6_ops *v6ops; - int ret = 0; - switch (entry->state.pf) { - case AF_INET: - ret = nf_ip_reroute(skb, entry); - break; - case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - ret = v6ops->reroute(skb, entry); - break; - } - return ret; + return len ? 
-EBADMSG : 0; } +EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 99a468be4a59..90b7630421c4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -24,6 +24,7 @@ #include <linux/audit.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp.h> @@ -38,6 +39,24 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) +struct xt_template { + struct list_head list; + + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); + + struct module *me; + + /* A unique name... */ + char name[XT_TABLE_MAXNAMELEN]; +}; + +static struct list_head xt_templates[NFPROTO_NUMPROTO]; + +struct xt_pernet { + struct list_head tables[NFPROTO_NUMPROTO]; +}; + struct compat_delta { unsigned int offset; /* offset in kernel */ int delta; /* delta in 32bit user land */ @@ -47,7 +66,7 @@ struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct mutex compat_mutex; struct compat_delta *compat_tab; unsigned int number; /* number of slots in compat_tab[] */ @@ -55,7 +74,8 @@ struct xt_af { #endif }; -static struct xt_af *xt; +static unsigned int xt_pernet_id __read_mostly; +static struct xt_af *xt __read_mostly; static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_UNSPEC] = "x", @@ -330,6 +350,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_match *m; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) @@ -338,6 +359,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -350,6 +372,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_target *t; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) @@ -358,6 +381,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -371,12 +395,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); - mutex_unlock(&xt[af].mutex); /* Nothing at all? Return 0 to try loading module. 
*/ if (best == -1) { @@ -639,7 +661,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, return usersize == kernsize && strnlen(msg, msglen) < msglen; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; @@ -731,7 +753,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, { const struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; - int pad, off = xt_compat_match_offset(match); + int off = xt_compat_match_offset(match); u_int16_t msize = cm->u.user.match_size; char name[sizeof(m->u.user.name)]; @@ -741,15 +763,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, match->compat_from_user(m->data, cm->data); else memcpy(m->data, cm->data, msize - sizeof(*cm)); - pad = XT_ALIGN(match->matchsize) - match->matchsize; - if (pad > 0) - memset(m->data + match->matchsize, 0, pad); msize += off; m->u.user.match_size = msize; - strlcpy(name, match->name, sizeof(name)); + strscpy(name, match->name, sizeof(name)); module_put(match->me); - strncpy(m->u.user.name, name, sizeof(m->u.user.name)); + strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; @@ -845,7 +864,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, __alignof__(struct compat_xt_entry_match)); } EXPORT_SYMBOL(xt_compat_check_entry_offsets); -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ /** * xt_check_entry_offsets - validate arp/ip/ip6t_entry @@ -863,7 +882,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets); * match structures are aligned, and that the last structure ends where * the target structure begins. * - * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version. + * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version. * * The arp/ip/ip6t_entry structure @base must have passed following tests: * - it must point to a valid memory location @@ -1028,34 +1047,34 @@ int xt_check_target(struct xt_tgchk_param *par, EXPORT_SYMBOL_GPL(xt_check_target); /** - * xt_copy_counters_from_user - copy counters and metadata from userspace + * xt_copy_counters - copy counters and metadata from a sockptr_t * - * @user: src pointer to userspace memory + * @arg: src sockptr * @len: alleged size of userspace memory * @info: where to store the xt_counters_info metadata - * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel * * Copies counter meta data from @user and stores it in @info. * * vmallocs memory to hold the counters, then copies the counter data * from @user to the new memory and returns a pointer to it. * - * If @compat is true, @info gets converted automatically to the 64bit - * representation. + * If called from a compat syscall, @info gets converted automatically to the + * 64bit representation. * * The metadata associated with the counters is stored in @info. * * Return: returns pointer that caller has to test via IS_ERR(). * If IS_ERR is false, caller has to vfree the pointer. 
*/ -void *xt_copy_counters_from_user(const void __user *user, unsigned int len, - struct xt_counters_info *info, bool compat) +void *xt_copy_counters(sockptr_t arg, unsigned int len, + struct xt_counters_info *info) { + size_t offset; void *mem; u64 size; -#ifdef CONFIG_COMPAT - if (compat) { +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT + if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; @@ -1063,12 +1082,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(compat_tmp); - if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) + if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; - user += sizeof(compat_tmp); + offset = sizeof(compat_tmp); } else #endif { @@ -1076,10 +1095,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(*info); - if (copy_from_user(info, user, sizeof(*info)) != 0) + if (copy_from_sockptr(info, arg, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - user += sizeof(*info); + offset = sizeof(*info); } info->name[sizeof(info->name) - 1] = '\0'; @@ -1093,15 +1112,15 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (!mem) return ERR_PTR(-ENOMEM); - if (copy_from_user(mem, user, len) == 0) + if (copy_from_sockptr_offset(mem, arg, offset, len) == 0) return mem; vfree(mem); return ERR_PTR(-EFAULT); } -EXPORT_SYMBOL_GPL(xt_copy_counters_from_user); +EXPORT_SYMBOL_GPL(xt_copy_counters); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? 
: target->targetsize; @@ -1114,7 +1133,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, { const struct xt_target *target = t->u.kernel.target; struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; - int pad, off = xt_compat_target_offset(target); + int off = xt_compat_target_offset(target); u_int16_t tsize = ct->u.user.target_size; char name[sizeof(t->u.user.name)]; @@ -1123,16 +1142,14 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, if (target->compat_from_user) target->compat_from_user(t->data, ct->data); else - memcpy(t->data, ct->data, tsize - sizeof(*ct)); - pad = XT_ALIGN(target->targetsize) - target->targetsize; - if (pad > 0) - memset(t->data + target->targetsize, 0, pad); + unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct), + /* UAPI 0-sized destination */); tsize += off; t->u.user.target_size = tsize; - strlcpy(name, target->name, sizeof(name)); + strscpy(name, target->name, sizeof(name)); module_put(target->me); - strncpy(t->u.user.name, name, sizeof(t->u.user.name)); + strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; @@ -1197,50 +1214,65 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); +struct xt_table *xt_find_table(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + return t; + } + } + mutex_unlock(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL(xt_find_table); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { - struct xt_table *t, *found = NULL; + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct module *owner = NULL; + struct xt_template *tmpl; + struct xt_table *t; mutex_lock(&xt[af].mutex); - list_for_each_entry(t, &net->xt.tables[af], list) + list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; - if (net == &init_net) - goto out; - - /* Table doesn't exist in this netns, re-try init */ - list_for_each_entry(t, &init_net.xt.tables[af], list) { + /* Table doesn't exist in this netns, check larval list */ + list_for_each_entry(tmpl, &xt_templates[af], list) { int err; - if (strcmp(t->name, name)) + if (strcmp(tmpl->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(tmpl->me)) goto out; + + owner = tmpl->me; + mutex_unlock(&xt[af].mutex); - err = t->table_init(net); + err = tmpl->table_init(net); if (err < 0) { - module_put(t->me); + module_put(owner); return ERR_PTR(err); } - found = t; - mutex_lock(&xt[af].mutex); break; } - if (!found) - goto out; - /* and once again: */ - list_for_each_entry(t, &net->xt.tables[af], list) - if (strcmp(t->name, name) == 0) + list_for_each_entry(t, &xt_net->tables[af], list) + if (strcmp(t->name, name) == 0 && owner == t->me) return t; - module_put(found->me); + module_put(owner); out: mutex_unlock(&xt[af].mutex); return ERR_PTR(-ENOENT); @@ -1271,7 +1303,7 @@ void xt_table_unlock(struct xt_table *table) } EXPORT_SYMBOL_GPL(xt_table_unlock); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); @@ -1285,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af) 
EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif -DEFINE_PER_CPU(seqcount_t, xt_recseq); -EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); - struct static_key xt_tee_enabled __read_mostly; EXPORT_SYMBOL_GPL(xt_tee_enabled); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; @@ -1387,7 +1420,7 @@ xt_replace_table(struct xt_table *table, table->private = newinfo; /* make sure all cpus see new ->private value */ - smp_wmb(); + smp_mb(); /* * Even though table entries have now been swapped, other CPU's @@ -1410,7 +1443,8 @@ xt_replace_table(struct xt_table *table, audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE); + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -1420,9 +1454,10 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { - int ret; + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table_info *private; struct xt_table *t, *table; + int ret; /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); @@ -1433,7 +1468,7 @@ struct xt_table *xt_register_table(struct net *net, mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ - list_for_each_entry(t, &net->xt.tables[table->af], list) { + list_for_each_entry(t, &xt_net->tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; @@ -1452,7 +1487,7 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; - list_add(&table->list, &net->xt.tables[table->af]); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; @@ -1473,34 +1508,42 @@ void *xt_unregister_table(struct xt_table *table) list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER); + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + kfree(table->ops); kfree(table); return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); +#endif #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + struct xt_pernet *xt_net; + + xt_net = net_generic(net, xt_pernet_id); mutex_lock(&xt[af].mutex); - return seq_list_start(&net->xt.tables[af], *pos); + return seq_list_start(&xt_net->tables[af], *pos); } static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + struct xt_pernet *xt_net; - return seq_list_next(v, &net->xt.tables[af], pos); + xt_net = net_generic(net, xt_pernet_id); + + return seq_list_next(v, &xt_net->tables[af], pos); } static void xt_table_seq_stop(struct seq_file *seq, void *v) { - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u_int8_t af = (unsigned long)pde_data(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } @@ -1544,7 +1587,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, [MTTG_TRAV_NFP_UNSPEC] = 
MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) @@ -1571,7 +1614,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, trav->curr = trav->curr->next; if (trav->curr != trav->head) break; - /* fall through */ + fallthrough; default: return NULL; } @@ -1593,7 +1636,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { @@ -1718,6 +1761,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) } EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); +int xt_register_template(const struct xt_table *table, + int (*table_init)(struct net *net)) +{ + int ret = -EEXIST, af = table->af; + struct xt_template *t; + + mutex_lock(&xt[af].mutex); + + list_for_each_entry(t, &xt_templates[af], list) { + if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0)) + goto out_unlock; + } + + ret = -ENOMEM; + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out_unlock; + + BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name)); + + strscpy(t->name, table->name, sizeof(t->name)); + t->table_init = table_init; + t->me = table->me; + list_add(&t->list, &xt_templates[af]); + ret = 0; +out_unlock: + mutex_unlock(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xt_register_template); + +void xt_unregister_template(const struct xt_table *table) +{ + struct xt_template *t; + int af = table->af; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_templates[af], list) { + if (strcmp(table->name, t->name)) + continue; + + list_del(&t->list); + mutex_unlock(&xt[af].mutex); + kfree(t); + return; + } + + mutex_unlock(&xt[af].mutex); + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(xt_unregister_template); + int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS @@ -1735,7 +1830,7 @@ int xt_proto_init(struct net *net, u_int8_t af) root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops, sizeof(struct seq_net_private), @@ -1745,7 +1840,7 @@ int xt_proto_init(struct net *net, u_int8_t af) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_match_seq_ops, sizeof(struct nf_mttg_trav), @@ -1755,7 +1850,7 @@ int xt_proto_init(struct net *net, u_int8_t af) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_target_seq_ops, sizeof(struct nf_mttg_trav), @@ -1770,12 +1865,12 @@ int xt_proto_init(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS out_remove_matches: - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], 
sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out_remove_tables: - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out: @@ -1789,21 +1884,22 @@ void xt_proto_fini(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); remove_proc_entry(buf, net->proc_net); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ } EXPORT_SYMBOL_GPL(xt_proto_fini); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY /** * xt_percpu_counter_alloc - allocate x_tables rule counter * @@ -1858,27 +1954,32 @@ void xt_percpu_counter_free(struct xt_counters *counters) free_percpu((void __percpu *)pcnt); } EXPORT_SYMBOL_GPL(xt_percpu_counter_free); +#endif static int __net_init xt_net_init(struct net *net) { + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) - INIT_LIST_HEAD(&net->xt.tables[i]); + INIT_LIST_HEAD(&xt_net->tables[i]); return 0; } static void __net_exit xt_net_exit(struct net *net) { + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) - WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); } static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, + .id = &xt_pernet_id, + .size = sizeof(struct xt_pernet), }; static int __init xt_init(void) @@ -1886,8 +1987,10 @@ static int __init xt_init(void) unsigned int i; int rv; - for_each_possible_cpu(i) { - seqcount_init(&per_cpu(xt_recseq, i)); + if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) { + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } } xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); @@ -1896,12 +1999,13 @@ static int __init xt_init(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT mutex_init(&xt[i].compat_mutex); xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); + INIT_LIST_HEAD(&xt_templates[i]); } rv = register_pernet_subsys(&xt_net_ops); if (rv < 0) @@ -1917,4 +2021,3 @@ static void __exit xt_fini(void) module_init(xt_init); module_exit(xt_fini); - diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 9cdc16b0d0d8..b6a015aee0ce 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par) const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { - pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n", + pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index c8a639f56168..9d99f5a3d176 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -63,24 +63,37 @@ static int 
checksum_tg_check(const struct xt_tgchk_param *par) return 0; } -static struct xt_target checksum_tg_reg __read_mostly = { - .name = "CHECKSUM", - .family = NFPROTO_UNSPEC, - .target = checksum_tg, - .targetsize = sizeof(struct xt_CHECKSUM_info), - .table = "mangle", - .checkentry = checksum_tg_check, - .me = THIS_MODULE, +static struct xt_target checksum_tg_reg[] __read_mostly = { + { + .name = "CHECKSUM", + .family = NFPROTO_IPV4, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CHECKSUM", + .family = NFPROTO_IPV6, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#endif }; static int __init checksum_tg_init(void) { - return xt_register_target(&checksum_tg_reg); + return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } static void __exit checksum_tg_exit(void) { - xt_unregister_target(&checksum_tg_reg); + xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } module_init(checksum_tg_init); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 0accac98dea7..0ae8d8a1216e 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = { { .name = "CLASSIFY", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING), .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, @@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init classify_tg_init(void) diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..1494b3ee30e1 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. 
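/*
 * Illustrative userspace sketch, not taken from this diff: the
 * register-an-array pattern that CHECKSUM, CLASSIFY and the other targets
 * around here switch to.  Every entry of a static per-family array is
 * registered, and a failure unwinds the entries that were already
 * registered; in the kernel, xt_register_targets()/xt_unregister_targets()
 * provide this behaviour.  All names below are invented for the example.
 */
#include <stdio.h>

struct tgt { const char *name; int family; };

static int register_one(const struct tgt *t)
{
	printf("register %s for family %d\n", t->name, t->family);
	return 0;	/* pretend registration always succeeds */
}

static void unregister_one(const struct tgt *t)
{
	printf("unregister %s for family %d\n", t->name, t->family);
}

static int register_many(const struct tgt *tab, unsigned int n)
{
	unsigned int i;
	int err;

	for (i = 0; i < n; i++) {
		err = register_one(&tab[i]);
		if (err) {
			while (i-- > 0)
				unregister_one(&tab[i]);
			return err;
		}
	}
	return 0;
}

int main(void)
{
	static const struct tgt checksum_like[] = {
		{ "CHECKSUM", 2 },	/* NFPROTO_IPV4 */
		{ "CHECKSUM", 10 },	/* NFPROTO_IPV6 */
	};

	return register_many(checksum_like,
			     sizeof(checksum_like) / sizeof(checksum_like[0]));
}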
* * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> @@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connsecmark_tg_reg __read_mostly = { - .name = "CONNSECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, +static struct xt_target connsecmark_tg_reg[] __read_mostly = { + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init connsecmark_tg_init(void) { - return xt_register_target(&connsecmark_tg_reg); + return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } static void __exit connsecmark_tg_exit(void) { - xt_unregister_target(&connsecmark_tg_reg); + xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } module_init(connsecmark_tg_init); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index d4deee39158b..3ba94c34297c 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) return XT_CONTINUE; if (ct) { - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } else { nf_ct_set(skb, ct, IP_CT_UNTRACKED); @@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return -ENOMEM; } - help->helper = helper; + rcu_assign_pointer(help->helper, helper); return 0; } @@ -136,6 +136,21 @@ static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) } } +static void xt_ct_put_helper(struct nf_conn_help *help) +{ + struct nf_conntrack_helper *helper; + + if (!help) + return; + + /* not yet exposed to other cpus, or ruleset + * already detached (post-replacement). 
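/*
 * Illustrative userspace sketch, not taken from this diff: the kind of
 * sanity checking a refcount type adds over a bare atomic counter, which is
 * one reason the template conntrack reference above is taken with
 * refcount_inc() rather than atomic_inc().  This toy version is not atomic
 * and exists only to show the semantics; names are invented.
 */
#include <assert.h>
#include <stdio.h>

struct toy_refcount { unsigned int refs; };

static void toy_ref_get(struct toy_refcount *r)
{
	assert(r->refs != 0);	/* taking a reference on a dead object is a bug */
	r->refs++;
}

static int toy_ref_put(struct toy_refcount *r)
{
	assert(r->refs != 0);	/* underflow would mean use-after-free */
	return --r->refs == 0;	/* non-zero: caller must free the object */
}

int main(void)
{
	struct toy_refcount use = { .refs = 1 };

	toy_ref_get(&use);
	printf("after get: %u references\n", use.refs);	/* 2 */
	toy_ref_put(&use);
	if (toy_ref_put(&use))
		printf("last reference dropped\n");
	return 0;
}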
+ */ + helper = rcu_dereference_raw(help->helper); + if (helper) + nf_conntrack_helper_put(helper); +} + static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { @@ -172,7 +187,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err2; } - ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, GFP_KERNEL)) { @@ -202,15 +216,13 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; err4: help = nfct_help(ct); - if (help) - nf_conntrack_helper_put(help->helper); + xt_ct_put_helper(help); err3: nf_ct_tmpl_free(ct); err2: @@ -272,8 +284,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, if (ct) { help = nfct_help(ct); - if (help) - nf_conntrack_helper_put(help->helper); + xt_ct_put_helper(help); nf_ct_netns_put(par->net, par->family); @@ -302,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) xt_ct_tg_destroy(par, par->targinfo); } +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->_nfct != 0) + return XT_CONTINUE; + + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return XT_CONTINUE; +} + static struct xt_target xt_ct_tg_reg[] __read_mostly = { { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV4, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .targetsize = sizeof(struct xt_ct_target_info), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, @@ -316,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -328,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -338,60 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .table = "raw", .me = THIS_MODULE, }, -}; - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - /* Previously seen (loopback)? Ignore. 
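/*
 * Illustrative userspace sketch, not taken from this diff: the goto-unwind
 * error handling style used by xt_ct_tg_check() above, where each failure
 * jumps to a label that releases exactly what had already been acquired
 * (err4 puts the helper, err3 frees the conntrack template, and so on).
 * The resources and names below are invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>

static int acquire_two(void)
{
	char *tmpl, *helper;
	int err = -12;			/* -ENOMEM */

	tmpl = malloc(32);
	if (!tmpl)
		goto err1;

	helper = malloc(32);
	if (!helper)
		goto err2;

	printf("template and helper acquired\n");
	free(helper);
	free(tmpl);
	return 0;

err2:
	free(tmpl);
err1:
	return err;
}

int main(void)
{
	return acquire_two() ? 1 : 0;
}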
*/ - if (skb->_nfct != 0) - return XT_CONTINUE; - - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - - return XT_CONTINUE; -} - -static int notrack_chk(const struct xt_tgchk_param *par) -{ - if (!par->net->xt.notrack_deprecated_warning) { - pr_info("netfilter: NOTRACK target is deprecated, " - "use CT instead or upgrade iptables\n"); - par->net->xt.notrack_deprecated_warning = true; - } - return 0; -} - -static struct xt_target notrack_tg_reg __read_mostly = { - .name = "NOTRACK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = notrack_chk, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV6, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +#endif }; static int __init xt_ct_tg_init(void) { - int ret; - - ret = xt_register_target(¬rack_tg_reg); - if (ret < 0) - return ret; - - ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - if (ret < 0) { - xt_unregister_target(¬rack_tg_reg); - return ret; - } - return 0; + return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); } static void __exit xt_ct_tg_exit(void) { xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - xt_unregister_target(¬rack_tg_reg); } module_init(xt_ct_tg_init); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index eababc354ff1..cfa44515ab72 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -24,6 +24,8 @@ MODULE_ALIAS("ip6t_DSCP"); MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); +#define XT_DSCP_ECN_MASK 3u + static unsigned int dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -34,8 +36,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; - ipv4_change_dsfield(ip_hdr(skb), - (__force __u8)(~XT_DSCP_MASK), + ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } @@ -52,8 +53,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; - ipv6_change_dsfield(ipv6_hdr(skb), - (__force __u8)(~XT_DSCP_MASK), + ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 713fb38541df..8928ec56c388 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, return 0; /* follow-up fragments don't 
contain ports, skip all fragments */ - if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + if (ip_is_fragment(ip)) return 0; hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 7b2f359bfce4..d73957592c9d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -85,9 +85,9 @@ static ssize_t idletimer_tg_show(struct device *dev, mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) - return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff); + return sysfs_emit(buf, "%ld\n", time_diff); - return snprintf(buf, PAGE_SIZE, "0\n"); + return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) @@ -100,21 +100,19 @@ static void idletimer_tg_work(struct work_struct *work) static void idletimer_tg_expired(struct timer_list *t) { - struct idletimer_tg *timer = from_timer(timer, t, timer); + struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } -static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, - ktime_t now) +static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); - return ALARMTIMER_NORESTART; } static int idletimer_check_sysfs_name(const char *name, unsigned int size) @@ -137,7 +135,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; @@ -170,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -231,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return 0; @@ -256,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -277,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -322,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -384,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -409,21 
+407,23 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - del_timer_sync(&info->timer->timer); - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + timer_shutdown_sync(&info->timer->timer); + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) @@ -434,52 +434,75 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - if (info->timer->timer_type & XT_IDLETIMER_ALARM) { - alarm_cancel(&info->timer->alarm); - } else { - del_timer_sync(&info->timer->timer); - } - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + alarm_cancel(&info->timer->alarm); + } else { + timer_shutdown_sync(&info->timer->timer); + } + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .target = idletimer_tg_target, - .targetsize = sizeof(struct idletimer_tg_info), - .usersize = offsetof(struct idletimer_tg_info, timer), - .checkentry = idletimer_tg_checkentry, - .destroy = idletimer_tg_destroy, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, }, { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .revision = 1, - .target = idletimer_tg_target_v1, - .targetsize = sizeof(struct idletimer_tg_info_v1), - .usersize = offsetof(struct idletimer_tg_info_v1, timer), - .checkentry = idletimer_tg_checkentry_v1, - .destroy = idletimer_tg_destroy_v1, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, }, - - +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .target = idletimer_tg_target, + 
.targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, +#endif }; static struct class *idletimer_tg_class; @@ -490,7 +513,7 @@ static int __init idletimer_tg_init(void) { int err; - idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer"); + idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 0371c387b0d1..90dcf088071a 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -43,7 +43,6 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_led_info *ledinfo = par->targinfo; struct xt_led_info_internal *ledinternal = ledinfo->internal_data; - unsigned long led_delay = XT_LED_BLINK_DELAY; /* * If "always blink" is enabled, and there's still some time until the @@ -52,7 +51,7 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) if ((ledinfo->delay > 0) && ledinfo->always_blink && timer_pending(&ledinternal->timer)) led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger, - &led_delay, &led_delay, 1); + XT_LED_BLINK_DELAY, XT_LED_BLINK_DELAY, 1); else led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); @@ -73,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) static void led_timeout_callback(struct timer_list *t) { - struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t, - timer); + struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal, + t, + timer); led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); } @@ -97,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par) struct xt_led_info_internal *ledinternal; int err; - if (ledinfo->id[0] == '\0') + /* Bail out if empty string or not a string at all. 
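/*
 * Illustrative userspace sketch, not taken from this diff: validating that a
 * fixed-size buffer supplied by an untrusted source holds a non-empty,
 * NUL-terminated string, as the xt_LED id check near this point does with
 * memchr().  The buffer size below is a placeholder, not the UAPI value.
 */
#include <stdio.h>
#include <string.h>

#define ID_LEN 27

static int id_is_valid(const char *id)
{
	return id[0] != '\0' && memchr(id, '\0', ID_LEN) != NULL;
}

int main(void)
{
	char good[ID_LEN] = "netfilter-led";
	char bad[ID_LEN];

	memset(bad, 'A', sizeof(bad));		/* no terminator anywhere */
	printf("good: %d, bad: %d\n", id_is_valid(good), id_is_valid(bad));
	return 0;				/* prints "good: 1, bad: 0" */
}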
*/ + if (ledinfo->id[0] == '\0' || + !memchr(ledinfo->id, '\0', sizeof(ledinfo->id))) return -EINVAL; mutex_lock(&xt_led_mutex); @@ -166,7 +168,7 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) list_del(&ledinternal->list); - del_timer_sync(&ledinternal->timer); + timer_shutdown_sync(&ledinternal->timer); led_trigger_unregister(&ledinternal->netfilter_led_trigger); @@ -176,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) kfree(ledinternal); } -static struct xt_target led_tg_reg __read_mostly = { - .name = "LED", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = led_tg, - .targetsize = sizeof(struct xt_led_info), - .usersize = offsetof(struct xt_led_info, internal_data), - .checkentry = led_tg_check, - .destroy = led_tg_destroy, - .me = THIS_MODULE, +static struct xt_target led_tg_reg[] __read_mostly = { + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV4, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV6, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init led_tg_init(void) { - return xt_register_target(&led_tg_reg); + return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } static void __exit led_tg_exit(void) { - xt_unregister_target(&led_tg_reg); + xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } module_init(led_tg_init); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index a1e79b517c01..f39244f9c0ed 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; + int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; @@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nf_log_syslog"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + } + + return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) @@ -108,3 +116,4 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); MODULE_ALIAS("ipt_LOG"); MODULE_ALIAS("ip6t_LOG"); +MODULE_SOFTDEP("pre: nf_log_syslog"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 6e83ce3000db..6dcf4bc7e30b 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; + int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", 
"nfnetlink_log"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + } + + return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) @@ -56,26 +64,41 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par) nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } -static struct xt_target nflog_tg_reg __read_mostly = { - .name = "NFLOG", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = nflog_tg_check, - .destroy = nflog_tg_destroy, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, +static struct xt_target nflog_tg_reg[] __read_mostly = { + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#endif }; static int __init nflog_tg_init(void) { - return xt_register_target(&nflog_tg_reg); + return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } static void __exit nflog_tg_exit(void) { - xt_unregister_target(&nflog_tg_reg); + xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } module_init(nflog_tg_init); module_exit(nflog_tg_exit); +MODULE_SOFTDEP("pre: nfnetlink_log"); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..4f49cfc27831 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -94,11 +94,11 @@ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic_packed *stats = &info->est->bstats; + struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); - stats->bytes += skb->len; - stats->packets++; + u64_stats_add(&stats->bytes, skb->len); + u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); @@ -140,7 +143,8 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if (!est) goto err1; - strlcpy(est->name, info->name, sizeof(est->name)); + gnet_stats_basic_sync_init(&est->bstats); + strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; @@ -175,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) xt_rateest_put(par->net, info->est); } -static struct xt_target xt_rateest_tg_reg __read_mostly = { - .name = "RATEEST", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct xt_rateest_target_info), - .usersize = offsetof(struct xt_rateest_target_info, est), - .me = THIS_MODULE, +static struct xt_target xt_rateest_tg_reg[] __read_mostly = { + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV4, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = 
xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV6, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#endif }; static __net_init int xt_rateest_net_init(struct net *net) @@ -210,12 +229,12 @@ static int __init xt_rateest_tg_init(void) if (err) return err; - return xt_register_target(&xt_rateest_tg_reg); + return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { - xt_unregister_target(&xt_rateest_tg_reg); + xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 353ca7801251..ff66b56a3f97 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -/* FIXME: Take multiple ranges --RR */ static int redirect_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; @@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par)); + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range2 range = { + .flags = mr->range[0].flags, + .min_proto = mr->range[0].min, + .max_proto = mr->range[0].max, + }; + + return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); } static struct xt_target redirect_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 75625d13e976..5bc5ea505eb9 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int -secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info) { u32 secmark = 0; - const struct xt_secmark_target_info *info = par->targinfo; switch (mode) { case SECMARK_MODE_SEL: @@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int checkentry_lsm(struct xt_secmark_target_info *info) +static int checkentry_lsm(struct xt_secmark_target_info_v1 *info) { int err; @@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info) return 0; } -static int secmark_tg_check(const struct xt_tgchk_param *par) +static int +secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info) { - struct xt_secmark_target_info *info = par->targinfo; int err; - if (strcmp(par->table, "mangle") != 0 && - strcmp(par->table, "security") != 0) { + if (strcmp(table, "mangle") != 0 && + strcmp(table, "security") != 0) { pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", - par->table); + table); return -EINVAL; } @@ -116,25 +115,99 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par) } } -static struct xt_target 
secmark_tg_reg __read_mostly = { - .name = "SECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, - .target = secmark_tg, - .targetsize = sizeof(struct xt_secmark_target_info), - .me = THIS_MODULE, +static int secmark_tg_check_v0(const struct xt_tgchk_param *par) +{ + struct xt_secmark_target_info *info = par->targinfo; + struct xt_secmark_target_info_v1 newinfo = { + .mode = info->mode, + }; + int ret; + + memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX); + + ret = secmark_tg_check(par->table, &newinfo); + info->secid = newinfo.secid; + + return ret; +} + +static unsigned int +secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_secmark_target_info *info = par->targinfo; + struct xt_secmark_target_info_v1 newinfo = { + .secid = info->secid, + }; + + return secmark_tg(skb, &newinfo); +} + +static int secmark_tg_check_v1(const struct xt_tgchk_param *par) +{ + return secmark_tg_check(par->table, par->targinfo); +} + +static unsigned int +secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + return secmark_tg(skb, par->targinfo); +} + +static struct xt_target secmark_tg_reg[] __read_mostly = { + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, +#endif }; static int __init secmark_tg_init(void) { - return xt_register_target(&secmark_tg_reg); + return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } static void __exit secmark_tg_exit(void) { - xt_unregister_target(&secmark_tg_reg); + xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } module_init(secmark_tg_init); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 122db9fbb9f4..116a885adb3c 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -239,8 +239,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) oldlen = ipv6h->payload_len; newlen = htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(csum_sub(skb->csum, oldlen), - newlen); + skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), + (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ 
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 194dc03341f3..e4bea1d346cf 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -74,18 +74,10 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~mark_mask) ^ mark_value; - - pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", - iph->protocol, &iph->daddr, ntohs(hp->dest), - &laddr, ntohs(lport), skb->mark); - nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } - pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", - iph->protocol, &iph->saddr, ntohs(hp->source), - &iph->daddr, ntohs(hp->dest), skb->mark); return NF_DROP; } @@ -122,16 +114,12 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) int tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) { - pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + if (tproto < 0) return NF_DROP; - } hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); - if (hp == NULL) { - pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n"); + if (!hp) return NF_DROP; - } /* check if there's an ongoing connection on the packet * addresses, this happens if the redirect already happened @@ -168,19 +156,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; - - pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", - tproto, &iph->saddr, ntohs(hp->source), - laddr, ntohs(lport), skb->mark); - nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } - pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", - tproto, &iph->saddr, ntohs(hp->source), - &iph->daddr, ntohs(hp->dest), skb->mark); - return NF_DROP; } @@ -200,6 +179,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } + +static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv6_disable(par->net); +} #endif static int tproxy_tg4_check(const struct xt_tgchk_param *par) @@ -219,6 +203,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par) return -EINVAL; } +static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv4_disable(par->net); +} + static struct xt_target tproxy_tg_reg[] __read_mostly = { { .name = "TPROXY", @@ -228,6 +217,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 0, .targetsize = sizeof(struct xt_tproxy_target_info), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ 
-239,6 +229,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ -251,6 +242,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg6_check, + .destroy = tproxy_tg6_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 349ab5609b1b..a642ff09fc8e 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -29,26 +29,41 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .checkentry = trace_tg_check, - .destroy = trace_tg_destroy, - .me = THIS_MODULE, +static struct xt_target trace_tg_reg[] __read_mostly = { + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV4, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV6, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init trace_tg_init(void) { - return xt_register_target(&trace_tg_reg); + return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } static void __exit trace_tg_exit(void) { - xt_unregister_target(&trace_tg_reg); + xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } module_init(trace_tg_init); module_exit(trace_tg_exit); +MODULE_SOFTDEP("pre: nf_log_syslog"); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e9b2181e8c42..a77088943107 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { }, { .name = "addrtype", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "addrtype", + .family = NFPROTO_IPV6, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + }, +#endif }; static int __init addrtype_mt_init(void) diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c6..c437fbd59ec1 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define 
NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index a047a545371e..908fd5f2c3c8 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_match xt_cluster_match __read_mostly = { - .name = "cluster", - .family = NFPROTO_UNSPEC, - .match = xt_cluster_mt, - .checkentry = xt_cluster_mt_checkentry, - .matchsize = sizeof(struct xt_cluster_match_info), - .destroy = xt_cluster_mt_destroy, - .me = THIS_MODULE, +static struct xt_match xt_cluster_match[] __read_mostly = { + { + .name = "cluster", + .family = NFPROTO_IPV4, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "cluster", + .family = NFPROTO_IPV6, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct 
xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init xt_cluster_mt_init(void) { - return xt_register_match(&xt_cluster_match); + return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } static void __exit xt_cluster_mt_fini(void) { - xt_unregister_match(&xt_cluster_match); + xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 93cb018c3055..2aabdcea8707 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) + if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); + return ret; + } /* * This filter cannot function correctly unless connection tracking diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 46fcac75f726..848287ab79cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - struct nf_conntrack_tuple tuple; - const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) u32 key[5]; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if (ct) zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - xt_family(par), net, &tuple)) { - goto hotdrop; - } if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -62,17 +55,16 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[4] = zone->id; } else { const struct iphdr *iph = ip_hdr(skb); - key[0] = (info->flags & XT_CONNLIMIT_DADDR) ? - iph->daddr : iph->saddr; - key[0] &= info->mask.ip; + key[0] = (info->flags & XT_CONNLIMIT_DADDR) ? 
+ (__force __u32)iph->daddr : (__force __u32)iph->saddr; + key[0] &= (__force __u32)info->mask.ip; key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, tuple_ptr, - zone); + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) - /* kmalloc failed, drop it entirely */ + /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); @@ -86,6 +78,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; unsigned int keylen; + int ret; keylen = sizeof(u32); if (par->family == NFPROTO_IPV6) @@ -93,8 +86,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) else keylen += sizeof(struct in_addr); + ret = nf_ct_netns_get(par->net, par->family); + if (ret < 0) { + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + /* init private data */ - info->data = nf_conncount_init(par->net, par->family, keylen); + info->data = nf_conncount_init(par->net, keylen); + if (IS_ERR(info->data)) + nf_ct_netns_put(par->net, par->family); return PTR_ERR_OR_ZERO(info->data); } @@ -103,29 +105,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; - nf_conncount_destroy(par->net, par->family, info->data); + nf_conncount_destroy(par->net, info->data); + nf_ct_netns_put(par->net, par->family); } -static struct xt_match connlimit_mt_reg __read_mostly = { - .name = "connlimit", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .usersize = offsetof(struct xt_connlimit_info, data), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { - xt_unregister_match(&connlimit_mt_reg); + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..4277084de2e7 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt <jengelh@medozas.de> @@ -30,6 +30,7 @@ 
connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; + u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) @@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) switch (info->mode) { case XT_CONNMARK_SET: - newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + oldmark = READ_ONCE(ct->mark); + newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == D_SHIFT_RIGHT) newmark >>= info->shift_bits; else newmark <<= info->shift_bits; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; @@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) else new_targetmark <<= info->shift_bits; - newmark = (ct->mark & ~info->ctmask) ^ + newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: - new_targetmark = (ct->mark & info->ctmask); + new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else @@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) @@ -149,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), @@ -159,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 2, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static struct xt_match connmark_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9c5cfd74a0ee..3b507694e81e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -15,7 +15,6 @@ #include <linux/random.h> #include <linux/jhash.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/list.h> @@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, if (size < 16) size = 16; } - /* FIXME: don't use vmalloc() here or anywhere else -HW */ - hinfo = vmalloc(struct_size(hinfo, hash, size)); 
+ hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; @@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) { - vfree(hinfo); + kvfree(hinfo); return ret; } @@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, hinfo->rnd_initialized = false; hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); @@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, ops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } hinfo->net = net; @@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select unsigned int i; for (i = 0; i < ht->cfg.size; i++) { + struct hlist_head *head = &ht->hash[i]; struct dsthash_ent *dh; struct hlist_node *n; + if (hlist_empty(head)) + continue; + spin_lock_bh(&ht->lock); - hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { + hlist_for_each_entry_safe(dh, n, head, node) { if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } @@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) cancel_delayed_work_sync(&hinfo->gc_work); htable_selective_cleanup(hinfo, true); kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); } } @@ -1052,7 +1054,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -1069,7 +1071,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); @@ -1083,7 +1085,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) @@ -1125,7 +1127,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1140,7 +1142,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1155,7 +1157,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file 
*s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1169,7 +1171,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v2(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; @@ -1183,7 +1185,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v) static int dl_seq_show_v1(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; @@ -1197,7 +1199,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 1873da3a945a..ca730cedb5d4 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,7 +21,7 @@ static bool length_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + u32 pktlen = skb_ip_totlen(skb); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } @@ -30,8 +30,7 @@ static bool length_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr); + u32 pktlen = skb->len; return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index bd1dea9c7b88..8b4fd27857f2 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -8,16 +8,14 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> struct xt_limit_priv { - spinlock_t lock; unsigned long prev; - uint32_t credit; + u32 credit; }; MODULE_LICENSE("GPL"); @@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv = r->master; - unsigned long now = jiffies; - - spin_lock_bh(&priv->lock); - priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; - if (priv->credit > r->credit_cap) - priv->credit = r->credit_cap; - - if (priv->credit >= r->cost) { - /* We're not limited. 
*/ - priv->credit -= r->cost; - spin_unlock_bh(&priv->lock); - return true; - } - - spin_unlock_bh(&priv->lock); - return false; + unsigned long now; + u32 old_credit, new_credit, credit_increase = 0; + bool ret; + + /* fastpath if there is nothing to update */ + if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies)) + return false; + + do { + now = jiffies; + credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + old_credit = READ_ONCE(priv->credit); + new_credit = old_credit; + new_credit += credit_increase; + if (new_credit > r->credit_cap) + new_credit = r->credit_cap; + if (new_credit >= r->cost) { + ret = true; + new_credit -= r->cost; + } else { + ret = false; + } + } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit); + + return ret; } /* Precision saver. */ @@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par) r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } - spin_lock_init(&priv->lock); return 0; } @@ -134,7 +140,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par) kfree(info->master); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_xt_rateinfo { u_int32_t avg; u_int32_t burst; @@ -176,7 +182,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src) }; return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; } -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_match limit_mt_reg __read_mostly = { .name = "limit", @@ -186,7 +192,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .checkentry = limit_mt_check, .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 1ad74b5920b5..59b9d04400ca 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static struct xt_target mark_tg_reg __read_mostly = { - .name = "MARK", - .revision = 2, - .family = NFPROTO_UNSPEC, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, +static struct xt_target mark_tg_reg[] __read_mostly = { + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV4, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_ARP, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV6, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif }; static struct xt_match mark_mt_reg __read_mostly = { @@ -61,12 +83,12 @@ static int __init mark_mt_init(void) { int ret; - ret = xt_register_target(&mark_tg_reg); + ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&mark_mt_reg); if (ret < 0) { - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); return 
ret; } return 0; @@ -75,7 +97,7 @@ static int __init mark_mt_init(void) static void __exit mark_mt_exit(void) { xt_unregister_match(&mark_mt_reg); - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } module_init(mark_mt_init); diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index a8e5f6c8db7a..b4f7bbc3f3ca 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -244,3 +244,4 @@ MODULE_ALIAS("ipt_SNAT"); MODULE_ALIAS("ipt_DNAT"); MODULE_ALIAS("ip6t_SNAT"); MODULE_ALIAS("ip6t_DNAT"); +MODULE_DESCRIPTION("SNAT and DNAT targets support"); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..0ca1cdfc4095 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) overquota = nfnl_acct_overquota(xt_net(par), info->nfacct); - return overquota == NFACCT_UNDERQUOTA ? false : true; + return overquota != NFACCT_UNDERQUOTA; } static int @@ -38,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { - pr_info_ratelimited("accounting object `%s' does not exists\n", - info->name); + pr_info_ratelimited("accounting object `%.*s' does not exist\n", + NFACCT_NAME_MAX, info->name); return -ENOENT; } info->nfacct = nfacct; diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index e85ce69924ae..50332888c8d2 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = sk->sk_socket->file; - if (filp == NULL) + read_lock_bh(&sk->sk_callback_lock); + filp = sk->sk_socket ? 
sk->sk_socket->file : NULL; + if (filp == NULL) { + read_unlock_bh(&sk->sk_callback_lock); return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; + } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ - !(info->invert & XT_OWNER_UID)) + !(info->invert & XT_OWNER_UID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } if (info->match & XT_OWNER_GID) { @@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } } - if (match ^ !(info->invert & XT_OWNER_GID)) + if (match ^ !(info->invert & XT_OWNER_GID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } + read_unlock_bh(&sk->sk_callback_lock); return true; } diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index ec6ed6fda96c..343e65f377d4 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED))) return false; - physdev = nf_bridge_get_physindev(skb); + physdev = nf_bridge_get_physindev(skb, xt_net(par)); indev = physdev ? physdev->name : NULL; if ((info->bitmask & XT_PHYSDEV_OP_ISIN && diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 19bef176145e..588a5e6ad899 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -59,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil /* retained for backwards compatibility */ static unsigned int ip_pkt_list_tot __read_mostly; module_param(ip_pkt_list_tot, uint, 0400); -MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)"); -#define XT_RECENT_MAX_NSTAMPS 256 +#define XT_RECENT_MAX_NSTAMPS 65536 struct recent_entry { struct list_head list; @@ -69,7 +69,7 @@ struct recent_entry { union nf_inet_addr addr; u_int16_t family; u_int8_t ttl; - u_int8_t index; + u_int16_t index; u_int16_t nstamps; unsigned long stamps[]; }; @@ -80,7 +80,7 @@ struct recent_table { union nf_inet_addr mask; unsigned int refcnt; unsigned int entries; - u8 nstamps_max_mask; + u_int16_t nstamps_max_mask; struct list_head lru_list; struct list_head iphash[]; }; @@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) /* * Drop entries with timestamps older then 'time'. */ -static void recent_entry_reap(struct recent_table *t, unsigned long time) +static void recent_entry_reap(struct recent_table *t, unsigned long time, + struct recent_entry *working, bool update) { struct recent_entry *e; @@ -162,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time) e = list_entry(t->lru_list.next, struct recent_entry, lru_list); /* + * Do not reap the entry which are going to be updated. + */ + if (e == working && update) + return; + + /* * The last time stamp is the most recent. 
*/ if (time_after(time, e->stamps[e->index-1])) @@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) - recent_entry_reap(t, time); + recent_entry_reap(t, time, e, + info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || @@ -543,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file) if (st == NULL) return -ENOMEM; - st->table = PDE_DATA(inode); + st->table = pde_data(inode); return 0; } @@ -551,9 +559,9 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - struct recent_table *t = PDE_DATA(file_inode(file)); + struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; - char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; const char *c = buf; union nf_inet_addr addr = {}; u_int16_t family; @@ -640,7 +648,7 @@ static void __net_exit recent_proc_net_exit(struct net *net) struct recent_table *t; /* recent_net_exit() is called before recent_mt_destroy(). Make sure - * that the parent xt_recent proc entry is is empty before trying to + * that the parent xt_recent proc entry is empty before trying to * remove it. */ spin_lock_bh(&recent_lock); diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 68ccbe50bb1e..600060ca940a 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy(tbl->repl.name, info->name); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 680015ba7cb6..b46a6a512058 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -4,7 +4,6 @@ #include <linux/skbuff.h> #include <net/ip.h> #include <net/ipv6.h> -#include <net/sctp/sctp.h> #include <linux/sctp.h> #include <linux/netfilter/x_tables.h> @@ -150,6 +149,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par) { const struct xt_sctp_info *info = par->matchinfo; + if (info->flag_count > ARRAY_SIZE(info->flag_info)) + return -EINVAL; if (info->flags & ~XT_SCTP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_SCTP_VALID_FLAGS) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5f973987265d..76e01f292aaf 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); @@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); @@ -216,6 +216,16 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par) return 0; } +static void socket_mt_destroy(const struct xt_mtdtor_param *par) +{ + if (par->family == NFPROTO_IPV4) + nf_defrag_ipv4_disable(par->net); +#if 
IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + else if (par->family == NFPROTO_IPV6) + nf_defrag_ipv6_disable(par->net); +#endif +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -231,6 +241,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .revision = 1, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, + .destroy = socket_mt_destroy, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -245,6 +256,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), + .destroy = socket_mt_destroy, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, @@ -256,6 +268,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -268,6 +281,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -280,6 +294,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -292,6 +307,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 203e24ae472c..b26c1dcfc27b 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) + if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 11ec2abf0c72..e8991130a3de 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> +#include <linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> @@ -20,6 +21,8 @@ MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); +MODULE_ALIAS("ipt_icmp"); +MODULE_ALIAS("ip6t_icmp6"); /* Returns 1 if the port is matched by the range, 0 otherwise */ static inline bool @@ -161,6 +164,95 @@ static int udp_mt_check(const struct xt_mtchk_param *par) return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? 
-EINVAL : 0; } +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code) +{ + return type == test_type && code >= min_code && code <= max_code; +} + +static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code, bool invert) +{ + return (test_type == 0xFF || + type_code_in_range(test_type, min_code, max_code, type, code)) + ^ invert; +} + +static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code, bool invert) +{ + return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; +} + +static bool +icmp_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmphdr *ic; + struct icmphdr _icmph; + const struct ipt_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (!ic) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + par->hotdrop = true; + return false; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags & IPT_ICMP_INV)); +} + +static bool +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmp6hdr *ic; + struct icmp6hdr _icmph; + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (!ic) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + par->hotdrop = true; + return false; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags & IP6T_ICMP_INV)); +} + +static int icmp_checkentry(const struct xt_mtchk_param *par) +{ + const struct ipt_icmp *icmpinfo = par->matchinfo; + + return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; +} + +static int icmp6_checkentry(const struct xt_mtchk_param *par) +{ + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? 
-EINVAL : 0; +} + static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", @@ -216,6 +308,24 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, + { + .name = "icmp", + .match = icmp_match, + .matchsize = sizeof(struct ipt_icmp), + .checkentry = icmp_checkentry, + .proto = IPPROTO_ICMP, + .family = NFPROTO_IPV4, + .me = THIS_MODULE, + }, + { + .name = "icmp6", + .match = icmp6_match, + .matchsize = sizeof(struct ip6t_icmp), + .checkentry = icmp6_checkentry, + .proto = IPPROTO_ICMPV6, + .family = NFPROTO_IPV6, + .me = THIS_MODULE, + }, }; static int __init tcpudp_mt_init(void) diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index 177b40d08098..117d4615d668 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -96,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret ^ data->invert; } +static int u32_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_u32 *data = par->matchinfo; + const struct xt_u32_test *ct; + unsigned int i; + + if (data->ntests > ARRAY_SIZE(data->tests)) + return -EINVAL; + + for (i = 0; i < data->ntests; ++i) { + ct = &data->tests[i]; + + if (ct->nnums > ARRAY_SIZE(ct->location) || + ct->nvalues > ARRAY_SIZE(ct->value)) + return -EINVAL; + } + + return 0; +} + static struct xt_match xt_u32_mt_reg __read_mostly = { .name = "u32", .revision = 0, .family = NFPROTO_UNSPEC, .match = u32_mt, + .checkentry = u32_mt_checkentry, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }; |
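Editor's note: the xt_limit hunk above drops the per-rule spinlock and recomputes the credit bucket in a cmpxchg retry loop. The following is a minimal, self-contained userspace sketch of that same pattern, not the kernel code itself; the struct layout, field names and the now_ticks() clock stub are hypothetical, and r->prev is assumed to be initialized to the current tick before first use.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct ratelimit {
            _Atomic uint32_t credit;       /* tokens currently available */
            _Atomic uint64_t prev;         /* tick of the last refill */
            uint32_t credit_cap;           /* bucket size */
            uint32_t cost;                 /* tokens consumed per accepted event */
            uint32_t credits_per_tick;     /* refill rate */
    };

    extern uint64_t now_ticks(void);       /* hypothetical clock source */

    static bool ratelimit_allow(struct ratelimit *r)
    {
            uint32_t old_credit, new_credit, gained = 0;
            uint64_t now;
            bool ok;

            do {
                    /* Refill from elapsed ticks; atomic_exchange mirrors xchg(). */
                    now = now_ticks();
                    gained += (uint32_t)(now - atomic_exchange(&r->prev, now)) *
                              r->credits_per_tick;

                    old_credit = atomic_load(&r->credit);
                    new_credit = old_credit + gained;
                    if (new_credit > r->credit_cap)
                            new_credit = r->credit_cap;

                    ok = new_credit >= r->cost;
                    if (ok)
                            new_credit -= r->cost;

                    /* Publish only if nobody raced with us; otherwise retry. */
            } while (!atomic_compare_exchange_weak(&r->credit, &old_credit,
                                                   new_credit));

            return ok;
    }

The retry loop keeps accumulating the refill across failed attempts (gained +=), so a lost race never discards credit, which is the property the kernel change relies on to stay fair without a lock.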

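Editor's note: the xt_connmark and xt_socket hunks wrap every access to shared fields such as ct->mark and sk->sk_mark in READ_ONCE()/WRITE_ONCE(). A hedged sketch of that access pattern is below; the macro definitions are the usual volatile-cast approximation used outside the kernel, not the kernel's own implementation, and struct conn is a stand-in type.

    #include <stdint.h>

    #define READ_ONCE(x)     (*(const volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

    struct conn { uint32_t mark; /* read and written from several contexts */ };

    static void set_mark(struct conn *ct, uint32_t ctmark, uint32_t ctmask)
    {
            uint32_t old = READ_ONCE(ct->mark);   /* one stable snapshot, no re-fetch */
            uint32_t new = (old & ~ctmask) ^ ctmark;

            if (old != new)
                    WRITE_ONCE(ct->mark, new);    /* single store, no tearing */
    }

The point of the idiom is that concurrent lockless readers always observe either the old or the new value in full, and the compiler cannot split, duplicate or reorder the plain load/store it replaces.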