Diffstat (limited to 'net/netfilter')
179 files changed, 11262 insertions, 6318 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f71b41c7ce2f..6cdc994fdc8a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_BPF_LINK + def_bool BPF_SYSCALL + config NETFILTER_NETLINK_HOOK tristate "Netfilter base hook dump support" depends on NETFILTER_ADVANCED @@ -189,15 +192,8 @@ config NF_CONNTRACK_LABELS to connection tracking entries. It can be used with xtables connlabel match and the nftables ct expression. -config NF_CT_PROTO_DCCP - bool 'DCCP protocol connection tracking support' - depends on NETFILTER_ADVANCED - default y - help - With this option enabled, the layer 3 independent connection - tracking code will be able to do state tracking on DCCP connections. - - If unsure, say Y. +config NF_CONNTRACK_OVS + bool config NF_CT_PROTO_GRE bool @@ -206,7 +202,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select LIBCRC32C + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -469,7 +465,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK - select LIBCRC32C + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to @@ -510,6 +506,12 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_EXTHDR_DCCP + bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)" + default n + help + This option adds support for matching on DCCP extension headers. + config NFT_FLOW_OFFLOAD depends on NF_CONNTRACK && NF_FLOW_TABLE tristate "Netfilter nf_tables hardware flow offload module" @@ -750,13 +752,22 @@ if NETFILTER_XTABLES config NETFILTER_XTABLES_COMPAT bool "Netfilter Xtables 32bit support" depends on COMPAT - default y help This option provides a translation layer to run 32bit arp,ip(6),ebtables binaries on 64bit kernels. If unsure, say N. +config NETFILTER_XTABLES_LEGACY + bool "Netfilter legacy tables support" + depends on !PREEMPT_RT + help + Say Y here if you still require support for legacy tables. This is + required by the legacy tools (iptables-legacy) and is not needed if + you use iptables over nftables (iptables-nft). + Legacy support is not limited to IP, it also includes EBTABLES and + ARPTABLES. 
+ comment "Xtables combined modules" config NETFILTER_XT_MARK @@ -813,7 +824,7 @@ config NETFILTER_XT_TARGET_AUDIT config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `CHECKSUM' target, which can be used in the iptables mangle @@ -864,7 +875,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_CT tristate '"CT" target support' depends on NF_CONNTRACK - depends on IP_NF_RAW || IP6_NF_RAW + depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT depends on NETFILTER_ADVANCED help This options adds a `CT' target, which allows to specify initial @@ -875,7 +886,7 @@ config NETFILTER_XT_TARGET_CT config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate @@ -891,7 +902,7 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_HL tristate '"HL" hoplimit target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds the "HL" (for IPv6) and "TTL" (for IPv4) @@ -1075,7 +1086,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n select NF_TPROXY_IPV4 @@ -1142,7 +1153,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a "TCPOPTSTRIP" target, which allows you to strip @@ -1175,7 +1186,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes @@ -1273,9 +1284,9 @@ config NETFILTER_XT_MATCH_CPU To compile it as a module, choose M here. If unsure, say N. 
config NETFILTER_XT_MATCH_DCCP - tristate '"dccp" protocol match support' + tristate '"dccp" protocol match support (DEPRECATED)' depends on NETFILTER_ADVANCED - default IP_DCCP + default n help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3754eb06fb41..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -11,7 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o -nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o ifeq ($(CONFIG_NF_CONNTRACK),m) @@ -21,6 +21,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o endif obj-$(CONFIG_NETFILTER) = netfilter.o +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o @@ -98,6 +99,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o endif endif +ifdef CONFIG_NFT_CT +ifdef CONFIG_MITIGATION_RETPOLINE +nf_tables-objs += nft_ct_fast.o +endif +endif + obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o @@ -134,8 +141,14 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ - nf_flow_table_offload.o + nf_flow_table_path.o \ + nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o +ifeq ($(CONFIG_NF_FLOW_TABLE),m) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o +else ifeq ($(CONFIG_NF_FLOW_TABLE),y) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o +endif obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5a6705a0e4ec..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -119,6 +116,18 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, for (i = 0; i < old_entries; i++) { if (orig_ops[i] != &dummy_ops) alloc_entries++; + + /* Restrict BPF hook type to force a unique priority, not + * shared at attach time. + * + * This is mainly to avoid ordering issues between two + * different bpf programs, this doesn't prevent a normal + * hook at same priority as a bpf one (we don't want to + * prevent defrag, conntrack, iptables etc from attaching). 
+ */ + if (reg->priority == orig_ops[i]->priority && + reg->hook_ops_type == NF_HOOK_OP_BPF) + return ERR_PTR(-EBUSY); } } @@ -627,10 +636,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, if (ret == 1) continue; return ret; + case NF_STOLEN: + return NF_DROP_GETERR(verdict); default: - /* Implicit handling for NF_STOLEN, as well as any other - * non conventional verdicts. - */ + WARN_ON_ONCE(1); return 0; } } @@ -643,11 +652,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); int ret; - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); @@ -668,7 +675,16 @@ EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); +const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); + +const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) +u8 nf_ctnetlink_has_listener; +EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); + const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); @@ -696,12 +712,30 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); +void nf_ct_set_closing(struct nf_conntrack *nfct) +{ + const struct nf_ct_hook *ct_hook; + + if (!nfct) + return; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->set_closing(nfct); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_set_closing); + bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { @@ -776,12 +810,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); +err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 3c273483df23..b1ea054bb82c 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP depends on IP_SET help This option adds the bitmap:ip set type support, by which one - can store IPv4 addresses (or network addresse) from a range. + can store IPv4 addresses (or network addresses) from a range. To compile it as a module, choose M here. If unsure, say N. 
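Note on the nf_hook_entries_grow() hunk above: a BPF-link hook is refused with -EBUSY whenever an already-registered hook sits at the same priority, while two ordinary hooks may still share one. A minimal standalone sketch of that collision test, with structure and enum names simplified for illustration (the real types live in include/linux/netfilter.h, and the kernel loop also skips dummy entries):

#include <stdbool.h>
#include <stddef.h>

enum hook_op_type { HOOK_OP_UNSPEC, HOOK_OP_BPF };

struct hook_ops {
	int priority;
	enum hook_op_type hook_ops_type;
};

/* Returns true if @reg may join @ops[0..n): a BPF-link hook must not
 * share its priority with any existing hook, so verdict ordering
 * between two BPF programs is never ambiguous. Ordinary hooks may
 * still collide (defrag, conntrack, iptables keep attaching as before).
 */
static bool priority_is_free(const struct hook_ops *reg,
			     const struct hook_ops *ops, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (reg->hook_ops_type == HOOK_OP_BPF &&
		    reg->priority == ops[i].priority)
			return false;	/* kernel returns ERR_PTR(-EBUSY) */
	}
	return true;
}

The in-kernel variant performs this check inline while counting entries and returns ERR_PTR(-EBUSY) from nf_hook_entries_grow() rather than a boolean.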
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 26ab0e9612d8..798c7993635e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -4,6 +4,8 @@ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H +#include <linux/rcupdate_wait.h> + #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) @@ -28,6 +30,7 @@ #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) @@ -57,9 +60,6 @@ mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); @@ -264,7 +264,7 @@ out: static void mtype_gc(struct timer_list *t) { - struct mtype *map = from_timer(map, t, gc); + struct mtype *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; @@ -288,6 +288,15 @@ mtype_gc(struct timer_list *t) add_timer(&map->gc); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_delete_sync(&map->gc); +} + static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, @@ -301,6 +310,7 @@ static const struct ip_set_type_variant mtype = { .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e4fa00abde6a..5988b9bb9029 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) { + if (ip > ip_to) swap(ip, ip_to); - if (ip < map->first_ip) - return -IPSET_ERR_BITMAP_RANGE; - } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ip_to = ip; } - if (ip_to > map->last_ip) + if (ip < map->first_ip || ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 46ebee9400da..cc20e6d56807 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,14 +53,17 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, 
lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -101,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision) static bool load_settype(const char *name) { + if (!try_module_get(THIS_MODULE)) + return false; + nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return true; } @@ -683,6 +691,14 @@ __ip_set_put(struct ip_set *set) * a separate reference counter */ static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -700,15 +716,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -739,9 +750,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ @@ -874,7 +883,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); - strncpy(name, set->name, IPSET_MAXNAMELEN); + strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1130,7 +1139,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -1151,6 +1160,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, return ret; cleanup: + set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); @@ -1168,17 +1178,52 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } +static void +_destroy_all_sets(struct ip_set_net *inst) +{ + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; + + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 
0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } +} + static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { @@ -1190,21 +1235,18 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference * counter, so if it's already zero, we can proceed * without holding the lock. */ - read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { + read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { @@ -1214,17 +1256,14 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); + u16 features = 0; + + read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1235,10 +1274,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ret = -IPSET_ERR_BUSY; goto out; } + features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - - ip_set_destroy_set(s); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); + if (features & IPSET_TYPE_NAME) { + /* Must wait for flush to be really finished */ + rcu_barrier(); + } + call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: @@ -1328,7 +1373,7 @@ static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, goto out; } } - strncpy(set->name, name2, IPSET_MAXNAMELEN); + strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); @@ -1382,9 +1427,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } - strncpy(from_name, from->name, IPSET_MAXNAMELEN); - strncpy(from->name, to->name, IPSET_MAXNAMELEN); - strncpy(to->name, from_name, IPSET_MAXNAMELEN); + strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); + strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); + strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; @@ -1694,6 +1739,14 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { + if (retried) { + __ip_set_get_netlink(set); + nfnl_unlock(NFNL_SUBSYS_IPSET); + cond_resched(); + nfnl_lock(NFNL_SUBSYS_IPSET); + __ip_set_put_netlink(set); + } + 
ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); @@ -2334,29 +2387,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), @@ -2395,8 +2444,11 @@ ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - unregister_pernet_subsys(&ip_set_net_ops); + + /* Wait for call_rcu() in destroy */ + rcu_barrier(); + pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7c2399541771..5e4453e9ef8e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -5,6 +5,7 @@ #define _IP_SET_HASH_GEN_H #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/jhash.h> #include <linux/types.h> #include <linux/netfilter/nfnetlink.h> @@ -62,8 +63,8 @@ struct hbucket { : jhash_size((htable_bits) - HTABLE_REGION_BITS)) #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) -#define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) +#define ahash_region(n) \ + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 
0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) @@ -221,6 +222,7 @@ static const union nf_inet_addr zeromask = {}; #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init +#undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match @@ -265,6 +267,7 @@ static const union nf_inet_addr zeromask = {}; #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -429,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference(hbucket(t, i)); + n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -449,10 +452,7 @@ mtype_destroy(struct ip_set *set) struct htype *h = set->data; struct list_head *l, *lt; - if (SET_WITH_TIMEOUT(set)) - cancel_delayed_work_sync(&h->gc.dwork); - - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); @@ -598,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc) queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct htype *h = set->data; + + if (SET_WITH_TIMEOUT(set)) + cancel_delayed_work_sync(&h->gc.dwork); +} + static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); @@ -693,7 +702,7 @@ retry: #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); - nr = ahash_region(key, htable_bits); + nr = ahash_region(key); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, @@ -843,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; @@ -1041,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); rcu_read_unlock_bh(); @@ -1440,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, .region_lock = true, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 031073286236..30a655e5c4fd 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -40,7 +40,7 @@ MODULE_ALIAS("ip_set_hash:net,iface"); #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) +#define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, #include "ip_set_hash_gen.h" #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static const char *get_physindev_name(const struct sk_buff *skb) +static const char *get_physindev_name(const struct sk_buff *skb, 
struct net *net) { - struct net_device *dev = nf_bridge_get_physindev(skb); + struct net_device *dev = nf_bridge_get_physindev(skb, net); return dev ? dev->name : NULL; } @@ -177,16 +177,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; - STRLCPY(e.iface, eiface); + STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { - STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) @@ -395,16 +395,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; - STRLCPY(e.iface, eiface); + STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { - STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 005a7ce87217..bf4f91b78e1d 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -36,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net"); #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS #define IPSET_NET_COUNT 2 +#define IP_SET_HASH_WITH_NET0 /* IPv4 variant */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index e162636525cf..13c7a08aa868 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void 
*value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -424,17 +428,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - if (SET_WITH_TIMEOUT(set)) - timer_shutdown_sync(&map->gc); - - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; @@ -545,6 +540,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } +static void +list_set_cancel_gc(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); +} + static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, @@ -558,12 +565,13 @@ static const struct ip_set_type_variant set_variant = { .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, + .cancel_gc = list_set_cancel_gc, }; static void list_set_gc(struct timer_list *t) { - struct list_set *map = from_timer(map, t, gc); + struct list_set *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); @@ -603,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -619,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 271da8447b29..c203252e856d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -44,7 +44,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - range 8 20 + range 8 20 if !64BIT + range 8 27 if 64BIT default 12 help The IPVS connection hash table uses the chaining scheme to handle @@ -54,24 +55,24 @@ config IP_VS_TAB_BITS Note the table size must be power of 2. The table size will be the value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. 
It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). + from 8 to 27 for 64BIT(20 otherwise), the default number is 12, + which means the table size is 4096. Don't input the number too + small, otherwise you will lose performance on it. You can adapt the + table size yourself, according to your virtual server application. + It is good to set the table size not far less than the number of + connections per second multiplying average lasting time of + connection in the table. For example, your virtual server gets 200 + connections per second, the connection lasts for 200 seconds in + average in the connection table, the table size should be not far + less than 200x200, it is good to set the table size 32768 (2**15). Another note that each connection occupies 128 bytes effectively and each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. You can overwrite this number setting conn_tab_bits module parameter - or by appending ip_vs.conn_tab_bits=? to the kernel command line - if IP VS was compiled built-in. + or by appending ip_vs.conn_tab_bits=? to the kernel command line if + IP VS was compiled built-in. comment "IPVS transport protocol load balancing support" @@ -104,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select LIBCRC32C + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. 
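The IP_VS_TAB_BITS help text above gives a sizing rule: make the table size not far less than connections-per-second times average connection lifetime, as a power of two within the Kconfig range. A hedged sketch of that arithmetic (user-space C, not kernel code; the 200x200 example from the help text is used as input):

#include <stdio.h>

/* Round the target size up to a power-of-two exponent, starting from
 * the Kconfig minimum of 8 and stopping at @max_bits (27 on 64BIT,
 * 20 otherwise). The kernel additionally clamps against available
 * memory in ip_vs_conn_init().
 */
static int suggested_tab_bits(unsigned long conns_per_sec,
			      unsigned long avg_lifetime_sec, int max_bits)
{
	unsigned long want = conns_per_sec * avg_lifetime_sec;
	int bits = 8;

	while (bits < max_bits && (1UL << bits) < want)
		bits++;
	return bits;
}

int main(void)
{
	/* The help text's example: 200 conn/s lasting ~200 s each. */
	int bits = suggested_tab_bits(200, 200, 27);

	printf("suggested bits: %d (table size %lu)\n", bits, 1UL << bits);
	return 0;
}

For 200 conn/s over 200 s this rounds up to 2**16; the help text also accepts 2**15 = 32768, since that is "not far less than" the 40000 target.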
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index fdacbc3c15be..d54d7da58334 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -13,8 +13,7 @@ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 13534e02346c..50cc492c7553 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -17,8 +17,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/interrupt.h> #include <linux/in.h> @@ -26,12 +25,12 @@ #include <linux/net.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/rcupdate_wait.h> #include <net/net_namespace.h> #include <net/ip_vs.h> @@ -822,7 +821,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) /* Try to delete connection while not holding reference */ static void ip_vs_conn_del(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -833,7 +832,7 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp) /* Try to delete connection while holding reference */ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -846,7 +845,7 @@ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = from_timer(cp, t, timer); + struct ip_vs_conn *cp = timer_container_of(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -860,7 +859,7 @@ static void ip_vs_conn_expire(struct timer_list *t) struct ip_vs_conn *ct = cp->control; /* delete the timer if it is activated by other users */ - del_timer(&cp->timer); + timer_delete(&cp->timer); /* does anybody control me? */ if (ct) { @@ -885,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t) * conntrack cleanup for the net. */ smp_rmb(); - if (ipvs->enable) + if (READ_ONCE(ipvs->enable)) ip_vs_conn_drop_conntrack(cp); } @@ -926,7 +925,7 @@ static void ip_vs_conn_expire(struct timer_list *t) void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { /* Using mod_timer_pending will ensure the timer is not - * modified after the final del_timer in ip_vs_conn_expire. + * modified after the final timer_delete in ip_vs_conn_expire. 
*/ if (timer_pending(&cp->timer) && time_after(cp->timer.expires, jiffies)) @@ -1046,28 +1045,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - struct hlist_head *l; + unsigned int bucket; + unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) { int idx; struct ip_vs_conn *cp; - struct ip_vs_iter_state *iter = seq->private; - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + unsigned int skip = 0; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (pos-- == 0) { - iter->l = &ip_vs_conn_tab[idx]; + if (skip >= iter->skip_elems) { + iter->bucket = idx; return cp; } + + ++skip; } + + iter->skip_elems = 0; cond_resched_rcu(); } + iter->bucket = idx; return NULL; } @@ -1076,9 +1082,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { struct ip_vs_iter_state *iter = seq->private; - iter->l = NULL; rcu_read_lock(); - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; + if (*pos == 0) { + iter->skip_elems = 0; + iter->bucket = 0; + return SEQ_START_TOKEN; + } + + return ip_vs_conn_array(iter); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -1086,28 +1097,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; struct hlist_node *e; - struct hlist_head *l = iter->l; - int idx; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); + return ip_vs_conn_array(iter); /* more on same hash chain? */ e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) + if (e) { + iter->skip_elems++; return hlist_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - while (++idx < ip_vs_conn_tab_size) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - iter->l = &ip_vs_conn_tab[idx]; - return cp; - } - cond_resched_rcu(); } - iter->l = NULL; - return NULL; + + iter->skip_elems = 0; + iter->bucket++; + + return ip_vs_conn_array(iter); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1433,7 +1438,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) cond_resched_rcu(); /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) break; } rcu_read_unlock(); @@ -1481,37 +1486,44 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { + size_t tab_array_size; + int max_avail; +#if BITS_PER_LONG > 32 + int max = 27; +#else + int max = 20; +#endif + int min = 8; int idx; - /* Compute size and mask */ - if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { - pr_info("conn_tab_bits not in [8, 20]. 
Using default value\n"); - ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; - } + max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; + max_avail -= 2; /* ~4 in hash row */ + max_avail -= 1; /* IPVS up to 1/2 of mem */ + max_avail -= order_base_2(sizeof(struct ip_vs_conn)); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab))); + tab_array_size = array_size(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab)); + ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab), GFP_KERNEL); if (!ip_vs_conn_tab) return -ENOMEM; /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); return -ENOMEM; } - pr_info("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - ip_vs_conn_tab_size, - (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024); + pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", + ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); @@ -1534,5 +1546,5 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2fcc26507d69..90d56f92c0f6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -19,8 +19,7 @@ * Harald Welte don't use nfcache */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -1140,7 +1139,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, __be16 vport; unsigned int flags; - EnterFunction(12); vaddr = &svc->addr; vport = svc->port; daddr = &iph->saddr; @@ -1208,7 +1206,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); - LeaveFunction(12); return cp; } @@ -1316,13 +1313,11 @@ after_nat: ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); - LeaveFunction(11); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); - LeaveFunction(11); return NF_STOLEN; } @@ -1341,8 +1336,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat int af = state->pf; struct sock *sk; - EnterFunction(11); - /* Already marked as IPVS request or reply? 
*/ if (skb->ipvs_property) return NF_ACCEPT; @@ -1352,16 +1345,13 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - if (!ipvs->enable) - return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1556,6 +1546,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ @@ -1566,7 +1557,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; - return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + + gre_flags_to_tnl_flags(flags, greh->flags); + + return gre_calc_hlen(flags); } unk: @@ -1942,7 +1936,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); @@ -1952,7 +1946,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } @@ -2110,7 +2104,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb, int r; /* ipvs enabled in this netns ? 
*/ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { @@ -2297,7 +2291,7 @@ static int __net_init __ip_vs_init(struct net *net) return -ENOMEM; /* Hold the beast until a service is registered */ - ipvs->enable = 0; + WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); @@ -2365,16 +2359,14 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) struct netns_ipvs *ipvs; struct net *net; - EnterFunction(2); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); - ipvs->enable = 0; /* Disable packet reception */ + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } - LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { @@ -2458,3 +2450,4 @@ static void __exit ip_vs_cleanup(void) module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Virtual Server"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 2a5ed71c82c3..068702894377 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -13,8 +13,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/init.h> @@ -94,6 +93,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; + int amemthresh; int nomem; int to_change = -1; @@ -105,7 +105,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < ipvs->sysctl_amemthresh); + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); + nomem = (availmem < amemthresh); local_bh_disable(); @@ -145,9 +146,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 1: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; @@ -155,9 +155,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 2: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; @@ -256,7 +255,7 @@ static void est_reload_work_handler(struct work_struct *work) struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock; if (!kd) continue; @@ -848,7 +847,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - del_timer_sync(&ipvs->dest_trash_timer); + timer_delete_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); @@ -1061,8 +1060,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) unsigned int atype; int ret; - EnterFunction(2); - #ifdef CONFIG_IP_VS_IPV6 if 
(udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); @@ -1111,7 +1108,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) spin_lock_init(&dest->dst_lock); __ip_vs_update_dest(svc, dest, udest, 1); - LeaveFunction(2); return 0; err_stats: @@ -1134,8 +1130,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) __be16 dport = udest->port; int ret; - EnterFunction(2); - if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; @@ -1183,7 +1177,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) - goto err; + return ret; __ip_vs_update_dest(svc, dest, udest, 1); } else { /* @@ -1192,9 +1186,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ret = ip_vs_new_dest(svc, udest); } -err: - LeaveFunction(2); - return ret; } @@ -1209,8 +1200,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) union nf_inet_addr daddr; __be16 dport = udest->port; - EnterFunction(2); - if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; @@ -1242,7 +1231,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) } __ip_vs_update_dest(svc, dest, udest, 0); - LeaveFunction(2); return 0; } @@ -1317,8 +1305,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) struct ip_vs_dest *dest; __be16 dport = udest->port; - EnterFunction(2); - /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); @@ -1339,14 +1325,13 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) */ __ip_vs_del_dest(svc->ipvs, dest, false); - LeaveFunction(2); - return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); + struct netns_ipvs *ipvs = timer_container_of(ipvs, t, + dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1474,18 +1459,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; - /* Bind the ct retriever */ - RCU_INIT_POINTER(svc->pe, pe); - pe = NULL; - /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - if (svc->pe && svc->pe->conn_out) + if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1497,9 +1482,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; - if (!ipvs->enable) { + if (!READ_ONCE(ipvs->enable)) { /* Now there is a service - full throttle */ - ipvs->enable = 1; + WRITE_ONCE(ipvs->enable, 1); /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); @@ -1746,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) struct netns_ipvs *ipvs; struct net *net; - EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { @@ -1754,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) ip_vs_flush(ipvs, true); } 
mutex_unlock(&__ip_vs_mutex); - LeaveFunction(2); } /* Put all references for device (dst_cache) */ @@ -1792,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); - EnterFunction(2); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { @@ -1821,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); - LeaveFunction(2); return NOTIFY_DONE; } @@ -1865,7 +1846,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) #ifdef CONFIG_SYSCTL static int -proc_do_defense_mode(struct ctl_table *table, int write, +proc_do_defense_mode(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -1892,9 +1873,10 @@ proc_do_defense_mode(struct ctl_table *table, int write, } static int -proc_do_sync_threshold(struct ctl_table *table, int write, +proc_do_sync_threshold(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; @@ -1904,6 +1886,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, .mode = table->mode, }; + mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { @@ -1913,11 +1896,12 @@ proc_do_sync_threshold(struct ctl_table *table, int write, else memcpy(valp, val, sizeof(val)); } + mutex_unlock(&ipvs->sync_mutex); return rc; } static int -proc_do_sync_ports(struct ctl_table *table, int write, +proc_do_sync_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1940,7 +1924,8 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } -static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, + void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; @@ -1978,8 +1963,8 @@ out: return ret; } -static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, - size_t size) +static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, + void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; @@ -1999,7 +1984,7 @@ static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, return ret; } -static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, +static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2026,7 +2011,7 @@ static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, return ret; } -static int ipvs_proc_est_nice(struct ctl_table *table, int write, +static int ipvs_proc_est_nice(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -2056,7 +2041,7 @@ static int ipvs_proc_est_nice(struct ctl_table *table, int write, return ret; } -static int ipvs_proc_run_estimation(struct ctl_table *table, int write, +static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = 
table->extra2; @@ -2279,7 +2264,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -3107,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; - int size; + size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3148,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; - int size; + size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3678,10 +3662,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); - if (nla_addr_family) - udest->af = nla_get_u16(nla_addr_family); - else - udest->af = 0; + udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -4285,6 +4266,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; + size_t ctl_table_size = ARRAY_SIZE(vs_vars); + bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); @@ -4299,10 +4282,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - tbl[0].procname = NULL; } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4328,10 +4307,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; @@ -4340,6 +4326,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; @@ -4353,15 +4340,22 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; 
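The get/set-sockopt length checks above switch from int to size_t and compute the expected buffer size with struct_size(), which saturates instead of wrapping when the user-supplied element count is hostile; the pr_err() format strings move to %zu to match. A rough userspace sketch of that helper with hypothetical types, using GCC/Clang overflow builtins in place of the kernel's <linux/overflow.h>:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct get_dests_like {                 /* stand-in for ip_vs_get_dests */
            unsigned int num_dests;
            struct { uint32_t addr; uint16_t port; } entrytable[];
    };

    /* sizeof(header) + n * sizeof(element), saturating to SIZE_MAX on overflow */
    static size_t struct_size_like(size_t hdr, size_t elem, size_t n)
    {
            size_t bytes;

            if (__builtin_mul_overflow(n, elem, &bytes) ||
                __builtin_add_overflow(bytes, hdr, &bytes))
                    return SIZE_MAX;
            return bytes;
    }

    int main(void)
    {
            struct get_dests_like *get = NULL;
            size_t size = struct_size_like(sizeof(*get),
                                           sizeof(get->entrytable[0]), 3);

            printf("3 entries -> %zu bytes\n", size);
            return 0;
    }

With an int result, the value coming back from struct_size() could truncate and the %u format no longer matched the type; carrying size_t end-to-end keeps both the comparison and the diagnostic exact.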
ipvs->est_cpulist_valid = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; @@ -4372,7 +4366,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) #endif ret = -ENOMEM; - ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); + ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, + ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; @@ -4537,8 +4532,6 @@ int __init ip_vs_control_init(void) int idx; int ret; - EnterFunction(2); - /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); @@ -4551,15 +4544,12 @@ int __init ip_vs_control_init(void) if (ret < 0) return ret; - LeaveFunction(2); return 0; } void ip_vs_control_cleanup(void) { - EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); /* relying on common rcu_barrier() in ip_vs_cleanup() */ - LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 5e6ec32aff2b..bb7aca4601ff 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -270,3 +269,4 @@ static void __exit ip_vs_dh_cleanup(void) module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs destination hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index ce2a1549b304..77f4f637ff67 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -12,8 +12,7 @@ * get_stats()) do the per cpu summing. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/jiffies.h> @@ -21,6 +20,7 @@ #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> +#include <linux/rcupdate_wait.h> #include <net/ip_vs.h> @@ -230,7 +230,7 @@ static int ip_vs_estimation_kthread(void *data) void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ @@ -264,7 +264,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, } set_user_nice(kd->task, sysctl_est_nice(ipvs)); - set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + if (sysctl_est_preferred_cpulist(ipvs)) + kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); @@ -304,7 +305,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - ipvs->enable && ipvs->est_max_threads) + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); @@ -341,7 +342,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -484,7 +485,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && ipvs->enable) + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; @@ -549,7 +550,7 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) __set_bit(row, kd->avail); if (!kd->tick_len[row]) { RCU_INIT_POINTER(kd->ticks[row], NULL); - kfree_rcu(td); + kfree_rcu(td, rcu_head); } kd->est_count--; if (kd->est_count) { @@ -661,7 +662,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; } @@ -679,7 +680,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) rcu_read_unlock(); local_bh_enable(); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; cond_resched(); @@ -755,7 +756,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) @@ -785,7 +786,7 @@ last_kt: id = ipvs->est_kt_count; next_kt: - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto unlock; id--; if (id < 0) diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index b846cc385279..d657b47c6511 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -8,8 +8,7 @@ * Kenny Mathis : added initial functionality based on weight */ -#define 
KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -72,3 +71,4 @@ static void __exit ip_vs_fo_cleanup(void) module_init(ip_vs_fo_init); module_exit(ip_vs_fo_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted failover scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index ef1f45e43b63..b315c608fda4 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -16,8 +16,7 @@ * Author: Wouter Gadeyne */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/moduleparam.h> @@ -35,7 +34,7 @@ #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/ip_vs.h> @@ -53,6 +52,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. @@ -605,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -627,6 +627,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } @@ -635,3 +636,4 @@ static void __exit ip_vs_ftp_exit(void) module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs ftp helper"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 1b87214d385e..e6c8ed0c92f6 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -34,8 +34,7 @@ * me to write this module. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -123,7 +122,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -293,7 +291,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -550,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = { static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ -563,15 +563,16 @@ static int __net_init __ip_vs_lblc_init(struct net *net) /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) - ipvs->lblc_ctl_table[0].procname = NULL; + vars_table_size = 0; } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; - ipvs->lblc_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + ipvs->lblc_ctl_table, + vars_table_size); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); @@ -628,3 +629,4 @@ static void __exit ip_vs_lblc_cleanup(void) module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index ad8f5fea6d3a..a25cf7bb6185 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/module.h> @@ -294,7 +293,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -457,7 +455,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -736,6 +735,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; @@ -749,14 +749,15 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) - ipvs->lblcr_ctl_table[0].procname = NULL; + vars_table_size = 0; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; - ipvs->lblcr_ctl_header = - register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", + 
ipvs->lblcr_ctl_table, + vars_table_size); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); @@ -813,3 +814,4 @@ static void __exit ip_vs_lblcr_cleanup(void) module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 9d34d81fc6f1..38cc38c5d8bb 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -9,8 +9,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -86,3 +85,4 @@ static void __exit ip_vs_lc_cleanup(void) module_init(ip_vs_lc_init); module_exit(ip_vs_lc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index e3d7f5c879ce..f61f54004c9e 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 08adcb222986..81974f69e5bb 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -30,8 +30,7 @@ * PASV response can not be NAT-ed) but Active FTP should work */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/types.h> diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index f56862a87518..ada158c610ce 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -26,8 +26,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -136,3 +135,4 @@ static void __exit ip_vs_nq_cleanup(void) module_init(ip_vs_nq_init); module_exit(ip_vs_nq_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs never queue scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c03066fdd5ca..c5c67df80a0b 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -12,8 +12,7 @@ * active connections */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -79,3 +78,4 @@ static void __exit ip_vs_ovf_cleanup(void) module_init(ip_vs_ovf_init); module_exit(ip_vs_ovf_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs overflow connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 166c669f0763..3035079ebd99 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 
0ac6705a61d3..85f31d71e29a 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -185,3 +184,4 @@ static void __exit ip_vs_sip_cleanup(void) module_init(ip_vs_sip_init); module_exit(ip_vs_sip_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs sip helper"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f100da4ba3bc..fd9dbca24c85 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -8,8 +8,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -340,7 +339,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_protocol_init(void) { - char protocols[64]; + char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ @@ -348,8 +347,6 @@ int __init ip_vs_protocol_init(void) strcat(protocols, (p)->name); \ } while (0) - protocols[0] = '\0'; - protocols[2] = '\0'; #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 89602c16f6b6..44e14acc187e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -6,8 +6,7 @@ * Wensong Zhang <wensong@linuxvirtualserver.org> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a0921adc31a9..83e452916403 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 7da51390cea6..f68a1533ee45 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -13,8 +13,7 @@ * protocol ip_vs_proto_data and is handled by netns */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 68260d91c988..0f0107c80dd2 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,8 +9,7 @@ * Network name space (netns) aware. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 38495c6f6c7c..4125ee561cdc 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -122,4 +121,5 @@ static void __exit ip_vs_rr_cleanup(void) module_init(ip_vs_rr_init); module_exit(ip_vs_rr_cleanup); +MODULE_DESCRIPTION("ipvs round-robin scheduler"); MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d4903723be7e..c6e421c4e299 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -12,8 +12,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 7663288e5358..245a323c84cd 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -137,3 +136,4 @@ static void __exit ip_vs_sed_cleanup(void) module_init(ip_vs_sed_init); module_exit(ip_vs_sed_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs shortest expected delay scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index c2028e412092..0e85e07e23b9 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -376,3 +375,4 @@ static void __exit ip_vs_sh_cleanup(void) module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs source hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 4963fec815da..54dd1514ac45 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -32,8 +32,7 @@ * Persistence support, fwmark and time-out. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/slab.h> @@ -51,7 +50,7 @@ #include <linux/kernel.h> #include <linux/sched/signal.h> -#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ +#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */ #include <net/ip.h> #include <net/sock.h> @@ -603,7 +602,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { struct ip_vs_sync_conn_options *opt = (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); + memcpy(opt, &cp->sync_conn_opt, sizeof(*opt)); } m->nr_conns++; @@ -1297,20 +1296,14 @@ static void set_sock_size(struct sock *sk, int mode, int val) */ static void set_mcast_loop(struct sock *sk, u_char loop) { - struct inet_sock *inet = inet_sk(sk); - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; + inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 - if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - + if (READ_ONCE(sk->sk_family) == AF_INET6) { /* IPV6_MULTICAST_LOOP */ - np->mc_loop = loop ? 1 : 0; + inet6_assign_bit(MC6_LOOP, sk, loop); } #endif - release_sock(sk); } /* @@ -1322,13 +1315,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); - inet->mc_ttl = ttl; + WRITE_ONCE(inet->mc_ttl, ttl); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_HOPS */ - np->mcast_hops = ttl; + WRITE_ONCE(np->mcast_hops, ttl); } #endif release_sock(sk); @@ -1341,13 +1334,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ lock_sock(sk); - inet->pmtudisc = val; + WRITE_ONCE(inet->pmtudisc, val); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MTU_DISCOVER */ - np->pmtudisc = val; + WRITE_ONCE(np->pmtudisc, val); } #endif release_sock(sk); @@ -1371,7 +1364,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev) struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_IF */ - np->mcast_oif = dev->ifindex; + WRITE_ONCE(np->mcast_oif, dev->ifindex); } #endif release_sock(sk); @@ -1441,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1507,8 +1500,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1548,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen); if 
(result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; @@ -1582,13 +1575,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) struct kvec iov; int len; - EnterFunction(7); iov.iov_base = (void *)buffer; iov.iov_len = length; len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - LeaveFunction(7); return len; } @@ -1614,15 +1605,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) struct kvec iov = {buffer, buflen}; int len; - EnterFunction(7); - /* Receive a packet */ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; - LeaveFunction(7); return len; } diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index 3308e4cc740a..dbb7f5fd4688 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -4,8 +4,7 @@ * Authors: Darby Payne <darby.payne@applovin.com> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/module.h> @@ -137,3 +136,4 @@ static void __exit ip_vs_twos_cleanup(void) module_init(ip_vs_twos_init); module_exit(ip_vs_twos_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs power of twos choice scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 09f584b564a0..9da445ca09a1 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -109,3 +108,4 @@ static void __exit ip_vs_wlc_cleanup(void) module_init(ip_vs_wlc_init); module_exit(ip_vs_wlc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 1bc7a0789d85..99f09cbf2d9b 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -13,8 +13,7 @@ * with weight 0 when all weights are zero */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -263,3 +262,4 @@ static void __exit ip_vs_wrr_cleanup(void) module_init(ip_vs_wrr_init); module_exit(ip_vs_wrr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted round-robin scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 029171379884..3162ce3c2640 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -21,8 +21,7 @@ * - the only place where we can see skb->sk != NULL */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/slab.h> @@ -97,7 +96,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest) if (!dest_dst) return NULL; dst = dest_dst->dst_cache; - if (dst->obsolete && + if (READ_ONCE(dst->obsolete) && dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; return dest_dst; @@ -119,13 +118,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable 
*do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +133,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? */ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); - loop = true; + flowi4_update_output(&fl4, 0, daddr, fl4.saddr); + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -180,7 +172,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (addr_type & IPV6_ADDR_LOOPBACK); old_rt_is_local = __ip_vs_is_local_route6( - (struct rt6_info *)skb_dst(skb)); + dst_rt6_info(skb_dst(skb))); } else #endif { @@ -271,7 +263,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } @@ -286,7 +278,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, { if (ip_hdr(skb)->ttl <= 1) { /* Tell the sender its packet died... */ - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); return false; } @@ -318,7 +310,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); @@ -339,24 +331,20 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__rcuref)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 
1 : 0; @@ -390,10 +378,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { @@ -481,7 +469,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rt6_info *) dest_dst->dst_cache; + rt = dst_rt6_info(dest_dst->dst_cache); else { u32 cookie; @@ -501,13 +489,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, - atomic_read(&rt->dst.__refcnt)); + rcuref_read(&rt->dst.__rcuref)); } if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.in6; @@ -517,7 +505,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, rt_mode); if (!dst) goto err_unreach; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); } local = __ip_vs_is_local_route6(rt); @@ -553,10 +541,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { @@ -706,8 +694,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct iphdr *iph = ip_hdr(skb); - EnterFunction(10); - if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -719,12 +705,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ -735,8 +719,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { struct ipv6hdr *iph = ipv6_hdr(skb); - EnterFunction(10); - if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) @@ -747,12 +729,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } #endif @@ -768,8 +748,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int local, rc, was_input; - EnterFunction(10); - /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; @@ -839,12 +817,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - LeaveFunction(10); return rc; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ 
-856,8 +832,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ int local, rc; - EnterFunction(10); - /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { __be16 _pt, *p; @@ -876,7 +850,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -927,11 +901,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - LeaveFunction(10); return rc; tx_error: - LeaveFunction(10); kfree_skb(skb); return NF_STOLEN; } @@ -994,7 +966,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ @@ -1098,11 +1070,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb, { __be16 proto = *next_protocol == IPPROTO_IPIP ? htons(ETH_P_IP) : htons(ETH_P_IPV6); - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t hdrlen; if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); hdrlen = gre_calc_hlen(tflags); gre_build_header(skb, hdrlen, tflags, proto, 0, 0); @@ -1149,8 +1121,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, int tun_type, gso_type; int tun_flags; - EnterFunction(10); - local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1183,11 +1153,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1199,7 +1169,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, &next_protocol, NULL, &dsfield, &ttl, dfp); if (IS_ERR(skb)) - goto tx_error; + return NF_STOLEN; gso_type = __tun_gso_type_mask(AF_INET, cp->af); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { @@ -1225,6 +1195,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1267,14 +1238,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, else if (ret == NF_DROP) kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; tx_error: - if (!IS_ERR(skb)) - kfree_skb(skb); - LeaveFunction(10); + kfree_skb(skb); return NF_STOLEN; } @@ -1298,8 +1265,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, int tun_type, gso_type; int tun_flags; - EnterFunction(10); - local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, @@ -1311,7 +1276,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (local) return 
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); tdev = rt->dst.dev; /* @@ -1333,11 +1298,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1347,7 +1312,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &next_protocol, &payload_len, &dsfield, &ttl, NULL); if (IS_ERR(skb)) - goto tx_error; + return NF_STOLEN; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { @@ -1373,6 +1338,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1414,14 +1380,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, else if (ret == NF_DROP) kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; tx_error: - if (!IS_ERR(skb)) - kfree_skb(skb); - LeaveFunction(10); + kfree_skb(skb); return NF_STOLEN; } #endif @@ -1437,8 +1399,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { int local; - EnterFunction(10); - local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1455,12 +1415,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } @@ -1471,8 +1429,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { int local; - EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, @@ -1489,12 +1445,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); - LeaveFunction(10); return NF_STOLEN; tx_error: kfree_skb(skb); - LeaveFunction(10); return NF_STOLEN; } #endif @@ -1514,8 +1468,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, int local; int rt_mode, was_input; - EnterFunction(10); - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ @@ -1526,7 +1478,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - goto out; + return rc; } /* @@ -1582,14 +1534,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; - rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); - goto out; + return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); tx_error: kfree_skb(skb); rc = NF_STOLEN; - out: - LeaveFunction(10); return rc; } @@ -1604,8 +1553,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, int local; int rt_mode; - EnterFunction(10); - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ @@ -1616,7 +1563,7 @@ 
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - goto out; + return rc; } /* @@ -1631,7 +1578,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1671,14 +1618,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; - rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); - goto out; + return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); tx_error: kfree_skb(skb); rc = NF_STOLEN; -out: - LeaveFunction(10); return rc; } #endif diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c new file mode 100644 index 000000000000..46e667a50d98 --- /dev/null +++ b/net/netfilter/nf_bpf_link.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_bpf_link.h> +#include <uapi/linux/netfilter_ipv4.h> + +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, + const struct nf_hook_state *s) +{ + const struct bpf_prog *prog = bpf_prog; + struct bpf_nf_ctx ctx = { + .state = s, + .skb = skb, + }; + + return bpf_prog_run_pin_on_cpu(prog, &ctx); +} + +struct bpf_nf_link { + struct bpf_link link; + struct nf_hook_ops hook_ops; + netns_tracker ns_tracker; + struct net *net; + u32 dead; + const struct nf_defrag_hook *defrag_hook; +}; + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static const struct nf_defrag_hook * +get_proto_defrag_hook(struct bpf_nf_link *link, + const struct nf_defrag_hook __rcu **ptr_global_hook, + const char *mod) +{ + const struct nf_defrag_hook *hook; + int err; + + /* RCU protects us from races against module unloading */ + rcu_read_lock(); + hook = rcu_dereference(*ptr_global_hook); + if (!hook) { + rcu_read_unlock(); + err = request_module("%s", mod); + if (err) + return ERR_PTR(err < 0 ? 
err : -EINVAL); + + rcu_read_lock(); + hook = rcu_dereference(*ptr_global_hook); + } + + if (hook && try_module_get(hook->owner)) { + /* Once we have a refcnt on the module, we no longer need RCU */ + hook = rcu_pointer_handoff(hook); + } else { + WARN_ONCE(!hook, "%s has bad registration", mod); + hook = ERR_PTR(-ENOENT); + } + rcu_read_unlock(); + + if (!IS_ERR(hook)) { + err = hook->enable(link->net); + if (err) { + module_put(hook->owner); + hook = ERR_PTR(err); + } + } + + return hook; +} +#endif + +static int bpf_nf_enable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook __maybe_unused *hook; + + switch (link->hook_ops.pf) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + case NFPROTO_IPV4: + hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + case NFPROTO_IPV6: + hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif + default: + return -EAFNOSUPPORT; + } +} + +static void bpf_nf_disable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook *hook = link->defrag_hook; + + if (!hook) + return; + hook->disable(link->net); + module_put(hook->owner); +} + +static void bpf_nf_link_release(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + if (nf_link->dead) + return; + + /* do not double release in case .detach was already called */ + if (!cmpxchg(&nf_link->dead, 0, 1)) { + nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); + bpf_nf_disable_defrag(nf_link); + put_net_track(nf_link->net, &nf_link->ns_tracker); + } +} + +static void bpf_nf_link_dealloc(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + kfree(nf_link); +} + +static int bpf_nf_link_detach(struct bpf_link *link) +{ + bpf_nf_link_release(link); + return 0; +} + +static void bpf_nf_link_show_info(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", + nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, + nf_link->hook_ops.priority); +} + +static int bpf_nf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + const struct nf_defrag_hook *hook = nf_link->defrag_hook; + + info->netfilter.pf = nf_link->hook_ops.pf; + info->netfilter.hooknum = nf_link->hook_ops.hooknum; + info->netfilter.priority = nf_link->hook_ops.priority; + info->netfilter.flags = hook ? 
BPF_F_NETFILTER_IP_DEFRAG : 0; + + return 0; +} + +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + return -EOPNOTSUPP; +} + +static const struct bpf_link_ops bpf_nf_link_lops = { + .release = bpf_nf_link_release, + .dealloc = bpf_nf_link_dealloc, + .detach = bpf_nf_link_detach, + .show_fdinfo = bpf_nf_link_show_info, + .fill_link_info = bpf_nf_link_fill_link_info, + .update_prog = bpf_nf_link_update, +}; + +static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) +{ + int prio; + + switch (attr->link_create.netfilter.pf) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS) + return -EPROTO; + break; + default: + return -EAFNOSUPPORT; + } + + if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) + return -EOPNOTSUPP; + + /* make sure conntrack confirm is always last */ + prio = attr->link_create.netfilter.priority; + if (prio == NF_IP_PRI_FIRST) + return -ERANGE; /* sabotage_in and other warts */ + else if (prio == NF_IP_PRI_LAST) + return -ERANGE; /* e.g. conntrack confirm */ + else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && + prio <= NF_IP_PRI_CONNTRACK_DEFRAG) + return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ + + return 0; +} + +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_nf_link *link; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + err = bpf_nf_check_pf_and_hooks(attr); + if (err) + return err; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog, + attr->link_create.attach_type); + + link->hook_ops.hook = nf_hook_run_bpf; + link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; + link->hook_ops.priv = prog; + + link->hook_ops.pf = attr->link_create.netfilter.pf; + link->hook_ops.priority = attr->link_create.netfilter.priority; + link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; + + link->net = net; + link->dead = false; + link->defrag_hook = NULL; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { + err = bpf_nf_enable_defrag(link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + + err = nf_register_net_hook(net, &link->hook_ops); + if (err) { + bpf_nf_disable_defrag(link); + bpf_link_cleanup(&link_primer); + return err; + } + + get_net_track(net, &link->ns_tracker, GFP_KERNEL); + + return bpf_link_settle(&link_primer); +} + +const struct bpf_prog_ops netfilter_prog_ops = { + .test_run = bpf_prog_test_run_nf, +}; + +static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name) +{ + struct btf *btf; + s32 type_id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf)) + return false; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (WARN_ON_ONCE(type_id < 0)) + return false; + + info->btf = btf; + info->btf_id = type_id; + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + return true; +} + +static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_nf_ctx)) + return false; + + if (off % size != 0) + return false; + + if (type == 
BPF_WRITE) + return false; + + switch (off) { + case bpf_ctx_range(struct bpf_nf_ctx, skb): + if (size != sizeof_field(struct bpf_nf_ctx, skb)) + return false; + + return nf_ptr_to_btf_id(info, "sk_buff"); + case bpf_ctx_range(struct bpf_nf_ctx, state): + if (size != sizeof_field(struct bpf_nf_ctx, state)) + return false; + + return nf_ptr_to_btf_id(info, "nf_hook_state"); + default: + return false; + } + + return false; +} + +static const struct bpf_func_proto * +bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +const struct bpf_verifier_ops netfilter_verifier_ops = { + .is_valid_access = nf_is_valid_access, + .get_func_proto = bpf_nf_func_proto, +}; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 5d8ed6c90b7e..f1be4dd5cf85 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -122,17 +122,67 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; - if (time_is_after_eq_jiffies((unsigned long)list->last_gc)) + if (ct && nf_ct_is_confirmed(ct)) { + if (refcounted) + nf_ct_put(ct); + return -EEXIST; + } + + if ((u32)jiffies == list->last_gc) goto add_new_node; /* check the saved connections */ @@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -165,7 +215,7 @@ 
static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -188,31 +238,35 @@ add_new_node: if (conn == NULL) return -ENOMEM; - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; + +out_put: + if (refcounted) + nf_ct_put(ct); return 0; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -224,8 +278,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. */ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -234,11 +288,7 @@ bool nf_conncount_gc_list(struct net *net, bool ret = false; /* don't bother if we just did GC */ - if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc))) - return false; - - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) + if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { @@ -269,7 +319,21 @@ bool nf_conncount_gc_list(struct net *net, if (!list->count) ret = true; list->last_gc = (u32)jiffies; - spin_unlock(&list->list_lock); + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } @@ -309,20 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - u8 keylen = data->keylen; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -333,7 +399,7 @@ restart: rbconn = rb_entry(*rbnode, struct 
nf_conncount_rb, node); parent = *rbnode; - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { @@ -341,8 +407,8 @@ restart: } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -365,28 +431,35 @@ restart: goto restart; } - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } - conn->tuple = *tuple; - conn->zone = *zone; - memcpy(rbconn->key, key, sizeof(u32) * keylen); + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + + if (refcounted) + nf_ct_put(ct); + } out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; @@ -394,16 +467,15 @@ out_unlock: static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int hash; - u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; @@ -414,7 +486,7 @@ count_tree(struct net *net, rbconn = rb_entry(parent, struct nf_conncount_rb, node); - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { @@ -422,7 +494,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -437,19 +509,23 @@ count_tree(struct net *net, } /* same source network -> be counted! 
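
A minimal caller sketch for the reworked conncount API above, assuming only the signatures visible in these hunks (nf_conncount_count_skb() now derives tuple, zone and conntrack from the skb itself); the wrapper function and limit parameter are illustrative:

	#include <linux/skbuff.h>
	#include <net/netfilter/nf_conntrack_count.h>

	static bool example_connlimit_exceeded(struct net *net,
					       const struct sk_buff *skb,
					       u16 l3num,
					       struct nf_conncount_data *data,
					       const u32 *key, u32 limit)
	{
		unsigned int count;

		/* counts entries matching 'key'; since skb != NULL, the
		 * skb's tuple is also inserted into the accounting tree
		 */
		count = nf_conncount_count_skb(net, skb, l3num, data, key);

		return count > limit;
	}

Passing a NULL skb keeps the old count-only behaviour, as the !skb branches in count_tree() show.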
*/ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -511,24 +587,24 @@ next: } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. */ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen) +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; - int ret, i; + int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || @@ -541,12 +617,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family if (!data) return ERR_PTR(-ENOMEM); - ret = nf_ct_netns_get(net, family); - if (ret < 0) { - kfree(data); - return ERR_PTR(ret); - } - for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; @@ -583,13 +653,11 @@ static void destroy_tree(struct rb_root *r) } } -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data) +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); - nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); @@ -605,15 +673,11 @@ static int __init nf_conncount_modinit(void) for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); - conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", - sizeof(struct nf_conncount_tuple), - 0, 0, NULL); + conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; - conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", - sizeof(struct nf_conncount_rb), - 0, 0, NULL); + conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index d011d2eb0848..7be4c35e4795 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, skb, 
master_timeout * HZ); + nf_ct_refresh(ct, master_timeout * HZ); /* No data? */ dataoff = protoff + sizeof(struct udphdr); diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 24002bc61e07..4a136fc3a9c0 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> +#include <net/xdp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> @@ -31,7 +32,9 @@ * -EINVAL - Passed NULL for bpf_tuple pointer * -EINVAL - opts->reserved is not 0 * -EINVAL - netns_id is less than -1 - * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) + * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 + * -EINVAL - opts->ct_zone_id set when + opts__sz isn't NF_BPF_CT_OPTS_SZ (16) * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP * -ENONET - No network namespace found for netns_id * -ENOENT - Conntrack lookup could not find entry for tuple @@ -41,6 +44,8 @@ * Values: * IPPROTO_TCP, IPPROTO_UDP * @dir: - connection tracking tuple direction. + * @ct_zone_id - connection tracking zone id. + * @ct_zone_dir - connection tracking zone direction. * @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -50,11 +55,13 @@ struct bpf_ct_opts { s32 error; u8 l4proto; u8 dir; - u8 reserved[2]; + u16 ct_zone_id; + u8 ct_zone_dir; + u8 reserved[3]; }; enum { - NF_BPF_CT_OPTS_SZ = 12, + NF_BPF_CT_OPTS_SZ = 16, }; static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, @@ -103,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 timeout) { struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) return ERR_PTR(-EINVAL); @@ -129,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, return ERR_PTR(-ENONET); } - ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, GFP_ATOMIC); if (IS_ERR(ct)) goto out; @@ -151,12 +176,21 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->l4proto != 
IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) return ERR_PTR(-EPROTO); if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) @@ -173,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ERR_PTR(-ENONET); } - hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + hash = nf_conntrack_find_get(net, &ct_zone, &tuple); if (opts->netns_id >= 0) put_net(net); if (!hash) @@ -192,8 +235,7 @@ BTF_ID(struct, nf_conn___init) /* Check writes into `struct nf_conn` */ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { const struct btf_type *ncit, *nct, *t; size_t end; @@ -230,9 +272,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, return 0; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_conntrack BTF"); +__bpf_kfunc_start_defs(); /* bpf_xdp_ct_alloc - Allocate a new CT entry * @@ -247,9 +287,9 @@ __diag_ignore_all("-Wmissing-prototypes", * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ -struct nf_conn___init * +__bpf_kfunc struct nf_conn___init * bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -281,9 +321,9 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ -struct nf_conn * +__bpf_kfunc struct nf_conn * bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -314,9 +354,9 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ -struct nf_conn___init * +__bpf_kfunc struct nf_conn___init * bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -349,9 +389,9 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ -struct nf_conn * +__bpf_kfunc struct nf_conn * bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -376,11 +416,13 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @nfct - Pointer to referenced nf_conn___init object, obtained * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc. 
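
For the zone-aware lookup path, a BPF-side sketch (modelled on the kernel's BPF selftest style, not taken from this patch): the local struct mirrors the 16-byte bpf_ct_opts above, the program name is illustrative, and zone 23 is an arbitrary example.

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	struct bpf_ct_opts___local {
		s32 netns_id;
		s32 error;
		u8 l4proto;
		u8 dir;
		u16 ct_zone_id;
		u8 ct_zone_dir;
		u8 reserved[3];
	};

	struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					  struct bpf_sock_tuple *bpf_tuple, u32 len,
					  struct bpf_ct_opts___local *opts,
					  u32 opts_len) __ksym;
	void bpf_ct_release(struct nf_conn *ct) __ksym;

	SEC("tc")
	int ct_zone_lookup(struct __sk_buff *skb)
	{
		struct bpf_ct_opts___local opts = {
			.netns_id   = -1,	/* BPF_F_CURRENT_NETNS */
			.l4proto    = 6,	/* IPPROTO_TCP */
			.ct_zone_id = 23,	/* non-default zone */
			/* .ct_zone_dir == 0 -> NF_CT_DEFAULT_ZONE_DIR, per the hunk */
		};
		struct bpf_sock_tuple tup = {
			.ipv4.saddr = bpf_htonl(0xc0a80101),	/* 192.168.1.1 */
			.ipv4.daddr = bpf_htonl(0xc0a80102),
			.ipv4.sport = bpf_htons(4321),
			.ipv4.dport = bpf_htons(80),
		};
		struct nf_conn *ct;

		/* opts__sz == 16 selects the zone-aware layout */
		ct = bpf_skb_ct_lookup(skb, &tup, sizeof(tup.ipv4),
				       &opts, sizeof(opts));
		if (ct)
			bpf_ct_release(ct);
		return 0;
	}

	char _license[] SEC("license") = "GPL";

Passing opts__sz of 12 selects the old layout, where ct_zone_id must be zero and the default zone is used, which is what keeps existing programs working.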
*/ -struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) +__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) { struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + if (!nf_ct_is_confirmed(nfct)) + nfct->timeout += nfct_time_stamp; nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { @@ -400,10 +442,8 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) * @nf_conn - Pointer to referenced nf_conn object, obtained using * bpf_xdp_ct_lookup or bpf_skb_ct_lookup. */ -void bpf_ct_release(struct nf_conn *nfct) +__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct) { - if (!nfct) - return; nf_ct_put(nfct); } @@ -417,7 +457,7 @@ void bpf_ct_release(struct nf_conn *nfct) * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. * @timeout - Timeout in msecs. */ -void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) +__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) { __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout)); } @@ -432,7 +472,7 @@ void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup. * @timeout - New timeout in msecs. */ -int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) +__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) { return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout)); } @@ -447,7 +487,7 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. * @status - New status value. */ -int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) +__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) { return nf_ct_change_status_common((struct nf_conn *)nfct, status); } @@ -462,14 +502,14 @@ int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup. * @status - New status value. 
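
The allocate-then-insert flow is affected by the timeout fix above: a timeout set on a still-unconfirmed entry stays relative until bpf_ct_insert_entry() adds nfct_time_stamp. A sketch with declarations as in the previous example; the helper name is illustrative:

	struct nf_conn___init *bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx,
						struct bpf_sock_tuple *bpf_tuple,
						u32 len,
						struct bpf_ct_opts___local *opts,
						u32 opts_len) __ksym;
	struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) __ksym;
	void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) __ksym;

	static __always_inline int example_create_entry(struct xdp_md *ctx,
							struct bpf_sock_tuple *tup,
							u32 len,
							struct bpf_ct_opts___local *opts)
	{
		struct nf_conn___init *nct;
		struct nf_conn *ct;

		nct = bpf_xdp_ct_alloc(ctx, tup, len, opts, sizeof(*opts));
		if (!nct)
			return XDP_PASS;

		bpf_ct_set_timeout(nct, 10000);	/* 10s, relative while unconfirmed */
		ct = bpf_ct_insert_entry(nct);	/* converted to absolute deadline here */
		if (ct)
			bpf_ct_release(ct);
		return XDP_PASS;
	}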
*/ -int bpf_ct_change_status(struct nf_conn *nfct, u32 status) +__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) { return nf_ct_change_status_common(nfct, status); } -__diag_pop() +__bpf_kfunc_end_defs(); -BTF_SET8_START(nf_ct_kfunc_set) +BTF_KFUNCS_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) @@ -480,7 +520,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_ct_kfunc_set) +BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 9fb9b8031298..a7552a46d6ac 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -75,10 +75,11 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); - nf_ct_refresh(ct, skb, timeout * HZ); + nf_ct_refresh(ct, timeout * HZ); out: return NF_ACCEPT; } EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcast connection tracking helper"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 496c4920505b..0b95f226f211 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -96,8 +96,8 @@ static DEFINE_MUTEX(nf_conntrack_mutex); #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) #define GC_SCAN_EXPIRED_MAX (64000u / HZ) -#define MIN_CHAINLEN 8u -#define MAX_CHAINLEN (32u - MIN_CHAINLEN) +#define MIN_CHAINLEN 50u +#define MAX_CHAINLEN (80u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; @@ -136,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) } /* return true if we need to recompute hashes (in case hash table was resized) */ -static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, - unsigned int h2, unsigned int sequence) +static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2, + unsigned int sequence) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; @@ -211,24 +211,18 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, unsigned int zoneid, const struct net *net) { - u64 a, b, c, d; + siphash_key_t key; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, handle usable tuplehash members manually */ - a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; - b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3]; + key = nf_conntrack_hash_rnd; - c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; - c |= tuple->dst.protonum; + key.key[0] ^= zoneid; + key.key[1] ^= net_hash_mix(net); - d = (u64)zoneid << 32 | net_hash_mix(net); - - /* IPv4: u3.all[1,2,3] == 0 */ - c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; - d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; - - return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); + return siphash((void *)tuple, + offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend), + &key); } static u32 scale_hash(u32 hash) @@ -335,9 +329,6 @@ nf_ct_get_tuple(const 
struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: -#endif /* fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); default: @@ -511,10 +502,14 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { - pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); @@ -538,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) @@ -582,7 +575,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -603,7 +595,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); @@ -622,7 +613,7 @@ static void __nf_ct_delete_from_lists(struct nf_conn *ct) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); @@ -786,8 +777,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - rcu_read_lock(); - h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested @@ -799,7 +788,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) - goto found; + return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); @@ -807,8 +796,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, h = NULL; } -found: - rcu_read_unlock(); return h; } @@ -820,16 +807,21 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; + rcu_read_lock(); + thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) - return thash; + goto out_unlock; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, rid, net)); + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + +out_unlock: + rcu_read_unlock(); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -886,10 +878,8 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone = nf_ct_zone(ct); - if (!nf_ct_ext_valid_pre(ct->ext)) { - NF_CT_STAT_INC_ATOMIC(net, insert_failed); - return -ETIMEDOUT; - } + if (!nf_ct_ext_valid_pre(ct->ext)) + return -EAGAIN; local_bh_disable(); 
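
The hashing rework above replaces the hand-packed siphash_4u64() with a single siphash over the leading bytes of the tuple, up to the new dst.__nfct_hash_offsetend marker, and folds zone and netns into the key instead of the data. A simplified model of that pattern (the struct and names are illustrative, not the real tuple layout):

	#include <linux/siphash.h>
	#include <linux/stddef.h>

	struct example_key {
		u32 a;		/* hashed */
		u32 b;		/* hashed; __nfct_hash_offsetend analogue */
		u32 dir;	/* not hashed: must not influence the hash */
	};

	/* seeded once with get_random_once() in real code */
	static siphash_key_t example_rnd;

	static u32 example_hash(const struct example_key *k, u16 zone, u32 net_mix)
	{
		siphash_key_t key = example_rnd;

		key.key[0] ^= zone;
		key.key[1] ^= net_mix;

		/* hash everything up to and including 'b' */
		return siphash(k, offsetofend(struct example_key, b), &key);
	}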
do { @@ -900,7 +890,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); @@ -924,6 +914,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto chaintoolong; } + /* If genid has changed, we can't insert anymore because ct + * extensions could have stale pointers and nf_ct_iterate_destroy + * might have completed its table scan already. + * + * Increment of the ext genid right after this check is fine: + * nf_ct_iterate_destroy blocks until locks are released. + */ + if (!nf_ct_ext_valid_post(ct->ext)) { + err = -EAGAIN; + goto out; + } + smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); @@ -932,12 +934,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); - if (!nf_ct_ext_valid_post(ct->ext)) { - nf_ct_kill(ct); - NF_CT_STAT_INC_ATOMIC(net, drop); - return -ETIMEDOUT; - } - return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); @@ -992,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +/** + * nf_ct_match_reverse - check if ct1 and ct2 refer to an identical flow + * @ct1: conntrack in hash table to check against + * @ct2: merge candidate + * + * returns true if ct1 and ct2 happen to refer to the same flow, but + * in opposing directions, i.e. + * ct1: a:b -> c:d + * ct2: c:d -> a:b + * for both directions. If so, @ct2 should not have been created + * as the skb should have been picked up as an ESTABLISHED flow. + * But ct1 was not yet committed to the hash table before the skb that + * created ct2 arrived. + * + * Note we don't compare netns because ct entries in different net + * namespaces cannot clash to begin with. + * + * @return: true if ct1 and ct2 are identical when swapping origin/reply.
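
A self-contained userspace model of that reverse-match rule (simplified: the real check also compares the per-direction zone ids, as the function body below shows):

	#include <stdbool.h>
	#include <string.h>

	struct tup { unsigned int src, dst; unsigned short sport, dport; };
	struct entry { struct tup origin, reply; };

	static bool tup_eq(const struct tup *a, const struct tup *b)
	{
		return memcmp(a, b, sizeof(*a)) == 0;
	}

	static bool match_reverse(const struct entry *ct1, const struct entry *ct2)
	{
		return tup_eq(&ct1->origin, &ct2->reply) &&
		       tup_eq(&ct1->reply, &ct2->origin);
	}

	/* ct1: a:b -> c:d (reply c:d -> a:b). If ct2 was created from the
	 * reply direction before ct1 was committed, its origin is c:d -> a:b,
	 * match_reverse() returns true, and the clash can be merged rather
	 * than the packet dropped.
	 */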
+ */ +static bool +nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + u16 id1, id2; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) + return false; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); + if (id1 != id2) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); + + return id1 == id2; +} + +static int nf_ct_can_merge(const struct nf_conn *ct, + const struct nf_conn *loser_ct) +{ + return nf_ct_match(ct, loser_ct) || + nf_ct_match_reverse(ct, loser_ct); +} + /* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) @@ -1003,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, loser_ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_dying(ct)) - return NF_DROP; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { + if (nf_ct_can_merge(ct, loser_ct)) { struct net *net = nf_ct_net(ct); nf_conntrack_get(&ct->ct_general); @@ -1079,6 +1121,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + /* confirmed bit must be set after hlist add, not before: + * loser_ct can still be visible to other cpu due to + * SLAB_TYPESAFE_BY_RCU. + */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &loser_ct->status); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; @@ -1094,7 +1142,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. * - * If there is one, @skb (and the assocated, unconfirmed conntrack) has + * If there is one, @skb (and the associated, unconfirmed conntrack) has * to be dropped. In case @skb is retransmitted, next conntrack lookup * will find the already-existing entry. * @@ -1186,7 +1234,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related * connections for unconfirmed conns. But packet copies and @@ -1210,14 +1258,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } - pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ - ct->status |= IPS_CONFIRMED; - if (unlikely(nf_ct_is_dying(ct))) { NF_CT_STAT_INC(net, insert_failed); goto dying; @@ -1249,7 +1294,7 @@ chaintoolong: } } - /* Timer relative to confirmation time, not original + /* Timeout is relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
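
The ordering rule introduced here (link the entry into the hash first, set IPS_CONFIRMED with a barrier afterwards, and have lookups skip unconfirmed entries) can be modelled with C11 atomics; a sketch under that analogy, with illustrative names:

	#include <stdatomic.h>

	struct node {
		int payload;		/* must be fully visible ... */
		atomic_uint status;	/* ... before CONFIRMED is set */
	};

	#define CONFIRMED 0x1u

	static void publish(struct node *n, int payload)
	{
		n->payload = payload;
		/* link n into the shared structure here (RCU list add) */
		atomic_fetch_or_explicit(&n->status, CONFIRMED,
					 memory_order_release);
	}

	static int reader_sees(struct node *n)
	{
		if (!(atomic_load_explicit(&n->status, memory_order_acquire)
		      & CONFIRMED))
			return -1;	/* not (yet) in hash: skip, as lookups do */
		return n->payload;
	}

The kernel variant uses smp_mb__before_atomic() because set_bit() itself implies no ordering; the release/acquire pair plays the same role in this model.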
*/ ct->timeout += nfct_time_stamp; @@ -1257,11 +1302,21 @@ chaintoolong: __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after - * starting the timer and setting the CONFIRMED bit. The RCU barriers - * guarantee that no other CPU can find the conntrack before the above - * stores are visible. + * setting ct->timeout. The RCU barriers guarantee that no other CPU + * can find the conntrack before the above stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); + + /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups + * skip entries that lack this bit. This happens when a CPU is looking + * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU + * or when another CPU encounters this entry right after the insertion + * but before the set-confirm-bit below. This bit must not be set until + * after __nf_conntrack_hash_insert(). + */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); @@ -1292,7 +1347,7 @@ dying: } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); -/* Returns true if a connection correspondings to the tuple (required +/* Returns true if a connection corresponds to the tuple (required for NAT). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, @@ -1374,9 +1429,6 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) - continue; - if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -1446,11 +1498,12 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct) static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + u8 protonum = nf_ct_protonum(ct); if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(protonum); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1505,11 +1558,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { - nf_ct_offload_timeout(tmp); - continue; - } - if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); @@ -1620,12 +1668,16 @@ __nf_conntrack_alloc(struct net *net, /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { + if (unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; atomic_dec(&cnet->count); - net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + if (net == &init_net) + net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + else + net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n", + net->ns.inum); return ERR_PTR(-ENOMEM); } } @@ -1721,16 +1773,14 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; - if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { - pr_debug("Can't invert tuple.\n"); + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; - } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) - return (struct nf_conntrack_tuple_hash *)ct; + 
return ERR_CAST(ct); if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); @@ -1762,10 +1812,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { - pr_debug("expectation arrives ct=%p exp=%p\n", - ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ @@ -1829,10 +1877,8 @@ resolve_normal_ct(struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple)) { - pr_debug("Can't get tuple\n"); + &tuple)) return 0; - } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); @@ -1864,17 +1910,15 @@ resolve_normal_ct(struct nf_conn *tmpl, if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { + unsigned long status = READ_ONCE(ct->status); + /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("normal packet for %p\n", ct); + if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("related packet for %p\n", ct); + else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; - } else { - pr_debug("new packet for %p\n", ct); + else ctinfo = IP_CT_NEW; - } } nf_ct_set(skb, ct, ctinfo); return 0; @@ -1953,11 +1997,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct, return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: - return nf_conntrack_dccp_packet(ct, skb, dataoff, - ctinfo, state); -#endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, @@ -1988,7 +2027,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { - pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; @@ -2027,7 +2065,6 @@ repeat: if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a @@ -2038,7 +2075,7 @@ repeat: goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) + if (ret == NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; @@ -2056,31 +2093,11 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_in); -/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is - implicitly racy: see __nf_conntrack_confirm */ -void nf_conntrack_alter_reply(struct nf_conn *ct, - const struct nf_conntrack_tuple *newreply) -{ - struct nf_conn_help *help = nfct_help(ct); - - /* Should be unconfirmed, so not in hash table yet */ - WARN_ON(nf_ct_is_confirmed(ct)); - - pr_debug("Altering reply tuple of %p to ", ct); - nf_ct_dump_tuple(newreply); - - ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (ct->master || (help && !hlist_empty(&help->expectations))) - return; -} -EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); - /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, u32 extra_jiffies, - bool do_acct) + unsigned int bytes) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) @@ -2093,8 +2110,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: - if (do_acct) - nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); + if (bytes) + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -2186,74 +2203,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - const struct nf_nat_hook *nat_hook; - struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - unsigned int status; - int dataoff; - u16 l3num; - u8 l4num; - - l3num = nf_ct_l3num(ct); - - dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); - if (dataoff <= 0) - return -1; - - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple)) - return -1; - - if (ct->status & IPS_SRC_NAT) { - memcpy(tuple.src.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, - sizeof(tuple.src.u3.all)); - tuple.src.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; - } - - if (ct->status & IPS_DST_NAT) { - memcpy(tuple.dst.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, - sizeof(tuple.dst.u3.all)); - tuple.dst.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; - } - - h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); - if (!h) - return 0; - - /* Store status bits of the conntrack that is clashing to re-do NAT - * mangling according to what it has been done already to this packet. - */ - status = ct->status; - - nf_ct_put(ct); - ct = nf_ct_tuplehash_to_ctrack(h); - nf_ct_set(skb, ct, ctinfo); - - nat_hook = rcu_dereference(nf_nat_hook); - if (!nat_hook) - return 0; - - if (status & IPS_SRC_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; - - if (status & IPS_DST_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; - - return 0; -} - /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). 
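
With the skb argument dropped from the refresh path, a helper body reduces to the pattern below. This is a hypothetical helper, and it assumes the nf_ct_refresh_acct() wrapper keeps its skb parameter and forwards skb->len as the new byte count, which is not shown in this hunk:

	#include <linux/netfilter.h>
	#include <net/netfilter/nf_conntrack.h>

	static int example_help(struct sk_buff *skb, unsigned int protoff,
				struct nf_conn *ct, enum ip_conntrack_info ctinfo)
	{
		/* keep the connection alive for two more minutes */
		nf_ct_refresh(ct, 120 * HZ);

		/* or refresh and account skb->len bytes in one call */
		nf_ct_refresh_acct(ct, ctinfo, skb, 120 * HZ);

		return NF_ACCEPT;
	}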
*/ @@ -2266,11 +2215,14 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, help = nfct_help(ct); if (!help) - return 0; + return NF_ACCEPT; helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) - return 0; + return NF_ACCEPT; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: @@ -2285,43 +2237,34 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return 0; + return NF_ACCEPT; break; } #endif default: - return 0; + return NF_ACCEPT; } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return -1; + return NF_DROP; } } /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; + return nf_conntrack_confirm(skb); } static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; - int err; ct = nf_ct_get(skb, &ctinfo); if (!ct) - return 0; - - if (!nf_ct_is_confirmed(ct)) { - err = __nf_conntrack_update(net, skb, ct, ctinfo); - if (err < 0) - return err; - - ct = nf_ct_get(skb, &ctinfo); - } + return NF_ACCEPT; return nf_confirm_cthelper(skb, ct, ctinfo); } @@ -2552,7 +2495,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) * netfilter framework. Roll on, two-stage module * delete... */ - synchronize_net(); + synchronize_rcu_expedited(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { @@ -2580,12 +2523,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) @@ -2761,11 +2707,25 @@ err_cachep: return ret; } +static void nf_conntrack_set_closing(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = nf_ct_to_nf_conn(nfct); + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + nf_conntrack_tcp_set_closing(ct); + break; + } +} + static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, + .set_closing = nf_conntrack_set_closing, + .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 8698b3424646..81baf2082604 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -162,6 +162,14 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, return ret; } +static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (local64_read(&e->timestamp)) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif +} + int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, 
u32 portid, int report) { @@ -186,6 +194,8 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; + nf_ct_ecache_tstamp_refresh(e); + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, @@ -291,12 +301,24 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } } +static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + u64 ts = 0; + + if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + ts = ktime_get_real_ns(); + + local64_set(&e->timestamp, ts); +#endif +} + bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); @@ -309,7 +331,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ - if (!READ_ONCE(net->ct.ctnetlink_has_listener)) + if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: @@ -326,6 +348,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { + nf_ct_ecache_tstamp_new(ct, e); e->ctmask = ctmask; e->expmask = expmask; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..cfc2daa3fc7f 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { - struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); + struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); @@ -118,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,14 +211,14 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; - } else if (del_timer(&exp->timeout)) { + } else if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } - /* Undo exp->master refcnt increase, if del_timer() failed */ + /* Undo exp->master refcnt increase, if timer_delete() failed */ nf_ct_put(exp->master); return NULL; @@ -520,7 +520,7 @@ void 
nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } @@ -550,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net, if (!net_eq(nf_ct_exp_net(exp), net)) continue; - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } @@ -722,9 +722,7 @@ int nf_conntrack_expect_init(void) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL); + nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0); if (!nf_ct_expect_cachep) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 0b513f7bf9f3..dd62cc12e775 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -40,10 +40,10 @@ static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache), #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP - [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_acct), + [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp), #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT - [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_tstamp), + [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout), #endif #ifdef CONFIG_NF_CONNTRACK_LABELS [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels), diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index e697a824b001..540d97715bd2 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -533,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get fields bitmap */ if (nf_h323_error_boundary(bs, 0, f->sz)) return H323_ERROR_BOUND; + if (f->sz > 32) + return H323_ERROR_RANGE; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -589,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, bmp2_len = get_bits(bs, 7) + 1; if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; + if (bmp2_len > 32) + return H323_ERROR_RANGE; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5a9bce24f3c3..14f73872f647 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, if (info->timeout > 0) { pr_debug("nf_ct_ras: set RAS connection timeout to " "%u seconds\n", info->timeout); - nf_ct_refresh(ct, skb, info->timeout * HZ); + nf_ct_refresh(ct, info->timeout * HZ); /* Set expect timeout */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, skb, 30 * HZ); + nf_ct_refresh(ct, 30 * HZ); return 0; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 48ea6d0264b5..ceb48c3ca0a4 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -194,12 +194,7 @@ int 
__nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - /* We already got a helper explicitly attached. The function - * nf_conntrack_alter_reply - in case NAT is in use - asks for looking - * the helper up again. Since now the user is in full control of - * making consistent helper configurations, skip this automatic - * re-lookup, otherwise we'll lose the helper. - */ + /* We already got a helper explicitly attached (e.g. nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; @@ -242,104 +237,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); -/* 'skb' should already be pulled to nh_ofs. */ -int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, u16 proto) -{ - const struct nf_conntrack_helper *helper; - const struct nf_conn_help *help; - unsigned int protoff; - int err; - - if (ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - if (helper->tuple.src.l3num != NFPROTO_UNSPEC && - helper->tuple.src.l3num != proto) - return NF_ACCEPT; - - switch (proto) { - case NFPROTO_IPV4: - protoff = ip_hdrlen(skb); - proto = ip_hdr(skb)->protocol; - break; - case NFPROTO_IPV6: { - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int ofs; - - ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - protoff = ofs; - proto = nexthdr; - break; - } - default: - WARN_ONCE(1, "helper invoked on non-IP family!"); - return NF_DROP; - } - - if (helper->tuple.dst.protonum != proto) - return NF_ACCEPT; - - err = helper->help(skb, protoff, ct, ctinfo); - if (err != NF_ACCEPT) - return err; - - /* Adjust seqs after helper. This is needed due to some helpers (e.g., - * FTP with NAT) adusting the TCP payload size when mangling IP - * addresses and/or port numbers in the text-based control connection. 
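
A registration sketch for the changed error conventions above (-EBUSY instead of -EEXIST on a clashing tuple, -ENOENT once the helper hash has been torn down); the module and helper names are illustrative and the remaining helper fields are assumed to be filled in as usual:

	#include <linux/module.h>
	#include <net/netfilter/nf_conntrack_helper.h>

	static struct nf_conntrack_helper example_helper = {
		.name = "example",
		/* .me, .tuple, .help, .expect_policy, ... as for any helper */
	};

	static int __init example_init(void)
	{
		int ret = nf_conntrack_helper_register(&example_helper);

		if (ret == -EBUSY)
			pr_err("example: helper tuple already registered\n");
		else if (ret == -ENOENT)
			pr_err("example: helper hash already torn down\n");
		return ret;
	}
	module_init(example_init);

	MODULE_LICENSE("GPL");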
- */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) - return NF_DROP; - return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(nf_ct_helper); - -int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, - u8 proto, bool nat, struct nf_conntrack_helper **hp) -{ - struct nf_conntrack_helper *helper; - struct nf_conn_help *help; - int ret = 0; - - helper = nf_conntrack_helper_try_module_get(name, family, proto); - if (!helper) - return -EINVAL; - - help = nf_ct_helper_ext_add(ct, GFP_KERNEL); - if (!help) { - nf_conntrack_helper_put(helper); - return -ENOMEM; - } -#if IS_ENABLED(CONFIG_NF_NAT) - if (nat) { - ret = nf_nat_helper_try_module_get(name, family, proto); - if (ret) { - nf_conntrack_helper_put(helper); - return ret; - } - } -#endif - rcu_assign_pointer(help->helper, helper); - *hp = helper; - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_add_helper); - /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { @@ -458,6 +355,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (!nf_ct_helper_hash) + return -ENOENT; + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; @@ -468,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -479,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -613,4 +513,5 @@ int nf_conntrack_helper_init(void) void nf_conntrack_helper_fini(void) { kvfree(nf_ct_helper_hash); + nf_ct_helper_hash = NULL; } diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 6e70e137a0a6..6c46aad23313 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -11,8 +11,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> -static DEFINE_SPINLOCK(nf_connlabels_lock); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -60,23 +58,24 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { + int v; + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; - spin_lock(&nf_connlabels_lock); - net->ct.labels_used++; - spin_unlock(&nf_connlabels_lock); - BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + v = atomic_inc_return_relaxed(&net->ct.labels_used); + WARN_ON_ONCE(v <= 0); + return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_get); void nf_connlabels_put(struct net *net) { - spin_lock(&nf_connlabels_lock); - net->ct.labels_used--; - spin_unlock(&nf_connlabels_lock); + int v = atomic_dec_return_relaxed(&net->ct.labels_used); + + WARN_ON_ONCE(v < 0); } EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1286ae7d4609..3a04665adf99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -57,9 +57,10 @@ #include "nf_internals.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("List and change 
connection tracking table"); struct ctnetlink_list_dump_ctx { - struct nf_conn *last; + unsigned long last_id; unsigned int cpu; bool done; }; @@ -176,7 +177,12 @@ nla_put_failure: static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { - long timeout = nf_ct_expires(ct) / HZ; + long timeout; + + if (nf_ct_is_confirmed(ct)) + timeout = nf_ct_expires(ct) / HZ; + else + timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; @@ -328,11 +334,12 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct, + bool dump) { u32 mark = READ_ONCE(ct->mark); - if (!mark) + if (!mark && !dump) return 0; if (nla_put_be32(skb, CTA_MARK, htonl(mark))) @@ -343,18 +350,18 @@ nla_put_failure: return -1; } #else -#define ctnetlink_dump_mark(a, b) (0) +#define ctnetlink_dump_mark(a, b, c) (0) #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return 0; ret = -1; @@ -362,20 +369,37 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif -#ifdef CONFIG_NF_CONNTRACK_LABELS +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + + if (e) { + u64 ts = local64_read(&e->timestamp); + + if (ts) + return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, + cpu_to_be64(ts), CTA_TIMESTAMP_PAD); + } +#endif + return 0; +} + static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -384,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) return 0; return nla_total_size(sizeof(labels->bits)); } +#endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) @@ -404,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -#else -#define ctnetlink_dump_labels(a, b) (0) -#define ctnetlink_label_size(a) (0) -#endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -548,7 +569,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct, true) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -645,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } -#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { @@ -660,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct 
nf_conn *ct) static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK - int len, ret; + int ret; - ret = security_secid_to_secctx(ct->secmark, NULL, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, NULL); + if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ - + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ + + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif @@ -683,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) return 0; #endif } +#endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) @@ -713,6 +734,9 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ +#endif ; } @@ -831,10 +855,13 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - if (events & (1 << IPCT_MARK) && - ctnetlink_dump_mark(skb, ct) < 0) + if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif + + if (ctnetlink_dump_event_timestamp(skb, ct)) + goto nla_put_failure; + nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); @@ -857,8 +884,6 @@ errout: static int ctnetlink_done(struct netlink_callback *cb) { - if (cb->args[1]) - nf_ct_put((struct nf_conn *)cb->args[1]); kfree(cb->data); return 0; } @@ -870,6 +895,7 @@ struct ctnetlink_filter_u32 { struct ctnetlink_filter { u8 family; + bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; @@ -986,13 +1012,16 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) if (err) goto err_filter; + if (cda[CTA_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + goto err_filter; + filter->zone_filter = true; + } + if (!cda[CTA_FILTER]) return filter; - err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); - if (err < 0) - goto err_filter; - err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) goto err_filter; @@ -1037,7 +1066,7 @@ err_filter: static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1142,6 +1171,10 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->zone_filter && + !nf_ct_zone_equal_any(ct, &filter->zone)) + goto ignore_entry; + if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, @@ -1173,19 +1206,26 @@ ignore_entry: return 0; } +static unsigned long ctnetlink_get_id(const struct nf_conn *ct) +{ + unsigned long id = nf_ct_get_id(ct); + + return id ? id : 1; +} + static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? 
NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); - struct nf_conn *ct, *last; + unsigned long last_id = cb->args[1]; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; + struct nf_conn *ct; int res, i; spinlock_t *lockp; - last = (struct nf_conn *)cb->args[1]; i = 0; local_bh_disable(); @@ -1222,7 +1262,7 @@ restart: continue; if (cb->args[1]) { - if (ct != last) + if (ctnetlink_get_id(ct) != last_id) continue; cb->args[1] = 0; } @@ -1235,8 +1275,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; + cb->args[1] = ctnetlink_get_id(ct); spin_unlock(lockp); goto out; } @@ -1249,12 +1288,10 @@ restart: } out: local_bh_enable(); - if (last) { + if (last_id) { /* nf ct hash resize happened, now clear the leftover. */ - if ((struct nf_conn *)cb->args[1] == last) + if (cb->args[1] == last_id) cb->args[1] = 0; - - nf_ct_put(last); } while (i) { @@ -1316,15 +1353,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, + cta_ip_nla_policy, NULL); if (ret < 0) return ret; - ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); - if (ret) - return ret; - switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); @@ -1550,13 +1583,11 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, [CTA_STATUS_MASK] = { .type = NLA_U32 }, + [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) { - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) - return 0; - return ctnetlink_filter_match(ct, data); } @@ -1572,9 +1603,6 @@ static int ctnetlink_flush_conntrack(struct net *net, }; if (ctnetlink_needs_filter(family, cda)) { - if (cda[CTA_FILTER]) - return -EOPNOTSUPP; - filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1603,14 +1631,14 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (err < 0) return err; - if (cda[CTA_TUPLE_ORIG]) + if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, family, &zone); - else if (cda[CTA_TUPLE_REPLY]) + else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, family, &zone); else { - u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; + u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? 
family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, @@ -1626,11 +1654,6 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, ct = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { - nf_ct_put(ct); - return -EBUSY; - } - if (cda[CTA_ID]) { __be32 id = nla_get_be32(cda[CTA_ID]); @@ -1710,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } -static int ctnetlink_done_list(struct netlink_callback *cb) -{ - struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - - if (ctx->last) - nf_ct_put(ctx->last); - - return 0; -} - #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, @@ -1734,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; - if (ctx->last) { - if (ct != ctx->last) + if (ctx->last_id) { + if (ctnetlink_get_id(ct) != ctx->last_id) return 0; - ctx->last = NULL; + ctx->last_id = 0; } /* We can't dump extension info for the unconfirmed @@ -1752,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); - if (res < 0) { - if (!refcount_inc_not_zero(&ct->ct_general.use)) - return 0; - - ctx->last = ct; - } + if (res < 0) + ctx->last_id = ctnetlink_get_id(ct); return res; } @@ -1773,10 +1782,10 @@ static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - struct nf_conn *last = ctx->last; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; + unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif @@ -1784,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->done) return 0; - ctx->last = NULL; + ctx->last_id = 0; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); @@ -1795,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) int res; ct = nf_ct_tuplehash_to_ctrack(h); - if (last && last != ct) + if (last_id && last_id != ctnetlink_get_id(ct)) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); - nf_ct_put(last); return skb->len; } - nf_ct_put(last); - last = NULL; + last_id = 0; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; - nf_ct_put(last); return skb->len; } @@ -1824,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -1839,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -2015,7 +2019,6 @@ static void ctnetlink_change_mark(struct nf_conn *ct, static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, - [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; @@ -2253,9 +2256,6 @@ 
ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - __nf_ct_set_timeout(ct, timeout); - rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; @@ -2319,6 +2319,9 @@ ctnetlink_create_conntrack(struct net *net, /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + __nf_ct_set_timeout(ct, timeout); + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) @@ -2375,12 +2378,15 @@ ctnetlink_create_conntrack(struct net *net, err = nf_conntrack_hash_check_insert(ct); if (err < 0) - goto err2; + goto err3; rcu_read_unlock(); return ct; +err3: + if (ct->master) + nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: @@ -2735,7 +2741,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - if (ctnetlink_dump_mark(skb, ct) < 0) + if (ctnetlink_dump_mark(skb, ct, true) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) @@ -2976,7 +2982,9 @@ nla_put_failure: return -1; } +#if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; +#endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { @@ -3143,23 +3151,27 @@ errout: return 0; } #endif -static int ctnetlink_exp_done(struct netlink_callback *cb) + +static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) { - if (cb->args[1]) - nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); - return 0; + unsigned long id = (unsigned long)exp; + + id += nf_ct_get_id(exp->master); + id += exp->class; + + return id ? id : 1; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], @@ -3171,7 +3183,7 @@ restart: continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3180,9 +3192,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3193,32 +3203,30 @@ restart: } out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; if (cb->args[0]) return 0; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; + restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3226,9 +3234,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if 
(!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3239,9 +3245,6 @@ restart: cb->args[0] = 1; out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } @@ -3260,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, - .done = ctnetlink_exp_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, @@ -3310,7 +3312,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb, else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -3413,7 +3414,8 @@ static int ctnetlink_del_expect(struct sk_buff *skb, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } @@ -3421,7 +3423,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_expect_put(exp); @@ -3450,7 +3452,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { - if (!del_timer(&x->timeout)) + if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + @@ -3460,10 +3462,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, return 0; } +#if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; +#endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, @@ -3866,7 +3870,7 @@ static int __init ctnetlink_init(void) { int ret; - BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx)); + NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c new file mode 100644 index 000000000000..068e9489e1c2 --- /dev/null +++ b/net/netfilter/nf_conntrack_ovs.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support ct functions for openvswitch and used by OVS and TC conntrack. */ + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/ipv6_frag.h> +#include <net/ip.h> +#include <linux/netfilter_ipv6.h> + +/* 'skb' should already be pulled to nh_ofs. 
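Aside: both dumpers above stop parking a refcounted object pointer in cb->args[]/ctx and instead remember a synthetic resume id (ctnetlink_get_id()/ctnetlink_exp_id()), so no conntrack or expectation stays pinned between netlink callbacks. A minimal userspace sketch of the same resume-by-id idea (all names invented):

#include <stdio.h>

struct entry { unsigned long id; const char *name; };

/* Emit at most 'batch' entries per call; *resume_id remembers the id of
 * the first entry that did not fit. If that entry vanishes between
 * calls the walk simply ends -- which is also why ctnetlink clears the
 * leftover id after a hash resize. */
static size_t dump_batch(const struct entry *tbl, size_t n,
                         unsigned long *resume_id, size_t batch)
{
        size_t dumped = 0;

        for (size_t i = 0; i < n; i++) {
                if (*resume_id) {
                        if (tbl[i].id != *resume_id)
                                continue;    /* skip forward */
                        *resume_id = 0;      /* found our place */
                }
                if (dumped == batch) {
                        *resume_id = tbl[i].id;  /* remember, don't pin */
                        return dumped;
                }
                printf("%lu %s\n", tbl[i].id, tbl[i].name);
                dumped++;
        }
        return dumped;
}

int main(void)
{
        const struct entry tbl[] = { {1, "a"}, {2, "b"}, {3, "c"}, {4, "d"} };
        unsigned long resume = 0;

        while (dump_batch(tbl, 4, &resume, 2) && resume)
                ;
        return 0;
}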
*/ +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + int err; + + if (ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + if (helper->tuple.src.l3num != NFPROTO_UNSPEC && + helper->tuple.src.l3num != proto) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + proto = ip_hdr(skb)->protocol; + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + proto = nexthdr; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + if (helper->tuple.dst.protonum != proto) + return NF_ACCEPT; + + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adjusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_ct_helper); + +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + int ret = 0; + + helper = nf_conntrack_helper_try_module_get(name, family, proto); + if (!helper) + return -EINVAL; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (!help) { + nf_conntrack_helper_put(helper); + return -ENOMEM; + } +#if IS_ENABLED(CONFIG_NF_NAT) + if (nat) { + ret = nf_nat_helper_try_module_get(name, family, proto); + if (ret) { + nf_conntrack_helper_put(helper); + return ret; + } + } +#endif + rcu_assign_pointer(help->helper, helper); + *hp = helper; + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_add_helper); + +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. + */ +int nf_ct_skb_network_trim(struct sk_buff *skb, int family) +{ + unsigned int len; + + switch (family) { + case NFPROTO_IPV4: + len = skb_ip_totlen(skb); + break; + case NFPROTO_IPV6: + len = ntohs(ipv6_hdr(skb)->payload_len); + if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) { + int err = nf_ip6_check_hbh_len(skb, &len); + + if (err) + return err; + } + len += sizeof(struct ipv6hdr); + break; + default: + len = skb->len; + } + + return pskb_trim_rcsum(skb, len); +} +EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim); + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed.
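Aside: a concrete (made-up) case of the trim just defined: a minimum-size Ethernet frame pads its payload to 46 bytes, so a 40-byte IPv4 datagram arrives with 6 trailing pad bytes that must not be fed to checksumming or L4 parsing. A self-contained toy model:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        size_t skb_len = 46;    /* minimum Ethernet payload, pad included */
        uint16_t tot_len = 40;  /* what the IPv4 header claims (tot_len) */

        /* nf_ct_skb_network_trim() analogue: keep only what the network
         * header claims, never grow the buffer. */
        size_t keep = tot_len < skb_len ? tot_len : skb_len;

        printf("trim %zu -> %zu bytes (%zu pad bytes dropped)\n",
               skb_len, keep, skb_len - keep);
        return 0;
}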
+ */ +int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u16 zone, u8 family, u8 *proto, u16 *mru) +{ + int err; + + if (family == NFPROTO_IPV4) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(net, skb, user); + local_bh_enable(); + if (err) + return err; + + *mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (family == NFPROTO_IPV6) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err) { + if (err != -EINPROGRESS) + kfree_skb(skb); + return err; + } + + *proto = ipv6_hdr(skb)->nexthdr; + *mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + skb_clear_hash(skb); + skb->ignore_df = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_handle_fragments); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ccef340be575..bc1d96686b9c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif @@ -284,16 +281,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) /* We only do TCP and SCTP at the moment: is there a better way? 
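Aside: getorigdst(), whose debug noise is being removed here, is the kernel side of the SO_ORIGINAL_DST getsockopt. A sketch of the userspace consumer, as a transparent proxy behind an iptables REDIRECT rule would use it (function name invented; only TCP and SCTP sockets qualify, per the protonum check in this function):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>   /* SO_ORIGINAL_DST */
#include <stdio.h>
#include <sys/socket.h>

/* 'fd' is a connection the proxy accepted; conntrack still remembers
 * the destination the client originally dialed before the REDIRECT. */
int print_original_dst(int fd)
{
        struct sockaddr_in orig;
        socklen_t len = sizeof(orig);
        char buf[INET_ADDRSTRLEN];

        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0) {
                perror("SO_ORIGINAL_DST");
                return -1;
        }
        printf("client really wanted %s:%u\n",
               inet_ntop(AF_INET, &orig.sin_addr, buf, sizeof(buf)),
               ntohs(orig.sin_port));
        return 0;
}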
*/ if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; - } - if ((unsigned int)*len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); + if ((unsigned int)*len < sizeof(struct sockaddr_in)) return -EINVAL; - } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { @@ -307,17 +299,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } @@ -360,12 +347,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + if (!h) return -ENOENT; - } ct = nf_ct_tuplehash_to_ctrack(h); @@ -695,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #if IS_ENABLED(CONFIG_IPV6) nf_conntrack_icmpv6_init_net(net); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - nf_conntrack_dccp_init_net(net); -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP nf_conntrack_sctp_init_net(net); #endif @@ -713,3 +693,4 @@ MODULE_ALIAS("ip_conntrack"); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking"); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c deleted file mode 100644 index c1557d47ccd1..000000000000 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ /dev/null @@ -1,778 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DCCP connection tracking protocol helper - * - * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sysctl.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/dccp.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/netns/generic.h> - -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_conntrack_timeout.h> -#include <net/netfilter/nf_log.h> - -/* Timeouts are based on values from RFC4340: - * - * - REQUEST: - * - * 8.1.2. Client Request - * - * A client MAY give up on its DCCP-Requests after some time - * (3 minutes, for example). - * - * - RESPOND: - * - * 8.1.3. Server Response - * - * It MAY also leave the RESPOND state for CLOSED after a timeout of - * not less than 4MSL (8 minutes); - * - * - PARTOPEN: - * - * 8.1.5. Handshake Completion - * - * If the client remains in PARTOPEN for more than 4MSL (8 minutes), - * it SHOULD reset the connection with Reset Code 2, "Aborted". - * - * - OPEN: - * - * The DCCP timestamp overflows after 11.9 hours. 
If the connection - * stays idle this long the sequence number won't be recognized - * as valid anymore. - * - * - CLOSEREQ/CLOSING: - * - * 8.3. Termination - * - * The retransmission timer should initially be set to go off in two - * round-trip times and should back off to not less than once every - * 64 seconds ... - * - * - TIMEWAIT: - * - * 4.3. States - * - * A server or client socket remains in this state for 2MSL (4 minutes) - * after the connection has been town down, ... - */ - -#define DCCP_MSL (2 * 60 * HZ) - -static const char * const dccp_state_names[] = { - [CT_DCCP_NONE] = "NONE", - [CT_DCCP_REQUEST] = "REQUEST", - [CT_DCCP_RESPOND] = "RESPOND", - [CT_DCCP_PARTOPEN] = "PARTOPEN", - [CT_DCCP_OPEN] = "OPEN", - [CT_DCCP_CLOSEREQ] = "CLOSEREQ", - [CT_DCCP_CLOSING] = "CLOSING", - [CT_DCCP_TIMEWAIT] = "TIMEWAIT", - [CT_DCCP_IGNORE] = "IGNORE", - [CT_DCCP_INVALID] = "INVALID", -}; - -#define sNO CT_DCCP_NONE -#define sRQ CT_DCCP_REQUEST -#define sRS CT_DCCP_RESPOND -#define sPO CT_DCCP_PARTOPEN -#define sOP CT_DCCP_OPEN -#define sCR CT_DCCP_CLOSEREQ -#define sCG CT_DCCP_CLOSING -#define sTW CT_DCCP_TIMEWAIT -#define sIG CT_DCCP_IGNORE -#define sIV CT_DCCP_INVALID - -/* - * DCCP state transition table - * - * The assumption is the same as for TCP tracking: - * - * We are the man in the middle. All the packets go through us but might - * get lost in transit to the destination. It is assumed that the destination - * can't receive segments we haven't seen. - * - * The following states exist: - * - * NONE: Initial state, expecting Request - * REQUEST: Request seen, waiting for Response from server - * RESPOND: Response from server seen, waiting for Ack from client - * PARTOPEN: Ack after Response seen, waiting for packet other than Response, - * Reset or Sync from server - * OPEN: Packet other than Response, Reset or Sync seen - * CLOSEREQ: CloseReq from server seen, expecting Close from client - * CLOSING: Close seen, expecting Reset - * TIMEWAIT: Reset seen - * IGNORE: Not determinable whether packet is valid - * - * Some states exist only on one side of the connection: REQUEST, RESPOND, - * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to - * the one it was in before. - * - * Packets are marked as ignored (sIG) if we don't know if they're valid - * (for example a reincarnation of a connection we didn't notice is dead - * already) and the server may send back a connection closing Reset or a - * Response. They're also used for Sync/SyncAck packets, which we don't - * care about. - */ -static const u_int8_t -dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { - [CT_DCCP_ROLE_CLIENT] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sRQ Regular Request - * sRQ -> sRQ Retransmitted Request or reincarnation - * sRS -> sRS Retransmitted Request (apparently Response - * got lost after we saw it) or reincarnation - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation - * - * sNO, sRQ, sRS, sPO. 
sOP, sCR, sCG, sTW, */ - sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, might be response to ignored Request - * sRS -> sIG Ignore, might be response to ignored Request - * sPO -> sIG Ignore, might be response to ignored Request - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, reincarnation in reverse direction - * goes through sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN - * sOP -> sOP Regular ACK, remain in OPEN - * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) - * sOP -> sOP Regular Data packet - * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Remain in PARTOPEN state - * sOP -> sOP Regular DataAck packet in OPEN state - * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * CLOSEREQ may only be sent by the server. - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sCG Client-initiated close - * sOP -> sCG Client-initiated close - * sCR -> sCG Close in response to CloseReq (8.3.) - * sCG -> sCG Retransmit - * sTW -> sIV Late retransmit, already in TIME_WAIT - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) - * sRS -> sTW Response received without Request - * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) 
- * sOP -> sTW Connection reset - * sCR -> sTW Connection reset - * sCG -> sTW Connection reset - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, - [CT_DCCP_ROLE_SERVER] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, conntrack might be out of sync - * sRS -> sIG Ignore, conntrack might be out of sync - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation, must reverse roles - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Response without Request - * sRQ -> sRS Response to clients Request - * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) - * sPO -> sIG Response to an ignored Request or late retransmit - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, Request from client in sTW moves to sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Ack in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Data packet in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular DataAck in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) 
- * sOP -> sCR CloseReq in OPEN state - * sCR -> sCR Retransmit - * sCG -> sCR Simultaneous close, client sends another Close - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCG Move direcly to CLOSING - * sOP -> sCG Move to CLOSING - * sCR -> sIV Close after CloseReq is invalid - * sCG -> sCG Retransmit - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Reset in response to Request - * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sOP -> sTW - * sCR -> sTW - * sCG -> sTW - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, -}; - -static noinline bool -dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh, - const struct nf_hook_state *hook_state) -{ - struct net *net = nf_ct_net(ct); - struct nf_dccp_net *dn; - const char *msg; - u_int8_t state; - - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; - switch (state) { - default: - dn = nf_dccp_pernet(net); - if (dn->dccp_loose == 0) { - msg = "not picking up existing connection "; - goto out_invalid; - } - break; - case CT_DCCP_REQUEST: - break; - case CT_DCCP_INVALID: - msg = "invalid state transition "; - goto out_invalid; - } - - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.state = CT_DCCP_NONE; - ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; - ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; - ct->proto.dccp.handshake_seq = 0; - return true; - -out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); - return false; -} - -static u64 dccp_ack_seq(const struct dccp_hdr *dh) -{ - const struct dccp_hdr_ack_bits *dhack; - - dhack = (void *)dh + __dccp_basic_hdr_len(dh); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + - ntohl(dhack->dccph_ack_nr_low); -} - -static bool dccp_error(const struct dccp_hdr *dh, - struct sk_buff *skb, unsigned int dataoff, - const struct nf_hook_state *state) -{ - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - - if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (state->hook == NF_INET_PRE_ROUTING && - state->net->ct.sysctl_checksum && - nf_checksum_partial(skb, state->hook, dataoff, cscov, - IPPROTO_DCCP, state->pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - if (dh->dccph_type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto 
out_invalid; - } - return false; -out_invalid: - nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); - return true; -} - -int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct dccp_hdr _dh, *dh; - u_int8_t type, old_state, new_state; - enum ct_dccp_roles role; - unsigned int *timeouts; - - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - if (!dh) - return NF_DROP; - - if (dccp_error(dh, skb, dataoff, state)) - return -NF_ACCEPT; - - type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) - return -NF_ACCEPT; - - if (type == DCCP_PKT_RESET && - !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - /* Tear down connection immediately if only reply is a RESET */ - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_ACCEPT; - } - - spin_lock_bh(&ct->lock); - - role = ct->proto.dccp.role[dir]; - old_state = ct->proto.dccp.state; - new_state = dccp_state_table[role][type][old_state]; - - switch (new_state) { - case CT_DCCP_REQUEST: - if (old_state == CT_DCCP_TIMEWAIT && - role == CT_DCCP_ROLE_SERVER) { - /* Reincarnation in the reverse direction: reopen and - * reverse client/server roles. */ - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; - } - break; - case CT_DCCP_RESPOND: - if (old_state == CT_DCCP_REQUEST) - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - break; - case CT_DCCP_PARTOPEN: - if (old_state == CT_DCCP_RESPOND && - type == DCCP_PKT_ACK && - dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) - set_bit(IPS_ASSURED_BIT, &ct->status); - break; - case CT_DCCP_IGNORE: - /* - * Connection tracking might be out of sync, so we ignore - * packets that might establish a new connection and resync - * if the server responds with a valid Response. 
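Aside: for orientation while reading the deleted tracker, lookups go dccp_state_table[role][packet type][old state]. Tracing a normal handshake through the tables above (derived from the removed code, shown only for reference):

/*   client Request : [CLIENT][DCCP_PKT_REQUEST][sNO]  -> sRQ
 *   server Response: [SERVER][DCCP_PKT_RESPONSE][sRQ] -> sRS
 *   client Ack     : [CLIENT][DCCP_PKT_ACK][sRS]      -> sPO (PARTOPEN)
 *   server DataAck : [SERVER][DCCP_PKT_DATAACK][sPO]  -> sOP (OPEN)
 *
 * The Ack taking sRS -> sPO is the one matched against handshake_seq
 * in the CT_DCCP_PARTOPEN case above to set IPS_ASSURED.
 */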
- */ - if (ct->proto.dccp.last_dir == !dir && - ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && - type == DCCP_PKT_RESPONSE) { - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - new_state = CT_DCCP_RESPOND; - break; - } - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); - return NF_ACCEPT; - case CT_DCCP_INVALID: - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); - return -NF_ACCEPT; - } - - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - ct->proto.dccp.state = new_state; - spin_unlock_bh(&ct->lock); - - if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - - timeouts = nf_ct_timeout_lookup(ct); - if (!timeouts) - timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; - nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - - return NF_ACCEPT; -} - -static bool dccp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.dccp.state) { - case CT_DCCP_CLOSEREQ: - case CT_DCCP_CLOSING: - case CT_DCCP_TIMEWAIT: - return true; - default: - break; - } - - return false; -} - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) -{ - seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct, bool destroy) -{ - struct nlattr *nest_parms; - - spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); - if (!nest_parms) - goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) - goto nla_put_failure; - - if (destroy) - goto skip_state; - - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || - nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq), - CTA_PROTOINFO_DCCP_PAD)) - goto nla_put_failure; -skip_state: - nla_nest_end(skb, nest_parms); - spin_unlock_bh(&ct->lock); - - return 0; - -nla_put_failure: - spin_unlock_bh(&ct->lock); - return -1; -} - -static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { - [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, - [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, -}; - -#define DCCP_NLATTR_SIZE ( \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ - NLA_ALIGN(NLA_HDRLEN + 0)) - -static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) -{ - struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; - struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; - int err; - - if (!attr) - return 0; - - err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy, NULL); - if (err < 0) - return err; - - if (!tb[CTA_PROTOINFO_DCCP_STATE] || - !tb[CTA_PROTOINFO_DCCP_ROLE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { - return -EINVAL; - } - - spin_lock_bh(&ct->lock); - ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); - if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { - 
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - } else { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; - } - if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { - ct->proto.dccp.handshake_seq = - be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); - } - spin_unlock_bh(&ct->lock); - return 0; -} -#endif - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - unsigned int *timeouts = data; - int i; - - if (!timeouts) - timeouts = dn->dccp_timeout; - - /* set default DCCP timeouts. */ - for (i=0; i<CT_DCCP_MAX; i++) - timeouts[i] = dn->dccp_timeout[i]; - - /* there's a 1:1 mapping between attributes and protocol states. */ - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (tb[i]) { - timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; - } - } - - timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; - return 0; -} - -static int -dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeouts = data; - int i; - - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) - goto nla_put_failure; - } - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { - [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - -void nf_conntrack_dccp_init_net(struct net *net) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - - /* default values */ - dn->dccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. 
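Aside: with DCCP_MSL defined as two minutes, the defaults initialized above work out as follows (computed from the removed code, for reference):

/*   CT_DCCP_REQUEST    2 * DCCP_MSL   =  4 min
 *   CT_DCCP_RESPOND    4 * DCCP_MSL   =  8 min  (>= 4MSL, RFC 4340 8.1.3)
 *   CT_DCCP_PARTOPEN   4 * DCCP_MSL   =  8 min  (4MSL, RFC 4340 8.1.5)
 *   CT_DCCP_OPEN       12 * 3600 * HZ = 12 h    (timestamp wraps ~11.9 h)
 *   CT_DCCP_CLOSEREQ   64 * HZ        = 64 s
 *   CT_DCCP_CLOSING    64 * HZ        = 64 s
 *   CT_DCCP_TIMEWAIT   2 * DCCP_MSL   =  4 min  (2MSL, RFC 4340 4.3)
 */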
- */ - dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { - .l4proto = IPPROTO_DCCP, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -}; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 728eeb0aea87..af369e686fc5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -205,6 +205,8 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { + unsigned long status; + if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); @@ -217,11 +219,17 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; } + status = READ_ONCE(ct->status); /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { + if (status & IPS_SEEN_REPLY) { nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); + + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe. 
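Aside: background for the IPS_NAT_CLASH early return above; this reading relies on how early-drop treats assured entries elsewhere in conntrack, not on anything in this hunk:

/* Why not ASSURED: assured entries are skipped by the early-drop
 * eviction that runs when the conntrack table is full. A NAT_CLASH
 * entry duplicates an already-tracked flow (the race that allow_clash,
 * set just below, tolerates), so it should stay evictable and expire
 * on its short timeout instead of competing with the real entry.
 */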
*/ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); @@ -296,6 +304,7 @@ void nf_conntrack_gre_init_net(struct net *net) /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 1020d67600a9..327b8059025d 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = { [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 + [ICMPV6_MLD2_REPORT - 130] = 1, + [ICMPV6_MRDISC_ADV - 130] = 1, + [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 945dd40e7077..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -105,14 +100,14 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW}, +/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -126,7 +121,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown */ {sIV, sCL, 
sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -142,17 +137,19 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ - ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))) && \ - (sch)->length; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - unsigned long *map) + unsigned long *map, + const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; @@ -161,8 +158,6 @@ static int do_basic_checks(struct nf_conn *ct, flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) @@ -177,7 +172,9 @@ static int do_basic_checks(struct nf_conn *ct, sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { - pr_debug("Basic checks failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "%s failed. 
chunk num %d, type %d, len %d flag %d\n", + __func__, count, sch->type, sch->length, flag); return 1; } @@ -185,7 +182,6 @@ static int do_basic_checks(struct nf_conn *ct, set_bit(sch->type, map); } - pr_debug("Basic checks passed\n"); return count == 0; } @@ -195,64 +191,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir, { int i; - pr_debug("Chunk type: %d\n", chunk_type); - switch (chunk_type) { case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: - pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: - pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); + pr_debug("Unknown chunk type %d, Will stay in %s\n", + chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } - pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - return sctp_conntracks[dir][i][cur_state]; } @@ -299,7 +278,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } else { + } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", @@ -369,7 +348,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (sh == NULL) goto out; - if (do_basic_checks(ct, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { @@ -392,7 +371,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "verification tag check failed %x vs %x for dir %d", + sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } @@ -426,6 +407,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); @@ -467,23 +451,26 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " - "conntrack=%u\n", - dir, sch->type, old_state); + nf_ct_l4proto_log_invalid(skb, ct, state, + "Invalid, 
old_state %d, dir %d, type %d", + old_state, dir, sch->type); + goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so @@ -494,6 +481,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3ac1af6f59fc..0c1d086e96cb 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -14,7 +14,7 @@ #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> @@ -457,7 +457,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, - u32 end, u32 win) + u32 end, u32 win, + enum ip_conntrack_dir dir) { /* SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. @@ -471,7 +472,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender, * Both sides must send the Window Scale option * to enable window scaling in either direction. 
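
Note: what this comment encodes is the RFC 7323 rule that window scaling
is only in effect when both directions carried the option in their
SYN/SYN-ACK. A minimal illustrative predicate (tcp_ws_enabled is an
invented name, not part of the patch; it merely condenses the check below):

	static bool tcp_ws_enabled(const struct ip_ct_tcp_state *sender,
				   const struct ip_ct_tcp_state *receiver)
	{
		return (sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE) &&
		       (receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE);
	}

The dir == IP_CT_DIR_REPLY guard added below defers clearing td_scale
until the reply direction has actually been seen.
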
*/ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + if (dir == IP_CT_DIR_REPLY && + !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { sender->td_scale = 0; receiver->td_scale = 0; @@ -542,7 +544,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, if (tcph->syn) { tcp_init_sender(sender, receiver, skb, dataoff, tcph, - end, win); + end, win, dir); if (!tcph->ack) /* Simultaneous open */ return NFCT_TCP_ACCEPT; @@ -585,7 +587,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, */ tcp_init_sender(sender, receiver, skb, dataoff, tcph, - end, win); + end, win, dir); if (dir == IP_CT_DIR_REPLY && !tcph->ack) return NFCT_TCP_ACCEPT; @@ -835,7 +837,8 @@ static bool tcp_error(const struct tcphdr *th, static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *th) + const struct tcphdr *th, + const struct nf_hook_state *state) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); @@ -846,7 +849,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { - pr_debug("nf_ct_tcp: invalid new deleting.\n"); + tcp_error_log(skb, state, "invalid new"); return false; } @@ -911,6 +914,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) return false; } +void nf_conntrack_tcp_set_closing(struct nf_conn *ct) +{ + enum tcp_conntrack old_state; + const unsigned int *timeouts; + u32 timeout; + + if (!nf_ct_is_confirmed(ct)) + return; + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; + + if (old_state == TCP_CONNTRACK_CLOSE || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_unlock_bh(&ct->lock); + return; + } + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + const struct nf_tcp_net *tn; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = tn->timeouts; + } + + timeout = timeouts[TCP_CONNTRACK_CLOSE]; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + + spin_unlock_bh(&ct->lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); +} + static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; @@ -930,7 +968,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; @@ -946,7 +983,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; - if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) + if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); @@ -954,7 +991,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: @@ -1232,13 +1268,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; - pr_debug("tcp_conntracks: "); - nf_ct_dump_tuple(tuple); - pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 
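
The nf_conntrack_tcp_set_closing() helper added above is exported
infrastructure: it moves a confirmed conntrack entry to
TCP_CONNTRACK_CLOSE under ct->lock and rearms ct->timeout from the
per-netns (or nf_ct_timeout_lookup()) CLOSE timeout, skipping entries
with IPS_FIXED_TIMEOUT. A hedged caller sketch -- the surrounding
function name is invented for illustration only:

	/* e.g. called when a subsystem that bypasses conntrack
	 * (such as a flowtable) observes FIN/RST on an offloaded flow
	 */
	static void my_flow_saw_close(struct nf_conn *ct)
	{
		if (nf_ct_protonum(ct) == IPPROTO_TCP)
			nf_conntrack_tcp_set_closing(ct);
	}
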
1 : 0), - old_state, new_state); - ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3b516cffc779..0030fbe8885c 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -88,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { unsigned int *timeouts; + unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -96,26 +97,27 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); - if (!nf_ct_is_confirmed(ct)) + status = READ_ONCE(ct->status); + if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; - stream = true; + stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ - if (unlikely((ct->status & IPS_NAT_CLASH))) + if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 77f5e82d8e3f..ca748f8dbff1 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, start += strlen(name); *val = simple_strtoul(start, &end, 0); if (start == end) - return 0; + return -1; if (matchoff && matchlen) { *matchoff = start - dptr; *matchlen = end - start; @@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; @@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 460294bd4b60..207b240b14e5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -14,6 +14,7 @@ #include <linux/sysctl.h> #endif +#include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> @@ -22,9 +23,6 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> -#ifdef CONFIG_LWTUNNEL -#include <net/netfilter/nf_hooks_lwtunnel.h> -#endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; @@ -70,11 +68,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->dst.u.udp.port)); break; - case IPPROTO_DCCP: - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - 
ntohs(tuple->dst.u.dccp.port)); - break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), @@ -101,69 +94,87 @@ struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; + unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, + struct ct_iter_state *st) { - struct ct_iter_state *st = seq->private; + struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int i; - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} + for (i = st->bucket; i < st->htable_size; i++) { + unsigned int skip = 0; -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; +restart: + hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct hlist_nulls_node *tmp = n; + + if (!net_eq(net, nf_ct_net(ct))) + continue; + + if (++skip <= st->skip_elems) + continue; + + /* h should be returned, skip to nulls marker. */ + while (!is_a_nulls(tmp)) + tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); + + /* check if h is still linked to hash[i] */ + if (get_nulls_value(tmp) != i) { + skip = 0; + goto restart; + } - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return NULL; + st->skip_elems = skip; + st->bucket = i; + return h; } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); + skip = 0; + if (get_nulls_value(n) != i) + goto restart; - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? 
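
A note on the READ_ONCE(ct->status) change in the UDP tracker above:
ct->status can be updated concurrently from other CPUs, so the handler
now takes a single snapshot and derives every decision (IPS_CONFIRMED,
IPS_SEEN_REPLY, IPS_ASSURED, IPS_NAT_CLASH) from that one value.
Condensed sketch of the pattern, with invented helper names:

	static void my_udp_update(struct nf_conn *ct)
	{
		unsigned long status = READ_ONCE(ct->status);	/* one snapshot */

		if ((status & IPS_CONFIRMED) == 0)
			my_first_packet_setup(ct);

		if ((status & IPS_SEEN_REPLY) && (status & IPS_ASSURED) == 0)
			my_consider_assured(ct);	/* no re-read in between */
	}
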
NULL : head; + st->skip_elems = 0; + } + + st->bucket = i; + return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); + + if (*pos == 0) { + st->skip_elems = 0; + st->bucket = 0; + } else if (st->skip_elems) { + /* resume from last dumped entry */ + st->skip_elems--; + } + + return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct ct_iter_state *st = s->private; + struct net *net = seq_file_net(s); + (*pos)++; - return ct_get_next(s, v); + return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) @@ -175,17 +186,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) @@ -265,7 +275,6 @@ static const char* l4proto_name(u16 proto) case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; - case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; @@ -275,7 +284,7 @@ static const char* l4proto_name(u16 proto) return "unknown"; } -static unsigned int +static void seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; @@ -283,14 +292,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) acct = nf_conn_acct_find(ct); if (!acct) - return 0; + return; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); - - return 0; } /* return 0 on success, 1 in case of error */ @@ -310,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v) smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } @@ -342,8 +352,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_has_overflowed(s)) goto release; - if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL); if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); @@ -352,8 +361,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); - if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_REPLY); if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); @@ -531,7 +539,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_count); static unsigned int nf_conntrack_htable_size_user __read_mostly; static int -nf_conntrack_hash_sysctl(struct ctl_table *table, int write, +nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -551,6 +559,29 @@ 
nf_conntrack_hash_sysctl(struct ctl_table *table, int write, return ret; } +static int +nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, i; + + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + if (*(u8 *)table->data == 0) + return 0; + + /* Load nf_log_syslog only if no logger is currently registered */ + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + if (nf_log_is_registered(i)) + return 0; + } + request_module("%s", "nf_log_syslog"); + + return 0; +} + static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { @@ -602,36 +633,23 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, - NF_SYSCTL_CT_PROTO_DCCP_LOOSE, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif -#ifdef CONFIG_LWTUNNEL - NF_SYSCTL_CT_LWTUNNEL, -#endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -660,14 +678,16 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dou8vec_minmax, + .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", @@ -886,58 +906,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { - .procname = "nf_conntrack_dccp_timeout_request", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { - .procname = "nf_conntrack_dccp_timeout_respond", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { - .procname = "nf_conntrack_dccp_timeout_partopen", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { - .procname = "nf_conntrack_dccp_timeout_open", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { - .procname = "nf_conntrack_dccp_timeout_closereq", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - 
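
The nf_conntrack_log_invalid_sysctl() handler above follows the usual
"parse first, act afterwards" shape for sysctl writes: let the generic
proc_* helper validate and commit the value, then trigger the side
effect (here: loading nf_log_syslog if no logger is registered yet).
Generic sketch of that shape, with invented names:

	static int my_u8_sysctl(const struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
	{
		int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);

		if (ret < 0 || !write)
			return ret;		/* read-only access or parse error */

		if (*(u8 *)table->data != 0)
			my_side_effect();	/* value is already committed */

		return 0;
	}
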
[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { - .procname = "nf_conntrack_dccp_timeout_closing", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { - .procname = "nf_conntrack_dccp_timeout_timewait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { - .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", @@ -952,16 +920,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_LWTUNNEL - [NF_SYSCTL_CT_LWTUNNEL] = { - .procname = "nf_hooks_lwtunnel", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = nf_hooks_lwtunnel_sysctl_handler, - }, -#endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -970,9 +928,10 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, @@ -1032,29 +991,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, #endif } -static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, - struct ctl_table *table) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net *dn = nf_dccp_pernet(net); - -#define XASSIGN(XNAME, dn) \ - table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ - &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] - - XASSIGN(REQUEST, dn); - XASSIGN(RESPOND, dn); - XASSIGN(PARTOPEN, dn); - XASSIGN(OPEN, dn); - XASSIGN(CLOSEREQ, dn); - XASSIGN(CLOSING, dn); - XASSIGN(TIMEWAIT, dn); -#undef XASSIGN - - table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; -#endif -} - static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { @@ -1100,7 +1036,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); - nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ @@ -1110,7 +1045,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } - cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); + cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter", + table, + ARRAY_SIZE(nf_ct_sysctl_table)); if (!cnet->sysctl_header) goto out_unregister_netfilter; @@ -1124,7 +1061,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); @@ -1222,11 +1159,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - 
nf_conntrack_init_end(); return 0; out_pernet: diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return &current->net_xmit.nf_dup_skb_recursion; +} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c new file mode 100644 index 000000000000..4a5f5195f2d2 --- /dev/null +++ b/net/netfilter/nf_flow_table_bpf.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable Flow Table Helpers for XDP hook + * + * These are called from the XDP programs. + * Note that it is allowed to break compatibility for these functions since + * the interface they are exposed through to BPF programs is explicitly + * unstable. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <net/netfilter/nf_flow_table.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/xdp.h> + +/* bpf_flowtable_opts - options for bpf flowtable helpers + * @error: out parameter, set for any encountered error + */ +struct bpf_flowtable_opts { + s32 error; +}; + +enum { + NF_BPF_FLOWTABLE_OPTS_SZ = 4, +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in nf_flow_table BTF"); + +__bpf_kfunc_start_defs(); + +static struct flow_offload_tuple_rhash * +bpf_xdp_flow_tuple_lookup(struct net_device *dev, + struct flow_offload_tuple *tuple, __be16 proto) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *nf_flow_table; + struct flow_offload *nf_flow; + + nf_flow_table = nf_flowtable_by_dev(dev); + if (!nf_flow_table) + return ERR_PTR(-ENOENT); + + tuplehash = flow_offload_lookup(nf_flow_table, tuple); + if (!tuplehash) + return ERR_PTR(-ENOENT); + + nf_flow = container_of(tuplehash, struct flow_offload, + tuplehash[tuplehash->tuple.dir]); + flow_offload_refresh(nf_flow_table, nf_flow, false); + + return tuplehash; +} + +__bpf_kfunc struct flow_offload_tuple_rhash * +bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple, + struct bpf_flowtable_opts *opts, u32 opts_len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + struct flow_offload_tuple tuple = { + .iifidx = fib_tuple->ifindex, + .l3proto = fib_tuple->family, + .l4proto = fib_tuple->l4_protocol, + .src_port = fib_tuple->sport, + .dst_port = fib_tuple->dport, + }; + struct flow_offload_tuple_rhash *tuplehash; + __be16 proto; + + if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) { + opts->error = 
-EINVAL; + return NULL; + } + + switch (fib_tuple->family) { + case AF_INET: + tuple.src_v4.s_addr = fib_tuple->ipv4_src; + tuple.dst_v4.s_addr = fib_tuple->ipv4_dst; + proto = htons(ETH_P_IP); + break; + case AF_INET6: + tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src; + tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst; + proto = htons(ETH_P_IPV6); + break; + default: + opts->error = -EAFNOSUPPORT; + return NULL; + } + + tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto); + if (IS_ERR(tuplehash)) { + opts->error = PTR_ERR(tuplehash); + return NULL; + } + + return tuplehash; +} + +__diag_pop() + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(nf_ft_kfunc_set) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_KFUNCS_END(nf_ft_kfunc_set) + +static const struct btf_kfunc_id_set nf_flow_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_ft_kfunc_set, +}; + +int nf_flow_register_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_flow_kfunc_set); +} +EXPORT_SYMBOL_GPL(nf_flow_register_bpf); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 81c26a96c30b..06e8251a6644 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -77,22 +77,28 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { - const struct rt6_info *rt; - - if (flow_tuple->l3proto == NFPROTO_IPV6) { - rt = (const struct rt6_info *)flow_tuple->dst_cache; - return rt6_get_cookie(rt); - } + if (flow_tuple->l3proto == NFPROTO_IPV6) + return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } +static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct dst_entry *dst = route->tuple[dir].dst; + + route->tuple[dir].dst = NULL; + + return dst; +} + static int flow_offload_fill_route(struct flow_offload *flow, - const struct nf_flow_route *route, + struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *dst = route->tuple[dir].dst; + struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { @@ -112,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->in_vlan_ingress |= BIT(j); j++; } + + flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: @@ -121,13 +130,11 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; + dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; - + flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; @@ -148,63 +155,95 @@ static void nft_flow_dst_release(struct flow_offload *flow, dst_release(flow->tuplehash[dir].tuple.dst_cache); } -int flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route) +void flow_offload_route_init(struct flow_offload *flow, + struct nf_flow_route *route) 
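
For context, a hedged sketch of how an XDP program can consume the
bpf_xdp_flow_lookup() kfunc registered above, modelled on the selftest
style: vmlinux.h and libbpf are assumed, the ___local struct is a
program-local shadow of the kernel type, and header parsing is elided.

	// SPDX-License-Identifier: GPL-2.0-only
	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>

	struct bpf_flowtable_opts___local {
		s32 error;
	};

	extern struct flow_offload_tuple_rhash *
	bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple,
			    struct bpf_flowtable_opts___local *opts,
			    u32 opts_len) __ksym;

	SEC("xdp")
	int xdp_flow_probe(struct xdp_md *ctx)
	{
		struct bpf_flowtable_opts___local opts = {};
		struct bpf_fib_lookup tuple = {
			.family		= 2,	/* AF_INET; not exposed via vmlinux.h */
			.ifindex	= ctx->ingress_ifindex,
			/* l4_protocol, sport/dport and the addresses would be
			 * filled in from the parsed IPv4 and TCP/UDP headers
			 */
		};
		struct flow_offload_tuple_rhash *tuplehash;

		tuplehash = bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts));
		if (!tuplehash)		/* kfunc is KF_RET_NULL: check is mandatory */
			return XDP_PASS;

		/* flow exists and has just been refreshed; a real program
		 * could now account it or forward the frame itself
		 */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";
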
{ - int err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); - if (err < 0) - return err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); - if (err < 0) - goto err_route_reply; - + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; - - return 0; - -err_route_reply: - nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); - - return err; } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +static inline bool nf_flow_has_expired(const struct flow_offload *flow) { + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) +{ + struct ip_ct_tcp *tcp = &ct->proto.tcp; + + spin_lock_bh(&ct->lock); + if (tcp->state != tcp_state) + tcp->state = tcp_state; + + /* syn packet triggers the TCP reopen case from conntrack. */ + if (tcp->state == TCP_CONNTRACK_CLOSE) + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + /* Conntrack state is outdated due to offload bypass. + * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks + * TCP reset validation will fail. + */ tcp->seen[0].td_maxwin = 0; + tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; + tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; + spin_unlock_bh(&ct->lock); } -static void flow_offload_fixup_ct(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct flow_offload *flow) { + struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); + bool expired, closing = false; + u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { - struct nf_tcp_net *tn = nf_tcp_pernet(net); - - flow_offload_fixup_tcp(&ct->proto.tcp); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + u8 tcp_state; + + /* Enter CLOSE state if fin/rst packet has been seen, this + * allows TCP reopen from conntrack. Otherwise, pick up from + * the last seen TCP state. + */ + closing = test_bit(NF_FLOW_CLOSING, &flow->flags); + if (closing) { + flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); + timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); + expired = false; + } else { + tcp_state = READ_ONCE(ct->proto.tcp.state); + flow_offload_fixup_tcp(ct, tcp_state); + timeout = READ_ONCE(tn->timeouts[tcp_state]); + expired = nf_flow_has_expired(flow); + } + offload_timeout = READ_ONCE(tn->offload_timeout); - timeout = tn->timeouts[ct->proto.tcp.state]; - timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { - struct nf_udp_net *tn = nf_udp_pernet(net); - - timeout = tn->timeouts[UDP_CT_REPLIED]; - timeout -= tn->offload_timeout; + const struct nf_udp_net *tn = nf_udp_pernet(net); + enum udp_conntrack state = + test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED; + + timeout = READ_ONCE(tn->timeouts[state]); + expired = nf_flow_has_expired(flow); + offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } + if (expired) + timeout -= offload_timeout; + if (timeout < 0) timeout = 0; - if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) - WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); + if (closing || + nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) @@ -302,7 +341,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - nf_ct_offload_timeout(flow->ct); + nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); @@ -314,28 +353,24 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, - struct flow_offload *flow) + struct flow_offload *flow, bool force) { u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); - if (timeout - READ_ONCE(flow->timeout) > HZ) + if (force || timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); else return; - if (likely(!nf_flowtable_hw_offload(flow_table))) + if (likely(!nf_flowtable_hw_offload(flow_table)) || + test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return nf_flow_timeout_delta(flow->timeout) <= 0; -} - static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -351,8 +386,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - set_bit(NF_FLOW_TEARDOWN, &flow->flags); - flow_offload_fixup_ct(flow->ct); + if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -416,14 +451,124 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } +static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, + const struct flow_offload *flow) +{ + return flow_table->type->gc && flow_table->type->gc(flow); +} + +/** + * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry + * @ct: Flowtable offloaded tcp ct + * + * Return: number of seconds when ct entry should expire. + */ +static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) +{ + u8 state = READ_ONCE(ct->proto.tcp.state); + + switch (state) { + case TCP_CONNTRACK_SYN_SENT: + case TCP_CONNTRACK_SYN_RECV: + return 0; + case TCP_CONNTRACK_ESTABLISHED: + return NF_CT_DAY; + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_CLOSE_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + return 5 * 60 * HZ; + case TCP_CONNTRACK_CLOSE: + return 0; + } + + return 0; +} + +/** + * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry + * @ct: Flowtable offloaded ct + * + * Datapath lookups in the conntrack table will evict nf_conn entries + * if they have expired. + * + * Once nf_conn entries have been offloaded, nf_conntrack might not see any + * packets anymore. Thus ct->timeout is no longer refreshed and ct can + * be evicted. 
+ * + * To avoid the need for an additional check on the offload bit for every + * packet processed via nf_conntrack_in(), set an arbitrary timeout large + * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT + * from the packet path via nf_ct_is_expired(). + */ +static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) +{ + static const u32 min_timeout = 5 * 60 * HZ; + u32 expires = nf_ct_expires(ct); + + /* normal case: large enough timeout, nothing to do. */ + if (likely(expires >= min_timeout)) + return; + + /* must check offload bit after this, we do not hold any locks. + * flowtable and ct entries could have been removed on another CPU. + */ + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return; + + /* load ct->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + + if (nf_ct_is_confirmed(ct) && + test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + u8 l4proto = nf_ct_protonum(ct); + u32 new_timeout = true; + + switch (l4proto) { + case IPPROTO_UDP: + new_timeout = NF_CT_DAY; + break; + case IPPROTO_TCP: + new_timeout = nf_flow_table_tcp_timeout(ct); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* Update to ct->timeout from nf_conntrack happens + * without holding ct->lock. + * + * Use cmpxchg to ensure timeout extension doesn't + * happen when we race with conntrack datapath. + * + * The inverse -- datapath updating ->timeout right + * after this -- is fine, datapath is authoritative. + */ + if (new_timeout) { + new_timeout += nfct_time_stamp; + cmpxchg(&ct->timeout, expires, new_timeout); + } + } + + nf_ct_put(ct); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { + bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); + if (nf_flow_has_expired(flow) || - nf_ct_is_dying(flow->ct)) + nf_ct_is_dying(flow->ct) || + nf_flow_custom_gc(flow_table, flow)) { flow_offload_teardown(flow); + teardown = true; + } else if (!teardown) { + nf_flow_table_extend_ct_timeout(flow->ct); + } - if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); @@ -432,6 +577,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && + test_bit(NF_FLOW_HW, &flow->flags) && + !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { + nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } @@ -671,8 +820,14 @@ static int __init nf_flow_table_module_init(void) if (ret) goto out_offload; + ret = nf_flow_register_bpf(); + if (ret) + goto out_bpf; + return 0; +out_bpf: + nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 0ccabf3fa6aa..b0f199171932 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -17,11 +17,15 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return NF_ACCEPT; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + 
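
The expiry tests in the garbage-collection step above (and in
nf_flow_has_expired()) rely on wrap-safe jiffies arithmetic. Assuming
the standard nf_flow_timeout_delta() definition from nf_flow_table.h,
the idea reduces to:

	/* result > 0: entry still valid; <= 0: expired.
	 * The u32 subtraction stays well-defined across jiffies wraparound.
	 */
	static inline s32 my_timeout_delta(u32 timeout)
	{
		return (s32)(timeout - (u32)jiffies);
	}

so setting ct->timeout to nfct_time_stamp + NF_CT_DAY, as
nf_flow_table_extend_ct_timeout() does for offloaded UDP entries, keeps
the conntrack alive for a day without any periodic refresh.
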
return NF_ACCEPT; break; default: proto = skb->protocol; @@ -39,7 +43,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, } static int nf_flow_rule_route_inet(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 19efba1e51ef..78883343e5d6 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,6 +8,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -27,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; tcph = (void *)(skb_network_header(skb) + thoff); - if (unlikely(tcph->fin || tcph->rst)) { + if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } + if ((tcph->fin || tcph->rst) && + !test_bit(NF_FLOW_CLOSING, &flow->flags)) + set_bit(NF_FLOW_CLOSING, &flow->flags); + return 0; } @@ -140,8 +145,11 @@ static bool ip_has_options(unsigned int thoff) static void nf_flow_tuple_encap(struct sk_buff *skb, struct flow_offload_tuple *tuple) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct iphdr *iph; + u16 offset = 0; int i = 0; if (skb_vlan_tag_present(skb)) { @@ -154,47 +162,65 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; break; } + + if (inner_proto == htons(ETH_P_IP)) { + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (iph->protocol == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + tuple->tun.l3_proto = IPPROTO_IPIP; + } + } } -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; +}; + +static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; u8 ipproto; - if (!pskb_may_pull(skb, sizeof(*iph) + offset)) + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return -1; - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; - thoff += offset; + thoff += ctx->offset; ipproto = iph->protocol; switch (ipproto) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -204,7 +230,7 @@ static 
int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (iph->ttl <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (ipproto) { @@ -224,13 +250,13 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, } } - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr; tuple->dst_v4.s_addr = iph->daddr; tuple->l3proto = AF_INET; tuple->l4proto = ipproto; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; @@ -267,28 +293,72 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) +{ + struct iphdr *iph; + u16 size; + + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + return false; + + iph = (struct iphdr *)(skb_network_header(skb) + *psize); + size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) + *psize += size; + + return true; +} + +static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + + if (iph->protocol != IPPROTO_IPIP) + return; + + skb_pull(skb, iph->ihl << 2); + skb_reset_network_header(skb); +} + +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; + bool ret = false; switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return false; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; - return true; + inner_proto = proto; + ret = true; } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; - return true; + ret = true; } break; } - return false; + if (inner_proto == htons(ETH_P_IP)) + ret = nf_flow_ip4_tunnel_proto(skb, offset); + + return ret; } static void nf_flow_encap_pop(struct sk_buff *skb, @@ -310,84 +380,86 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; } } + + if (skb->protocol == htons(ETH_P_IP)) + nf_flow_ip4_tunnel_pop(skb); } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - const struct flow_offload_tuple_rhash *tuplehash, - unsigned short type) + struct nf_flow_xmit *xmit) { - struct net_device *outdev; - - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); - if (!outdev) - return NF_DROP; - - skb->dev = outdev; - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, - tuplehash->tuple.out.h_source, skb->len); + skb->dev = xmit->outdev; + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; } -unsigned int 
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static struct flow_offload_tuple_rhash * +nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; struct flow_offload_tuple tuple = {}; + + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) +{ enum flow_offload_tuple_dir dir; struct flow_offload *flow; - struct net_device *outdev; - u32 hdrsize, offset = 0; unsigned int thoff, mtu; - struct rtable *rt; struct iphdr *iph; - __be32 nexthop; - int ret; - - if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset)) - return NF_ACCEPT; - - if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - iph = (struct iphdr *)(skb_network_header(skb) + offset); - thoff = (iph->ihl * 4) + offset; + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + thoff = (iph->ihl * 4) + ctx->offset; if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; - flow_offload_refresh(flow_table, flow); + flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); - thoff -= offset; + thoff -= ctx->offset; iph = ip_hdr(skb); nf_flow_nat_ip(flow, skb, thoff, dir, iph); @@ -398,36 +470,204 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + __be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable 
*rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct nf_flow_xmit xmit = {}; + struct flow_offload *flow; + struct neighbour *neigh; + struct rtable *rt; + __be32 ip_daddr; + int ret; + + tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); + if (!tuplehash) + return NF_ACCEPT; + + ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; + + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + rt = dst_rtable(tuplehash->tuple.dst_cache); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + 
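
Note on the transmit path here: instead of keeping a cached net_device
pointer, the hook now re-resolves the egress device by ifindex on every
packet. dev_get_by_index_rcu() takes no device reference, which is only
valid under rcu_read_lock() -- guaranteed in this spot because netfilter
hooks run inside an RCU read-side section. A standalone caller would
need the lock explicitly (function name invented for illustration):

	static bool my_egress_dev_present(struct net *net, int ifidx)
	{
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(net, ifidx);
		rcu_read_unlock();

		/* only the NULL-ness may be used past the unlock */
		return dev != NULL;
	}
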
flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -535,32 +775,31 @@ static void nf_flow_nat_ipv6(const struct flow_offload *flow, } } -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; u8 nexthdr; - thoff = sizeof(*ip6h) + offset; + thoff = sizeof(*ip6h) + ctx->offset; if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); nexthdr = ip6h->nexthdr; switch (nexthdr) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -570,7 +809,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (ip6h->hop_limit <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (nexthdr) { @@ -590,67 +829,49 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, } } - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v6 = ip6h->saddr; tuple->dst_v6 = ip6h->daddr; tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; } -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; enum flow_offload_tuple_dir dir; - const struct in6_addr *nexthop; struct flow_offload *flow; - struct net_device *outdev; unsigned int thoff, mtu; - u32 hdrsize, offset = 0; struct ipv6hdr *ip6h; - struct rt6_info *rt; - int ret; - - if (skb->protocol != htons(ETH_P_IPV6) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0) - return 
NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); - thoff = sizeof(*ip6h) + offset; + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); + thoff = sizeof(*ip6h) + ctx->offset; if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; - flow_offload_refresh(flow_table, flow); + flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); @@ -663,35 +884,100 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +static struct flow_offload_tuple_rhash * +nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct sk_buff *skb) +{ + struct flow_offload_tuple tuple = {}; + + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; + struct flow_offload *flow; + struct neighbour *neigh; + struct rt6_info *rt; + int ret; + + tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb); + if (tuplehash == NULL) + return NF_ACCEPT; + + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + rt = dst_rt6_info(tuplehash->tuple.dst_cache); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, 
ip6_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 4d9b99abe37d..d8f7bfd60ac6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -34,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; - unsigned int enc_keys; + unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; @@ -43,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; - enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -55,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, @@ -70,7 +70,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -163,14 +163,14 @@ static int nf_flow_rule_match(struct nf_flow_match *match, return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(key->control.addr_type); + match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: case IPPROTO_GRE: @@ -182,9 +182,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | + 
BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); switch (tuple->l4proto) { case IPPROTO_TCP: @@ -194,7 +194,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); break; } @@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.hw_ifidx; + ifindex = this_tuple->out.ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; @@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, return 0; } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net, { const struct nf_flowtable *flowtable = offload->flowtable; const struct flow_offload_tuple *tuple, *other_tuple; - const struct flow_offload *flow = offload->flow; + struct flow_offload *flow = offload->flow; struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; int err = -ENOMEM; @@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; @@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); - ok_count += flow_offload_tuple_add(offload, flow_rule[1], - FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; @@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } @@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); - flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, + &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, @@ -1188,7 +1192,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable 
*flowtable, int err; if (!nf_flowtable_hw_offload(flowtable)) - return 0; + return nf_flow_offload_xdp_setup(flowtable, dev, cmd); if (dev->netdev_ops->ndo_setup_tc) err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..f0984cf69a09 --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> +#include <net/inet_dscp.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_flow_table.h> + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + struct flow_offload_tunnel tun; + u8 num_tuns; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + case DEV_PATH_TUN: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + 
if (path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; + } + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { + info->indev = NULL; + break; + } + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + info->outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < 
info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + route->tuple[dir].out.ifindex = info.outdev->ifindex; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, !dir, ft); + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c new file mode 100644 index 000000000000..e1252d042699 --- /dev/null +++ b/net/netfilter/nf_flow_table_xdp.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> + +struct flow_offload_xdp_ft { + struct list_head head; + struct nf_flowtable *ft; + struct rcu_head rcuhead; +}; + +struct flow_offload_xdp { + struct hlist_node hnode; + unsigned long net_device_addr; + struct list_head head; +}; + +#define NF_XDP_HT_BITS 4 +static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS); +static DEFINE_MUTEX(nf_xdp_hashtable_lock); + +/* caller must hold rcu read lock */ +struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev) +{ + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp *iter; + + 
hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + struct flow_offload_xdp_ft *ft_elem; + + /* The user is supposed to insert a given net_device + * just into a single nf_flowtable so we always return + * the first element here. + */ + ft_elem = list_first_or_null_rcu(&iter->head, + struct flow_offload_xdp_ft, + head); + return ft_elem ? ft_elem->ft : NULL; + } + } + + return NULL; +} + +static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp_ft *ft_elem; + + ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT); + if (!ft_elem) + return -ENOMEM; + + ft_elem->ft = ft; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (!elem) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT); + if (!elem) + goto err_unlock; + + elem->net_device_addr = key; + INIT_LIST_HEAD(&elem->head); + hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key); + } + list_add_tail_rcu(&ft_elem->head, &elem->head); + + mutex_unlock(&nf_xdp_hashtable_lock); + + return 0; + +err_unlock: + mutex_unlock(&nf_xdp_hashtable_lock); + kfree(ft_elem); + + return -ENOMEM; +} + +static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (elem) { + struct flow_offload_xdp_ft *ft_elem, *ft_next; + + list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) { + if (ft_elem->ft == ft) { + list_del_rcu(&ft_elem->head); + kfree_rcu(ft_elem, rcuhead); + } + } + + if (list_empty(&elem->head)) + hash_del_rcu(&elem->hnode); + else + elem = NULL; + } + + mutex_unlock(&nf_xdp_hashtable_lock); + + if (elem) { + synchronize_rcu(); + kfree(elem); + } +} + +int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + switch (cmd) { + case FLOW_BLOCK_BIND: + return nf_flowtable_by_dev_insert(flowtable, dev); + case FLOW_BLOCK_UNBIND: + nf_flowtable_by_dev_remove(flowtable, dev); + return 0; + } + + WARN_ON_ONCE(1); + return 0; +} diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 00e89ffd78f6..2d890dd04ff8 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -3,6 +3,9 @@ #include <linux/sysctl.h> #include <net/lwtunnel.h> #include <net/netfilter/nf_hooks_lwtunnel.h> +#include <linux/netfilter.h> + +#include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { @@ -25,7 +28,7 @@ static inline int nf_hooks_lwtunnel_set(int enable) } #ifdef CONFIG_SYSCTL -int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, +int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int proc_nf_hooks_lwtunnel_enabled = 0; @@ -50,4 +53,71 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen 
= sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} +#else +int __init netfilter_lwtunnel_init(void) { return 0; } +void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179f0..25403023060b 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8a29290149bd..74cef8bf554c 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -31,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) int i; for (i = 0; i < NF_LOG_TYPE_MAX; i++) { - if (loggers[pf][i] == NULL) + log = nft_log_dereference(loggers[pf][i]); + if (!log) continue; - log = nft_log_dereference(loggers[pf][i]); if (!strncasecmp(str_logger, log->name, strlen(log->name))) return log; } @@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); +/** + * nf_log_is_registered - Check if any logger is registered for a given + * protocol family. + * + * @pf: Protocol family + * + * Returns: true if at least one logger is active for @pf, false otherwise. 
+ */ +bool nf_log_is_registered(u_int8_t pf) +{ + int i; + + if (pf >= NFPROTO_NUMPROTO) { + WARN_ON_ONCE(1); + return false; + } + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (rcu_access_pointer(loggers[pf][i])) + return true; + } + + return false; +} +EXPORT_SYMBOL(nf_log_is_registered); + int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger) { @@ -156,6 +182,11 @@ int nf_logger_find_get(int pf, enum nf_log_type type) struct nf_logger *logger; int ret = -ENOENT; + if (pf >= ARRAY_SIZE(loggers)) + return -EINVAL; + if (type >= NF_LOG_TYPE_MAX) + return -EINVAL; + if (pf == NFPROTO_INET) { ret = nf_logger_find_get(NFPROTO_IPV4, type); if (ret < 0) @@ -193,11 +224,12 @@ void nf_logger_put(int pf, enum nf_log_type type) return; } - BUG_ON(loggers[pf][type] == NULL); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); - module_put(logger->me); + if (!logger) + WARN_ON_ONCE(1); + else + module_put(logger->me); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_logger_put); @@ -389,7 +421,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -400,10 +432,9 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; -static int nf_log_proc_dostring(struct ctl_table *table, int write, +static int nf_log_proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -487,9 +518,10 @@ static int netfilter_log_sysctl_init(struct net *net) for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) table[i].extra2 = net; - net->nf.nf_log_dir_header = register_net_sysctl(net, - "net/netfilter/nf_log", - table); + net->nf.nf_log_dir_header = register_net_sysctl_sz(net, + "net/netfilter/nf_log", + table, + ARRAY_SIZE(nf_log_sysctl_table)); if (!net->nf.nf_log_dir_header) goto err_reg; @@ -507,7 +539,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index cb894f0d63e9..86d5fc5d28e3 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) + const struct nf_loginfo *loginfo, const char *prefix, + struct net *net) { const struct net_device *physoutdev __maybe_unused; const struct net_device *physindev __maybe_unused; @@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, in ? in->name : "", out ? 
out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); + physindev = nf_bridge_get_physindev(skb, net); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); physoutdev = nf_bridge_get_physoutdev(skb); @@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); @@ -215,7 +216,9 @@ nf_log_dump_tcp_header(struct nf_log_buf *m, /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->ae) + nf_log_buf_add(m, "AE "); if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) @@ -322,7 +325,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ @@ -515,7 +518,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ @@ -525,7 +528,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ + /* maxlen = 230+ 91 + 230 + 255 = 806 */ } static noinline_for_stack void @@ -845,7 +848,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, - out, loginfo, prefix); + out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -880,7 +883,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, - loginfo, prefix); + loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -916,7 +919,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_mac_header(m, loginfo, skb); diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 0fa5a0bbb0ff..481be15609b1 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -12,9 +12,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_nat.h> -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_nat BTF"); +__bpf_kfunc_start_defs(); /* bpf_ct_set_nat_info - Set source or destination nat address * @@ -30,9 +28,9 @@ __diag_ignore_all("-Wmissing-prototypes", * interpreted as select a random port. 
* @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST */ -int bpf_ct_set_nat_info(struct nf_conn___init *nfct, - union nf_inet_addr *addr, int port, - enum nf_nat_manip_type manip) +__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) { struct nf_conn *ct = (struct nf_conn *)nfct; u16 proto = nf_ct_l3num(ct); @@ -54,11 +52,11 @@ int bpf_ct_set_nat_info(struct nf_conn___init *nfct, return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } -__diag_pop() +__bpf_kfunc_end_defs(); -BTF_SET8_START(nf_nat_kfunc_set) +BTF_KFUNCS_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_nat_kfunc_set) +BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index e29e4ccb5c5a..78a61dac4ade 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -27,6 +27,9 @@ #include "nf_internals.h" +#define NF_NAT_MAX_ATTEMPTS 128 +#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) + static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); @@ -66,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } @@ -78,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } @@ -99,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } @@ -111,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } @@ -180,7 +179,35 @@ hash_by_src(const struct net *net, return reciprocal_scale(hash, nf_nat_htable_size); } -/* Is this tuple already taken? (not by us) */ +/** + * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry + * @tuple: proposed NAT binding + * @ignored_conntrack: our (unconfirmed) conntrack entry + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple in either direction. + * + * Example: + * INITIATOR -> NAT/PAT -> RESPONDER + * + * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). + * Then, later, NAT/PAT itself also connects to RESPONDER. + * + * This will not work if the SNAT done earlier has same IP:PORT source pair. + * + * Conntrack table has: + * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * and new locally originating connection wants: + * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * ... 
which would mean incoming packets cannot be distinguished between + * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). + * + * @return: true if the proposed NAT mapping collides with an existing entry. + */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) @@ -197,6 +224,182 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_allow_clash(const struct nf_conn *ct) +{ + return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; +} + +/** + * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry + * @tuple: proposed NAT binding + * @ignored_ct: our (unconfirmed) conntrack entry + * + * Same as nf_nat_used_tuple, but also check for rare clash in reverse + * direction. Should be called only when @tuple has not been altered, i.e. + * @ignored_conntrack will not be subject to NAT. + * + * @return: true if the proposed NAT mapping collides with existing entry. + */ +static noinline bool +nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_ct) +{ + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; + const struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + if (!nf_nat_used_tuple(tuple, ignored_ct)) + return false; + + if (!nf_nat_allow_clash(ignored_ct)) + return true; + + /* Initial choice clashes with existing conntrack. + * Check for (rare) reverse collision. + * + * This can happen when new packets are received in both directions + * at the exact same time on different CPUs. + * + * Without SMP, first packet creates new conntrack entry and second + * packet is resolved as established reply packet. + * + * With parallel processing, both packets could be picked up as + * new and both get their own ct entry allocated. + * + * If ignored_conntrack and colliding ct are not subject to NAT then + * pretend the tuple is available and let later clash resolution + * handle this at insertion time. + * + * Without it, the 'reply' packet has its source port rewritten + * by nat engine. + */ + if (READ_ONCE(ignored_ct->status) & uses_nat) + return true; + + net = nf_ct_net(ignored_ct); + zone = nf_ct_zone(ignored_ct); + + thash = nf_conntrack_find_get(net, zone, tuple); + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } + + ct = nf_ct_tuplehash_to_ctrack(thash); + + /* NB: IP_CT_DIR_ORIGINAL should be impossible because + * nf_nat_used_tuple() handles origin collisions. + * + * Handle remote chance other CPU confirmed its ct right after. + */ + if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) + goto out; + + /* clashing connection subject to NAT? Retry with new tuple. 
*/
+	if (READ_ONCE(ct->status) & uses_nat)
+		goto out;
+
+	if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+			      &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+	    nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+			      &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+		taken = false;
+		goto out;
+	}
+out:
+	nf_ct_put(ct);
+	return taken;
+}
+
+static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
+{
+	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
+						  IPS_DYING;
+	static const unsigned long flags_needed = IPS_SRC_NAT;
+	enum tcp_conntrack old_state;
+
+	old_state = READ_ONCE(ct->proto.tcp.state);
+	if (old_state < TCP_CONNTRACK_TIME_WAIT)
+		return false;
+
+	if (flags & flags_refuse)
+		return false;
+
+	return (flags & flags_needed) == flags_needed;
+}
+
+/* reverse direction will send packets to new source, so
+ * make sure such packets are invalid.
+ */
+static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
+{
+	return (__s32)(new->proto.tcp.seen[0].td_end -
+		       old->proto.tcp.seen[0].td_end) > 0;
+}
+
+static int
+nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conn *ignored_conntrack,
+			 unsigned int attempts_left)
+{
+	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
+	struct nf_conntrack_tuple_hash *thash;
+	const struct nf_conntrack_zone *zone;
+	struct nf_conntrack_tuple reply;
+	unsigned long flags;
+	struct nf_conn *ct;
+	bool taken = true;
+	struct net *net;
+
+	nf_ct_invert_tuple(&reply, tuple);
+
+	if (attempts_left > NF_NAT_HARDER_THRESH ||
+	    tuple->dst.protonum != IPPROTO_TCP ||
+	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
+		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+
+	/* Last few attempts to find a free tcp port. Destructive
+	 * action: evict the colliding entry if it's in timewait state
+	 * and the tcp sequence number has advanced past the one used
+	 * by the old entry.
+	 */
+	net = nf_ct_net(ignored_conntrack);
+	zone = nf_ct_zone(ignored_conntrack);
+
+	thash = nf_conntrack_find_get(net, zone, &reply);
+	if (!thash)
+		return false;
+
+	ct = nf_ct_tuplehash_to_ctrack(thash);
+
+	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
+		goto out;
+
+	if (WARN_ON_ONCE(ct == ignored_conntrack))
+		goto out;
+
+	flags = READ_ONCE(ct->status);
+	if (!nf_nat_may_kill(ct, flags))
+		goto out;
+
+	if (!nf_seq_has_advanced(ct, ignored_conntrack))
+		goto out;
+
+	/* Even if we can evict do not reuse if entry is offloaded. */
+	if (nf_ct_kill(ct))
+		taken = flags & flags_offload;
+out:
+	nf_ct_put(ct);
+	return taken;
+}
+
 static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
 {
@@ -225,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
 	case IPPROTO_UDPLITE:
-	case IPPROTO_DCCP:
 	case IPPROTO_SCTP:
 		if (maniptype == NF_NAT_MANIP_SRC)
 			port = tuple->src.u.all;
@@ -242,7 +444,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 /* If we source map this tuple so reply looks like reply_tuple, will
  * that meet the constraints of range.
*/ -static int in_range(const struct nf_conntrack_tuple *tuple, +static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the @@ -291,7 +493,7 @@ find_appropriate_src(struct net *net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; - if (in_range(result, range)) + if (nf_in_range(result, range)) return 1; } } @@ -385,7 +587,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; - static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { case IPPROTO_ICMP: @@ -426,7 +627,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: - case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else @@ -467,12 +667,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); - else + else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || + maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); + else + off = 0; attempts = range_size; - if (attempts > max_attempts) - attempts = max_attempts; + if (attempts > NF_NAT_MAX_ATTEMPTS) + attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. @@ -483,7 +686,7 @@ find_free_id: another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); - if (!nf_nat_used_tuple(tuple, ct)) + if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } @@ -523,8 +726,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ - if (in_range(orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { + if (nf_in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } @@ -549,8 +752,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; @@ -887,10 +1090,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], range->flags |= NF_NAT_RANGE_MAP_IPS; } - if (tb[CTA_NAT_V4_MAXIP]) - range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); - else - range->max_addr.ip = range->min_addr.ip; + range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], + range->min_addr.ip); return 0; } @@ -1017,7 +1218,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); - nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); + nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; @@ -1121,7 +1322,6 @@ static const struct nf_nat_hook nat_hook = { #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif - .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; 
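[The find_free_id hunk above picks a starting offset (random for source NAT or NF_NAT_RANGE_PROTO_RANDOM_ALL, 0 otherwise) and then walks the candidate range with a wrapping index, capping the scan at NF_NAT_MAX_ATTEMPTS so a softirq search over a fully used range cannot soft-lock. Below is a minimal userspace sketch of that wraparound scan; pick_free_port() and is_used() are illustrative stand-ins (is_used() plays the role of the nf_nat_used_tuple_harder() collision check), not kernel API, and the kernel's htons() conversion and re-randomized retry round are omitted.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NF_NAT_MAX_ATTEMPTS 128

/* Stand-in for the kernel's tuple-collision check: pretend four ports
 * are already taken.
 */
static bool is_used(uint16_t port)
{
	return port >= 32768 && port < 32772;
}

/* Scan [min, min + range_size) starting at 'off', wrapping modulo
 * range_size, and give up after at most NF_NAT_MAX_ATTEMPTS candidates,
 * mirroring the capped loop in find_free_id. Returns 0 if no free port
 * was found within the attempt budget.
 */
static uint16_t pick_free_port(uint16_t min, uint32_t range_size, uint32_t off)
{
	uint32_t attempts = range_size;
	uint32_t i;

	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	for (i = 0; i < attempts; i++, off++) {
		uint16_t port = min + off % range_size;

		if (!is_used(port))
			return port;
	}

	return 0;
}

int main(void)
{
	/* off = 0 mirrors the deterministic start now used for destination
	 * manipulation without NF_NAT_RANGE_PROTO_RANDOM_ALL; a random
	 * off reproduces the source-NAT behaviour.
	 */
	printf("picked port %u\n", (unsigned int)pick_free_port(32768, 4096, 0));
	return 0;
}

[Note how the kernel loop passes the shrinking budget, attempts - i, into nf_nat_used_tuple_harder(): once it drops to NF_NAT_HARDER_THRESH or below, the search is allowed to evict a colliding timewait entry rather than keep probing.]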
@@ -1179,6 +1379,7 @@ static void __exit nf_nat_cleanup(void) } MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c index 551abd2da614..0f9a559f6207 100644 --- a/net/netfilter/nf_nat_ovs.c +++ b/net/netfilter/nf_nat_ovs.c @@ -75,9 +75,10 @@ static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } err = nf_nat_packet(ct, ctinfo, hooknum, skb); +out: if (err == NF_ACCEPT) *action |= BIT(maniptype); -out: + return err; } diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 48cc60084d28..b14a434b9561 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -180,46 +180,6 @@ tcp_manip_pkt(struct sk_buff *skb, } static bool -dccp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct dccp_hdr *hdr; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (skb_ensure_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - false); -#endif - return true; -} - -static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, @@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb, case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); - case IPPROTO_DCCP: - return dccp_manip_pkt(skb, iphdroff, hdroff, - tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); @@ -668,7 +625,7 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int struct flowi fl; int err; - err = xfrm_decode_session(skb, &fl, family); + err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; @@ -697,6 +654,31 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int } #endif +static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) +{ + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + return false; + } + + dir = CTINFO2DIR(ctinfo); + if (dir != IP_CT_DIR_ORIGINAL) + return false; + + return ct->tuplehash[!dir].tuple.dst.u.all != sport; +} + static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -707,8 +689,20 @@ nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, ret = nf_nat_ipv4_fn(priv, skb, state); - if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr && - !inet_sk_transparent(sk)) + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* 
skb has a socket assigned via tcp edemux. We need to check + * if nf_nat_ipv4_fn() has mangled the packet in a way that + * edemux would not have found this socket. + * + * This includes both changes to the source address and changes + * to the source port, which are both handled by the + * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux + * might have found a socket for the old (pre-snat) address. + */ + if (saddr != ip_hdr(skb)->saddr || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; @@ -938,14 +932,36 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, } static unsigned int +nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct in6_addr saddr = ipv6_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv6_fn(priv, skb, state); + + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* see nf_nat_ipv4_local_in */ + if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) + skb_orphan(skb); + + return ret; +} + +static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - unsigned int ret; + unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); - if (ret != NF_DROP && ret != NF_STOLEN && + verdict = ret & NF_VERDICT_MASK; + if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -1051,7 +1067,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index f91579c821e9..5b37487d9d11 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -10,6 +10,7 @@ #include <linux/if.h> #include <linux/inetdevice.h> +#include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -24,81 +25,104 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> +static unsigned int +nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range, + const union nf_inet_addr *newdst) +{ + struct nf_nat_range2 newrange; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + memset(&newrange, 0, sizeof(newrange)); + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr = *newdst; + newrange.max_addr = *newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} + unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, +nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - struct nf_nat_range2 newrange; + union nf_inet_addr newdst = {}; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); - ct = nf_ct_get(skb, &ctinfo); - WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); - /* Local packets: make them go to loopback */ if (hooknum == NF_INET_LOCAL_OUT) { - newdst = htonl(0x7F000001); + newdst.ip = 
htonl(INADDR_LOOPBACK); } else { const struct in_device *indev; - newdst = 0; - indev = __in_dev_get_rcu(skb->dev); if (indev) { const struct in_ifaddr *ifa; ifa = rcu_dereference(indev->ifa_list); if (ifa) - newdst = ifa->ifa_local; + newdst.ip = ifa->ifa_local; } - if (!newdst) + if (!newdst.ip) return NF_DROP; } - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newdst; - newrange.max_addr.ip = newdst; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; +static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope) +{ + unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr); + + if (ifa_addr_type & IPV6_ADDR_MAPPED) + return false; + + if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC))) + return false; + + if (scope) { + unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK; + + if (!(scope & ifa_scope)) + return false; + } + + return true; +} + unsigned int nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range2 newrange; - struct in6_addr newdst; - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; + union nf_inet_addr newdst = {}; - ct = nf_ct_get(skb, &ctinfo); if (hooknum == NF_INET_LOCAL_OUT) { - newdst = loopback_addr; + newdst.in6 = loopback_addr; } else { + unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; bool addr = false; idev = __in6_dev_get(skb->dev); if (idev != NULL) { + const struct inet6_ifaddr *ifa; + read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { - newdst = ifa->addr; + if (!nf_nat_redirect_ipv6_usable(ifa, scope)) + continue; + + newdst.in6 = ifa->addr; addr = true; break; } @@ -109,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, return NF_DROP; } - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = newdst; - newrange.max_addr.in6 = newdst; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect(skb, range, &newdst); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1..7f12e56e6e52 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -82,11 +82,9 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; - struct nf_bridge_info *nf_bridge; - nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge) { - entry->physin = nf_bridge_get_physindev(skb); + if (nf_bridge_info_exists(skb)) { + entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; @@ -250,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, return 0; } EXPORT_SYMBOL_GPL(nf_queue); - -static unsigned int nf_iterate(struct 
sk_buff *skb, - struct nf_hook_state *state, - const struct nf_hook_entries *hooks, - unsigned int *index) -{ - const struct nf_hook_entry *hook; - unsigned int verdict, i = *index; - - while (i < hooks->num_hook_entries) { - hook = &hooks->hooks[i]; -repeat: - verdict = nf_hook_entry_hookfn(hook, skb, state); - if (verdict != NF_ACCEPT) { - *index = i; - if (verdict != NF_REPEAT) - return verdict; - goto repeat; - } - i++; - } - - *index = i; - return NF_ACCEPT; -} - -static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) -{ - switch (pf) { -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); -#endif - case NFPROTO_IPV4: - return rcu_dereference(net->nf.hooks_ipv4[hooknum]); - case NFPROTO_IPV6: - return rcu_dereference(net->nf.hooks_ipv6[hooknum]); - default: - WARN_ON_ONCE(1); - return NULL; - } - - return NULL; -} - -/* Caller must hold rcu read-side lock */ -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) -{ - const struct nf_hook_entry *hook_entry; - const struct nf_hook_entries *hooks; - struct sk_buff *skb = entry->skb; - const struct net *net; - unsigned int i; - int err; - u8 pf; - - net = entry->state.net; - pf = entry->state.pf; - - hooks = nf_hook_entries_head(net, pf, entry->state.hook); - - i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { - kfree_skb(skb); - nf_queue_entry_free(entry); - return; - } - - hook_entry = &hooks->hooks[i]; - - /* Continue traversal iff userspace said ok... */ - if (verdict == NF_REPEAT) - verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); - - if (verdict == NF_ACCEPT) { - if (nf_reroute(skb, entry) < 0) - verdict = NF_DROP; - } - - if (verdict == NF_ACCEPT) { -next_hook: - ++i; - verdict = nf_iterate(skb, &entry->state, hooks, &i); - } - - switch (verdict & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_STOP: - local_bh_disable(); - entry->state.okfn(entry->state.net, entry->state.sk, skb); - local_bh_enable(); - break; - case NF_QUEUE: - err = nf_queue(skb, &entry->state, i, verdict); - if (err == 1) - goto next_hook; - break; - case NF_STOLEN: - break; - default: - kfree_skb(skb); - } - - nf_queue_entry_free(entry); -} -EXPORT_SYMBOL(nf_reinject); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 16915f8eef2b..3fa3f5dfb264 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> #include <linux/proc_fs.h> @@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp_raw() & ~0x3f; + opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; @@ -617,7 +617,7 @@ synproxy_recv_client_ack(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = __cookie_v4_check(ip_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; @@ -800,7 +800,7 @@ synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; + 
iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit); iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; @@ -1034,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = nf_cookie_v6_check(ipv6_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8c09e4d12ac1..f3de2f9bbebf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -24,14 +24,19 @@ #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +#define NFT_SET_MAX_ANONLEN 16 + +/* limit compaction to avoid huge kmalloc/krealloc sizes. */ +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem)) unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); -static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -100,13 +105,12 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER, [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, + [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET, }; -static void nft_validate_state_update(struct net *net, u8 new_validate_state) +static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state) { - struct nftables_pernet *nft_net = nft_pernet(net); - - switch (nft_net->validate_state) { + switch (table->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; @@ -117,10 +121,12 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) return; } - nft_net->validate_state = new_validate_state; + table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); -static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); + +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, @@ -141,37 +147,61 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; + + bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } -static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, - int msg_type, u32 size, gfp_t gfp) +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) { struct nft_trans *trans; - trans = kzalloc(sizeof(struct nft_trans) + size, gfp); + trans = kzalloc(size, GFP_KERNEL); if (trans == NULL) return NULL; INIT_LIST_HEAD(&trans->list); trans->msg_type = msg_type; - trans->ctx = *ctx; + + trans->net = ctx->net; + trans->table = ctx->table; + trans->seq = ctx->seq; + trans->flags = ctx->flags; + trans->report = ctx->report; return trans; } -static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, - int msg_type, u32 size) +static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) +{ + switch 
(trans->msg_type) { + case NFT_MSG_NEWCHAIN: + case NFT_MSG_NEWSET: + return container_of(trans, struct nft_trans_binding, nft_trans); + } + + return NULL; +} + +static void nft_trans_list_del(struct nft_trans *trans) { - return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); + struct nft_trans_binding *trans_binding; + + list_del(&trans->list); + + trans_binding = nft_trans_get_binding(trans); + if (trans_binding) + list_del(&trans_binding->binding_list); } static void nft_trans_destroy(struct nft_trans *trans) { - list_del(&trans->list); + nft_trans_list_del(trans); kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -185,53 +215,154 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; + break; + } + } +} + +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = bind; + break; + case NFT_MSG_NEWRULE: + if (nft_trans_rule_chain(trans) == chain) + nft_trans_rule_bound(trans) = bind; break; } } } +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + if (!nft_use_inc(&chain->use)) + return -EMFILE; + + chain->bound = true; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct 
nf_hook_ops *ops, *next;
+
+	list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
+		list_del(&ops->list);
+		kfree(ops);
+	}
+}
+
+static void nft_netdev_hook_free(struct nft_hook *hook)
+{
+	nft_netdev_hook_free_ops(hook);
+	kfree(hook);
+}
+
+static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
+{
+	struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu);
+
+	nft_netdev_hook_free(hook);
+}
+
+static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
+{
+	call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
+}
+
 static void nft_netdev_unregister_hooks(struct net *net,
 					struct list_head *hook_list,
 					bool release_netdev)
 {
 	struct nft_hook *hook, *next;
+	struct nf_hook_ops *ops;

 	list_for_each_entry_safe(hook, next, hook_list, list) {
-		nf_unregister_net_hook(net, &hook->ops);
+		list_for_each_entry(ops, &hook->ops_list, list)
+			nf_unregister_net_hook(net, ops);
 		if (release_netdev) {
 			list_del(&hook->list);
-			kfree_rcu(hook, rcu);
+			nft_netdev_hook_free_rcu(hook);
 		}
 	}
 }
@@ -290,11 +421,130 @@ static void nf_tables_unregister_hook(struct net *net,
 	return __nf_tables_unregister_hook(net, table, chain, false);
 }

+static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
+{
+	/* NB: the ->bound equality check is defensive, at this time we only merge
+	 * a new nft_trans_elem transaction request with the transaction tail
+	 * element, but a->bound != b->bound would imply a NEWRULE transaction
+	 * is queued in-between.
+	 *
+	 * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents
+	 * huge krealloc() requests.
+	 */
+	return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
+}
+
+static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
+					struct nft_trans_elem *tail,
+					struct nft_trans_elem *trans)
+{
+	unsigned int nelems, old_nelems = tail->nelems;
+	struct nft_trans_elem *new_trans;
+
+	if (!nft_trans_collapse_set_elem_allowed(tail, trans))
+		return false;
+
+	/* "cannot happen", at this time userspace element add
+	 * requests always allocate a new transaction element.
+	 *
+	 * This serves as a reminder to adjust the list_add_tail
+	 * logic below in case this ever changes.
+	 */
+	if (WARN_ON_ONCE(trans->nelems != 1))
+		return false;
+
+	if (check_add_overflow(old_nelems, trans->nelems, &nelems))
+		return false;
+
+	/* krealloc might free tail which invalidates list pointers */
+	list_del_init(&tail->nft_trans.list);
+
+	new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+			     GFP_KERNEL);
+	if (!new_trans) {
+		list_add_tail(&tail->nft_trans.list,
+			      &nft_net->commit_list);
+		return false;
+	}
+
+	/*
+	 * new_trans->nft_trans.list contains garbage, but
+	 * list_add_tail() doesn't care.
+	 */
+	new_trans->nelems = nelems;
+	new_trans->elems[old_nelems] = trans->elems[0];
+	list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
+
+	return true;
+}
+
+static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
+				   struct nft_trans *trans)
+{
+	struct nft_trans *tail;
+
+	if (list_empty(&nft_net->commit_list))
+		return false;
+
+	tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
+
+	if (tail->msg_type != trans->msg_type)
+		return false;
+
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWSETELEM:
+	case NFT_MSG_DELSETELEM:
+		return nft_trans_collapse_set_elem(nft_net,
+						   nft_trans_container_elem(tail),
+						   nft_trans_container_elem(trans));
+	}
+
+	return false;
+}
+
 static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
+	struct nft_trans_binding *binding;
+	struct nft_trans_set *trans_set;

 	list_add_tail(&trans->list, &nft_net->commit_list);
+
+	binding = nft_trans_get_binding(trans);
+	if (!binding)
+		return;
+
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWSET:
+		trans_set = nft_trans_container_set(trans);
+
+		if (!nft_trans_set_update(trans) &&
+		    nft_set_is_anonymous(nft_trans_set(trans)))
+			list_add_tail(&binding->binding_list, &nft_net->binding_list);
+
+		list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list);
+		break;
+	case NFT_MSG_NEWCHAIN:
+		if (!nft_trans_chain_update(trans) &&
+		    nft_chain_binding(nft_trans_chain(trans)))
+			list_add_tail(&binding->binding_list, &nft_net->binding_list);
+		break;
+	}
+}
+
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
+{
+	struct nftables_pernet *nft_net = nft_pernet(net);
+
+	WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
+		     trans->msg_type != NFT_MSG_DELSETELEM);
+
+	if (nft_trans_try_collapse(nft_net, trans)) {
+		kfree(trans);
+		return;
+	}
+
+	nft_trans_commit_list_add_tail(net, trans);
 }

 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -324,11 +574,28 @@ static int nft_deltable(struct nft_ctx *ctx)
 	return err;
 }

-static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+static struct nft_trans *
+nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type)
 {
+	struct nft_trans_chain *trans_chain;
 	struct nft_trans *trans;

 	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+	if (!trans)
+		return NULL;
+
+	trans_chain = nft_trans_container_chain(trans);
+	INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list);
+	trans_chain->chain = ctx->chain;
+
+	return trans;
+}
+
+static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc_chain(ctx, msg_type);
 	if (trans == NULL)
 		return ERR_PTR(-ENOMEM);

@@ -340,8 +607,8 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
 				ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
 		}
 	}
-	nft_trans_commit_list_add_tail(ctx->net, trans);
+
 	return trans;
 }

@@ -353,14 +620,13 @@ static int nft_delchain(struct nft_ctx *ctx)
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

-	ctx->table->use--;
+	nft_use_dec(&ctx->table->use);
 	nft_deactivate_next(ctx->net, ctx->chain);

 	return 0;
 }

-static void nft_rule_expr_activate(const struct nft_ctx *ctx,
-				   struct nft_rule *rule)
+void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
 {
 	struct nft_expr *expr;

@@ -373,9 +639,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
 	}
 }

-static void
nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; @@ -394,7 +659,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; @@ -414,6 +679,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; + nft_trans_rule_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; @@ -469,12 +735,17 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set, const struct nft_set_desc *desc) { + struct nft_trans_set *trans_set; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); if (trans == NULL) return -ENOMEM; + trans_set = nft_trans_container_set(trans); + INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); + INIT_LIST_HEAD(&trans_set->list_trans_newset); + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); @@ -485,6 +756,7 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_trans_set_update(trans) = true; nft_trans_set_gc_int(trans) = desc->gc_int; nft_trans_set_timeout(trans) = desc->timeout; + nft_trans_set_size(trans) = desc->size; } nft_trans_commit_list_add_tail(ctx->net, trans); @@ -497,6 +769,60 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + struct nft_elem_priv *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + nft_set_elem_change_active(ctx->net, set, ext); + nft_setelem_data_deactivate(ctx->net, set, catchall->elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -505,8 +831,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + 
nft_deactivate_next(ctx->net, set); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -538,20 +867,21 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; nft_deactivate_next(ctx->net, obj); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } -static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, - struct nft_flowtable *flowtable) +static struct nft_trans * +nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); @@ -560,22 +890,22 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); - return 0; + return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { - int err; + struct nft_trans *trans; - err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); - if (err < 0) - return err; + trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (IS_ERR(trans)) + return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); - ctx->table->use--; + nft_use_dec(&ctx->table->use); - return err; + return 0; } static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) @@ -663,7 +993,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, - u8 genmask, u32 nlpid) + int family, u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; @@ -671,6 +1001,7 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && + table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && nlpid && table->nlpid != nlpid) @@ -792,11 +1123,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -814,19 +1148,28 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || - nla_put_be32(skb, NFTA_TABLE_FLAGS, - htonl(table->flags & NFT_TABLE_F_MASK)) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; + + if (event == NFT_MSG_DELTABLE || + event == 
NFT_MSG_DESTROYTABLE) { + nlmsg_end(skb, nlh); + return 0; + } + + if (nla_put_be32(skb, NFTA_TABLE_FLAGS, + htonl(table->flags & NFT_TABLE_F_MASK))) + goto nla_put_failure; + if (nft_table_has_owner(table) && nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) goto nla_put_failure; @@ -900,7 +1243,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -1044,8 +1387,30 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) #define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) #define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) #define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) +#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ - __NFT_TABLE_F_WAS_AWAKEN) + __NFT_TABLE_F_WAS_AWAKEN | \ + __NFT_TABLE_F_WAS_ORPHAN) + +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(nft_trans_chain(trans))))) + return true; + } + + return false; +} static int nf_tables_updtable(struct nft_ctx *ctx) { @@ -1060,15 +1425,22 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags & ~NFT_TABLE_F_MASK) return -EOPNOTSUPP; - if (flags == ctx->table->flags) + if (flags == (ctx->table->flags & NFT_TABLE_F_MASK)) return 0; if ((nft_table_has_owner(ctx->table) && !(flags & NFT_TABLE_F_OWNER)) || - (!nft_table_has_owner(ctx->table) && - flags & NFT_TABLE_F_OWNER)) + (flags & NFT_TABLE_F_OWNER && + !nft_table_is_orphan(ctx->table))) return -EOPNOTSUPP; + if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) + return -EOPNOTSUPP; + + /* No dormant off/on/off/on games in single transaction */ + if (nft_table_pending_update(ctx)) + return -EINVAL; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -1091,12 +1463,20 @@ static int nf_tables_updtable(struct nft_ctx *ctx) } } + if ((flags & NFT_TABLE_F_OWNER) && + !nft_table_has_owner(ctx->table)) { + ctx->table->nlpid = ctx->portid; + ctx->table->flags |= NFT_TABLE_F_OWNER | + __NFT_TABLE_F_WAS_ORPHAN; + } + nft_trans_table_update(trans) = true; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_register_hooks: + ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } @@ -1224,6 +1604,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, if (table == NULL) goto err_kzalloc; + table->validate_state = nft_net->validate_state; table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (table->name == NULL) goto err_strdup; @@ -1281,7 +1662,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -1295,8 +1676,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - 
!list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -1326,7 +1706,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -1392,7 +1772,7 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; - table = nft_table_lookup_byhandle(net, attr, genmask, + table = nft_table_lookup_byhandle(net, attr, family, genmask, NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; @@ -1401,6 +1781,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } @@ -1415,15 +1799,15 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, return nft_flush_table(&ctx); } -static void nf_tables_table_destroy(struct nft_ctx *ctx) +static void nf_tables_table_destroy(struct nft_table *table) { - if (WARN_ON(ctx->table->use > 0)) + if (WARN_ON(table->use > 0)) return; - rhltable_destroy(&ctx->table->chains_ht); - kfree(ctx->table->name); - kfree(ctx->table->udata); - kfree(ctx->table); + rhltable_destroy(&table->chains_ht); + kfree(table->name); + kfree(table->udata); + kfree(table); } void nft_register_chain_type(const struct nft_chain_type *ctype) @@ -1570,8 +1954,22 @@ nla_put_failure: return -ENOSPC; } -static int nft_dump_basechain_hook(struct sk_buff *skb, int family, - const struct nft_base_chain *basechain) +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? 
NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + +static int nft_dump_basechain_hook(struct sk_buff *skb, + const struct net *net, int family, + const struct nft_base_chain *basechain, + const struct list_head *hook_list) { const struct nf_hook_ops *ops = &basechain->ops; struct nft_hook *hook, *first = NULL; @@ -1588,19 +1986,26 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (nft_base_chain_netdev(family, ops->hooknum)) { nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); - list_for_each_entry(hook, &basechain->hook_list, list) { + if (!nest_devs) + goto nla_put_failure; + + if (!hook_list) + hook_list = &basechain->hook_list; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (!first) first = hook; - if (nla_put_string(skb, NFTA_DEVICE_NAME, - hook->ops.dev->name)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1613,29 +2018,35 @@ nla_put_failure: static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + const struct list_head *hook_list) { struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), + if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) || + nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) || + nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), NFTA_CHAIN_PAD)) goto nla_put_failure; - if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) - goto nla_put_failure; + + if (!hook_list && + (event == NFT_MSG_DELCHAIN || + event == NFT_MSG_DESTROYCHAIN)) { + nlmsg_end(skb, nlh); + return 0; + } if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, family, basechain)) + if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1670,7 +2081,8 @@ nla_put_failure: return -1; } -static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, + const struct list_head *hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; @@ -1690,7 +2102,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain); + ctx->chain, hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -1716,7 +2128,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); 
list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -1736,7 +2148,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, - chain) < 0) + chain, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1790,7 +2202,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, - 0, family, table, chain); + 0, family, table, chain, NULL); if (err < 0) goto err_fill_chain_info; @@ -1816,14 +2228,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return ERR_PTR(-EINVAL); + return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -1837,18 +2249,19 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct nft_trans *trans) +static void nft_chain_stats_replace(struct nft_trans_chain *trans) { - struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); + const struct nft_trans *t = &trans->nft_trans_binding.nft_trans; + struct nft_base_chain *chain = nft_base_chain(trans->chain); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) return; - nft_trans_chain_stats(trans) = - rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + trans->stats = + rcu_replace_pointer(chain->stats, trans->stats, + lockdep_commit_lock_is_held(t->net)); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) static_branch_inc(&nft_counters_enabled); } @@ -1866,9 +2279,9 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->blob_next); } -void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = ctx->chain; + const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) @@ -1880,11 +2293,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); - if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -1903,36 +2316,47 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { + struct nf_hook_ops *ops; struct net_device *dev; - char ifname[IFNAMSIZ]; struct nft_hook *hook; int err; - hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); + if (!hook) + return ERR_PTR(-ENOMEM); + + 
INIT_LIST_HEAD(&hook->ops_list); + + err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); + if (err < 0) + goto err_hook_free; + + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; - nla_strscpy(ifname, attr, IFNAMSIZ); /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. */ - dev = __dev_get_by_name(net, ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -1942,7 +2366,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (this->ops.dev == hook->ops.dev) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -1951,25 +2376,36 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, static int nf_tables_parse_netdev_hooks(struct net *net, const struct nlattr *attr, - struct list_head *hook_list) + struct list_head *hook_list, + struct netlink_ext_ack *extack) { struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { + NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); goto err_hook; } if (nft_hook_list_find(hook_list, hook)) { - kfree(hook); + NL_SET_BAD_ATTR(extack, tmp); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -1987,7 +2423,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -1999,38 +2435,41 @@ struct nft_chain_hook { struct list_head list; }; -static int nft_chain_parse_netdev(struct net *net, - struct nlattr *tb[], - struct list_head *hook_list) +static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], + struct list_head *hook_list, + struct netlink_ext_ack *extack, u32 flags) { struct nft_hook *hook; int err; if (tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); - if (IS_ERR(hook)) + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); + if (IS_ERR(hook)) { + NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); + } list_add_tail(&hook->list, hook_list); } else if (tb[NFTA_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], - hook_list); + hook_list, extack); if (err < 0) return err; - if (list_empty(hook_list)) - return -EINVAL; - } else { - return -EINVAL; } + if (flags & NFT_CHAIN_HW_OFFLOAD && + list_empty(hook_list)) + return -EINVAL; + return 0; } static int nft_chain_parse_hook(struct net *net, + struct nft_base_chain *basechain, const struct 
nlattr * const nla[], struct nft_chain_hook *hook, u8 family, - struct netlink_ext_ack *extack, bool autoload) + u32 flags, struct netlink_ext_ack *extack) { struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; @@ -2046,31 +2485,57 @@ static int nft_chain_parse_hook(struct net *net, if (err < 0) return err; - if (ha[NFTA_HOOK_HOOKNUM] == NULL || - ha[NFTA_HOOK_PRIORITY] == NULL) - return -EINVAL; + if (!basechain) { + if (!ha[NFTA_HOOK_HOOKNUM] || + !ha[NFTA_HOOK_PRIORITY]) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); + return -ENOENT; + } - hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); - if (!type) - return -EOPNOTSUPP; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; - if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], - family, autoload); - if (IS_ERR(type)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); - return PTR_ERR(type); + if (nla[NFTA_CHAIN_TYPE]) { + type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], + family, true); + if (IS_ERR(type)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); + return PTR_ERR(type); + } } - } - if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) - return -EOPNOTSUPP; + if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) + return -EOPNOTSUPP; - if (type->type == NFT_CHAIN_T_NAT && - hook->priority <= NF_IP_PRI_CONNTRACK) - return -EOPNOTSUPP; + if (type->type == NFT_CHAIN_T_NAT && + hook->priority <= NF_IP_PRI_CONNTRACK) + return -EOPNOTSUPP; + } else { + if (ha[NFTA_HOOK_HOOKNUM]) { + hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + if (hook->num != basechain->ops.hooknum) + return -EOPNOTSUPP; + } + if (ha[NFTA_HOOK_PRIORITY]) { + hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + if (hook->priority != basechain->ops.priority) + return -EOPNOTSUPP; + } + + if (nla[NFTA_CHAIN_TYPE]) { + type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + family); + if (!type) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); + return -ENOENT; + } + } else { + type = basechain->type; + } + } if (!try_module_get(type->owner)) { if (nla[NFTA_CHAIN_TYPE]) @@ -2082,7 +2547,7 @@ static int nft_chain_parse_hook(struct net *net, INIT_LIST_HEAD(&hook->list); if (nft_base_chain_netdev(family, hook->num)) { - err = nft_chain_parse_netdev(net, ha, &hook->list); + err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags); if (err < 0) { module_put(type->owner); return err; @@ -2101,43 +2566,39 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } -struct nft_rules_old { - struct rcu_head h; - struct nft_rule_blob *blob; -}; - -static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr) +static void nft_last_rule(const struct nft_chain *chain, const void *ptr) { - struct nft_rule_dp *prule; + struct nft_rule_dp_last *lrule; - prule = (struct nft_rule_dp *)ptr; - prule->is_last = 1; + BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0); + + lrule = (struct nft_rule_dp_last *)ptr; + lrule->end.is_last = 1; + lrule->chain = chain; /* 
blob size does not include the trailer rule */ } -static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size) +static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain, + unsigned int size) { struct nft_rule_blob *blob; - /* size must include room for the last rule */ - if (size < offsetof(struct nft_rule_dp, data)) - return NULL; - - size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rules_old); if (size > INT_MAX) return NULL; + size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last); + blob = kvmalloc(size, GFP_KERNEL_ACCOUNT); if (!blob) return NULL; blob->size = 0; - nft_last_rule(blob, blob->data); + nft_last_rule(chain, blob->data); return blob; } @@ -2158,6 +2619,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2166,14 +2628,12 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); - - basechain->ops.hooknum = hook->num; - basechain->ops.priority = hook->priority; - } else { - nft_basechain_hook_init(&basechain->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } + nft_basechain_hook_init(&basechain->ops, family, hook, chain); chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; @@ -2188,7 +2648,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } -static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; @@ -2204,9 +2664,8 @@ static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) static u64 chain_id; -static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy, u32 flags, - struct netlink_ext_ack *extack) +static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, + u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2216,21 +2675,20 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_rule_blob *blob; struct nft_trans *trans; struct nft_chain *chain; - unsigned int data_size; int err; - if (table->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_CHAIN_HOOK]) { struct nft_stats __percpu *stats = NULL; - struct nft_chain_hook hook; + struct nft_chain_hook hook = {}; + + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; - err = nft_chain_parse_hook(net, nla, &hook, family, extack, - true); + err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags, + extack); if (err < 0) return err; @@ -2243,10 +2701,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { + if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); - return PTR_ERR(stats); + return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } @@ -2304,8 +2762,7 @@ 
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } - data_size = offsetof(struct nft_rule_dp, data); /* last rule */ - blob = nf_tables_chain_alloc_rules(data_size); + blob = nf_tables_chain_alloc_rules(chain, 0); if (!blob) { err = -ENOMEM; goto err_destroy_chain; @@ -2314,14 +2771,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, RCU_INIT_POINTER(chain->blob_gen_0, blob); RCU_INIT_POINTER(chain->blob_gen_1, blob); - err = nf_tables_register_hook(net, table, chain); - if (err < 0) + if (!nft_use_inc(&table->use)) { + err = -EMFILE; goto err_destroy_chain; + } trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto err_unregister_hook; + goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; @@ -2329,81 +2787,96 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_trans_chain_policy(trans) = policy; err = nft_chain_add(table, chain); - if (err < 0) { - nft_trans_destroy(trans); - goto err_unregister_hook; - } + if (err < 0) + goto err_chain_add; - table->use++; + /* This must be LAST to ensure no packets are walking over this chain. */ + err = nf_tables_register_hook(net, table, chain); + if (err < 0) + goto err_register_hook; return 0; -err_unregister_hook: - nf_tables_unregister_hook(net, table, chain); + +err_register_hook: + nft_chain_del(chain); +err_chain_add: + nft_trans_destroy(trans); +err_trans: + nft_use_dec_restore(&table->use); err_destroy_chain: - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(chain); return err; } -static bool nft_hook_list_equal(struct list_head *hook_list1, - struct list_head *hook_list2) -{ - struct nft_hook *hook; - int n = 0, m = 0; - - n = 0; - list_for_each_entry(hook, hook_list2, list) { - if (!nft_hook_list_find(hook_list1, hook)) - return false; - - n++; - } - list_for_each_entry(hook, hook_list1, list) - m++; - - return n == m; -} - static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags, const struct nlattr *attr, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; + struct nft_base_chain *basechain = NULL; struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; - struct nft_base_chain *basechain; - struct nft_stats *stats = NULL; - struct nft_chain_hook hook; + struct nft_chain_hook hook = {}; + struct nft_stats __percpu *stats = NULL; + struct nftables_pernet *nft_net; + struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; + bool unregister = false; int err; if (chain->flags ^ flags) return -EOPNOTSUPP; + INIT_LIST_HEAD(&hook.list); + if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, - extack, false); + + basechain = nft_base_chain(chain); + err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook, + ctx->family, flags, extack); if (err < 0) return err; - basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nft_base_chain_netdev(ctx->family, hook.num)) { - if (!nft_hook_list_equal(&basechain->hook_list, - &hook.list)) { - nft_chain_release_hook(&hook); - NL_SET_BAD_ATTR(extack, attr); - return -EEXIST; + if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { + list_for_each_entry_safe(h, next, &hook.list, 
list) { + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } + + if (nft_hook_list_find(&basechain->hook_list, h)) { + list_del(&h->list); + nft_netdev_hook_free(h); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWCHAIN || + trans->table != ctx->table || + !nft_trans_chain_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) { + nft_chain_release_hook(&hook); + return -EEXIST; + } + } } } else { ops = &basechain->ops; @@ -2414,7 +2887,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EEXIST; } } - nft_chain_release_hook(&hook); } if (nla[NFTA_CHAIN_HANDLE] && @@ -2425,24 +2897,50 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); - return -EEXIST; + err = -EEXIST; + goto err_hooks; + } + } + + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nft_is_base_chain(chain) && + !list_empty(&hook.list)) { + basechain = nft_base_chain(chain); + ops = &basechain->ops; + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { + err = nft_netdev_register_hooks(ctx->net, &hook.list); + if (err < 0) + goto err_hooks; + + unregister = true; } } if (nla[NFTA_CHAIN_COUNTERS]) { - if (!nft_is_base_chain(chain)) - return -EOPNOTSUPP; + if (!nft_is_base_chain(chain)) { + err = -EOPNOTSUPP; + goto err_hooks; + } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) - return PTR_ERR(stats); + if (IS_ERR_PCPU(stats)) { + err = PTR_ERR_PCPU(stats); + goto err_hooks; + } } err = -ENOMEM; - trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN, - sizeof(struct nft_trans_chain)); + trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN); if (trans == NULL) - goto err; + goto err_trans; nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; @@ -2461,47 +2959,67 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = -ENOMEM; name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); if (!name) - goto err; + goto err_trans; err = -EEXIST; list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && - tmp->ctx.table == table && + tmp->table == table && nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); - goto err; + goto err_trans; } } nft_trans_chain_name(trans) = name; } + + nft_trans_basechain(trans) = basechain; + INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); + list_splice(&hook.list, &nft_trans_chain_hooks(trans)); + if (nla[NFTA_CHAIN_HOOK]) + module_put(hook.type->owner); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; -err: + +err_trans: free_percpu(stats); kfree(trans); +err_hooks: + if (nla[NFTA_CHAIN_HOOK]) { + list_for_each_entry_safe(h, next, &hook.list, list) { + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } + list_del(&h->list); + nft_netdev_hook_free_rcu(h); + } + 
module_put(hook.type->owner); + } + return err; } static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { - struct nft_chain *chain = trans->ctx.chain; - if (trans->msg_type == NFT_MSG_NEWCHAIN && - chain->table == table && - id == nft_trans_chain_id(trans)) - return chain; + nft_trans_chain(trans)->table == table && + id == nft_trans_chain_id(trans) && + nft_active_genmask(nft_trans_chain(trans), genmask)) + return nft_trans_chain(trans); } return ERR_PTR(-ENOENT); } @@ -2604,7 +3122,59 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, extack); } - return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); + return nf_tables_addchain(&ctx, family, policy, flags, extack); +} + +static int nft_delchain_hook(struct nft_ctx *ctx, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + const struct nft_chain *chain = &basechain->chain; + const struct nlattr * const *nla = ctx->nla; + struct nft_chain_hook chain_hook = {}; + struct nft_hook *this, *hook; + LIST_HEAD(chain_del_list); + struct nft_trans *trans; + int err; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, + ctx->family, chain->flags, extack); + if (err < 0) + return err; + + list_for_each_entry(this, &chain_hook.list, list) { + hook = nft_hook_list_find(&basechain->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_chain_del_hook; + } + list_move(&hook->list, &chain_del_list); + } + + trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); + if (!trans) { + err = -ENOMEM; + goto err_chain_del_hook; + } + + nft_trans_basechain(trans) = basechain; + nft_trans_chain_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); + list_splice(&chain_del_list, &nft_trans_chain_hooks(trans)); + nft_chain_release_hook(&chain_hook); + + nft_trans_commit_list_add_tail(ctx->net, trans); + + return 0; + +err_chain_del_hook: + list_splice(&chain_del_list, &basechain->hook_list); + nft_chain_release_hook(&chain_hook); + + return err; } static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, @@ -2639,16 +3209,36 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); + + if (nla[NFTA_CHAIN_HOOK]) { + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + + if (nft_is_base_chain(chain)) { + struct nft_base_chain *basechain = nft_base_chain(chain); + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_delchain_hook(&ctx, basechain, extack); + } + } + if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); - use = chain->use; list_for_each_entry(rule, &chain->rules, list) { if 
(!nft_is_active_next(net, rule)) @@ -2684,6 +3274,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, */ int nft_register_expr(struct nft_expr_type *type) { + if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) + return -ENOMEM; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -2713,7 +3306,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -2745,9 +3338,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -2881,28 +3478,40 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, if (err < 0) return err; - if (!tb[NFTA_EXPR_DATA]) + if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; + rcu_read_lock(); + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (!type) - return -ENOENT; + if (!type) { + err = -ENOENT; + goto out_unlock; + } - if (!type->inner_ops) - return -EOPNOTSUPP; + if (!type->inner_ops) { + err = -EOPNOTSUPP; + goto out_unlock; + } err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) - goto err_nla_parse; + goto out_unlock; info->attr = nla; info->ops = type->inner_ops; + /* No module reference will be taken on type->owner. + * Presence of type->inner_ops implies that the expression + * is builtin, so it cannot go away. 
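By contrast, the regular expression lookup in the nft_expr_type_get() hunk above does pin the owning module before leaving the RCU section; condensed from that hunk (simplified, modprobe fallback elided):

	rcu_read_lock();
	type = __nft_expr_type_get(family, nla);
	if (type != NULL && try_module_get(type->owner)) {
		rcu_read_unlock();
		return type;	/* balanced by a later module_put() */
	}
	rcu_read_unlock();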
+ */ + rcu_read_unlock(); return 0; -err_nla_parse: +out_unlock: + rcu_read_unlock(); return err; } @@ -2974,18 +3583,17 @@ err_expr_parse: return ERR_PTR(err); } -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } + if (WARN_ON_ONCE(!src->ops->clone)) + return -EINVAL; + + dst->ops = src->ops; + err = src->ops->clone(dst, src, gfp); + if (err < 0) + return err; __module_get(src->ops->type->owner); @@ -3002,13 +3610,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *__nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry_rcu(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list, + lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } @@ -3016,13 +3626,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -3031,7 +3642,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, - [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, @@ -3055,7 +3666,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3143,33 +3754,44 @@ err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } +static void audit_log_rule_reset(const struct nft_table *table, + unsigned int base_seq, + unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", + table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); +} + struct nft_rule_dump_ctx { + unsigned int s_idx; char *table; char *chain; + bool reset; }; static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain, - bool reset) + const struct nft_chain *chain) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; - unsigned int s_idx = cb->args[0]; + unsigned int entries = 0; + int ret = 0; u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, 
rule)) goto cont_skip; - if (*idx < s_idx) + if (*idx < ctx->s_idx) goto cont; - if (*idx > s_idx) { - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - } if (prule) handle = prule->handle; else @@ -3180,46 +3802,48 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle, reset) < 0) - return 1; - + table, chain, rule, handle, ctx->reset) < 0) { + ret = 1; + break; + } + entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: prule = rule; cont_skip: (*idx)++; } - return 0; + + if (ctx->reset && entries) + audit_log_rule_reset(table, cb->seq, entries); + + return ret; } static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; - bool reset = false; - - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) - reset = true; rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; - if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) + if (ctx->table && strcmp(ctx->table, table->name) != 0) continue; - if (ctx && ctx->table && ctx->chain) { + if (ctx->table && ctx->chain) { struct rhlist_head *list, *tmp; list = rhltable_lookup(&table->chains_ht, ctx->chain, @@ -3231,7 +3855,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain, reset); + cb, table, chain); break; } goto done; @@ -3239,68 +3863,81 @@ static int nf_tables_dump_rules(struct sk_buff *skb, list_for_each_entry_rcu(chain, &table->chains, list) { if (__nf_tables_dump_rules(skb, &idx, - cb, table, chain, reset)) + cb, table, chain)) goto done; } - if (ctx && ctx->table) + if (ctx->table) break; } done: rcu_read_unlock(); - cb->args[0] = idx; + ctx->s_idx = idx; return skb->len; } +static int nf_tables_dumpreset_rules(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + /* The mutex is held to prevent two concurrent dump-and-reset calls + * from underrunning counters and quotas. The commit_mutex is used + * for lack of a better lock; this is not the transaction path.
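Concretely, the race this rules out, with illustrative numbers that are not from the patch:

	 * (With a counter at 60 packets, two racing GETRULE_RESET dumps
	 *  could each read 60, each report 60 and each reset the counter
	 *  to 0, so 120 packets get reported where only 60 were counted;
	 *  serialized on commit_mutex, the second dump reads 0 instead.)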
+ */ + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_rules(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_rules_start(struct netlink_callback *cb) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; - struct nft_rule_dump_ctx *ctx = NULL; - if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); - if (!ctx) - return -ENOMEM; + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - if (nla[NFTA_RULE_TABLE]) { - ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_ATOMIC); - if (!ctx->table) { - kfree(ctx); - return -ENOMEM; - } - } - if (nla[NFTA_RULE_CHAIN]) { - ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_ATOMIC); - if (!ctx->chain) { - kfree(ctx->table); - kfree(ctx); - return -ENOMEM; - } + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC); + if (!ctx->table) + return -ENOMEM; + } + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC); + if (!ctx->chain) { + kfree(ctx->table); + return -ENOMEM; } } - - cb->data = ctx; return 0; } +static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) +{ + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_rules_start(cb); +} + static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; - if (ctx) { - kfree(ctx->table); - kfree(ctx->chain); - kfree(ctx); - } + kfree(ctx->table); + kfree(ctx->chain); return 0; } -/* called with rcu_read_lock held */ -static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const nla[]) +/* Caller must hold rcu read lock or transaction mutex */ +static struct sk_buff * +nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); @@ -3310,61 +3947,113 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - bool reset = false; int err; - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start= nf_tables_dump_rules_start, - .dump = nf_tables_dump_rules, - .done = nf_tables_dump_rules_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + return ERR_CAST(chain); } - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); - return PTR_ERR(rule); + return ERR_CAST(rule); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; - - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) - reset = true; + return ERR_PTR(-ENOMEM); - err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, 
portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, 0, reset); - if (err < 0) - goto err_fill_rule_info; + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + return skb2; +} -err_fill_rule_info: - kfree_skb(skb2); - return err; +static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dump_rules_start, + .dump = nf_tables_dump_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + skb2 = nf_tables_getrule_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + return nfnetlink_unicast(skb2, net, portid); +} + +static int nf_tables_getrule_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dumpreset_rules_start, + .dump = nf_tables_dumpreset_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getrule_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_RULE_TABLE]), + (char *)nla_data(nla[NFTA_RULE_TABLE]), + nft_base_seq(net)); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); + + return nfnetlink_unicast(skb2, net, portid); } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; @@ -3381,16 +4070,27 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) +/* can only be used if rule is no longer visible to dumps */ +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. 
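The kernel-doc above stops short of a Return: section; judging from the function body that follows, it would read:

	 * Return: 0 on success; -EMLINK once the jump-depth limit is
	 *	reached, -EINTR if a fatal signal is pending, or the first
	 *	error returned by an expression's ->validate() callback.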
+ */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; - const struct nft_data *data; struct nft_rule *rule; int err; @@ -3398,6 +4098,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { + if (fatal_signal_pending(current)) + return -EINTR; + if (!nft_is_active_next(ctx->net, rule)) continue; @@ -3405,12 +4108,13 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; - err = expr->ops->validate(ctx, expr, &data); + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ + err = expr->ops->validate(ctx, expr); if (err < 0) return err; } - - cond_resched(); } return 0; @@ -3434,11 +4138,70 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) err = nft_chain_validate(&ctx, chain); if (err < 0) return err; + + cond_resched(); + } + + return 0; +} + +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; } return 0; } +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; + struct nft_set_elem_catchall *catchall; + + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, dummy_iter.genmask)) + continue; + + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); + if (ret < 0) + return ret; + } + + return ret; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); @@ -3483,11 +4246,10 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } - if (nft_chain_is_bound(chain)) - return -EOPNOTSUPP; } else if (nla[NFTA_RULE_CHAIN_ID]) { - chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]); + chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID], + genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); return PTR_ERR(chain); @@ -3496,9 +4258,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EINVAL; } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nft_rule_lookup(chain, handle); + rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); @@ -3518,12 +4283,9 @@ static int nf_tables_newrule(struct 
sk_buff *skb, const struct nfnl_info *info, return -EINVAL; handle = nf_tables_alloc_handle(table); - if (chain->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); @@ -3600,7 +4362,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (expr_info[i].ops->validate) - nft_validate_state_update(net, NFT_VALIDATE_NEED); + nft_validate_state_update(table, NFT_VALIDATE_NEED); expr_info[i].ops = NULL; expr = nft_expr_next(expr); @@ -3614,7 +4376,17 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } + if (!nft_use_inc(&chain->use)) { + err = -EMFILE; + goto err_release_rule; + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_chain_binding(chain)) { + err = -EOPNOTSUPP; + goto err_destroy_flow_rule; + } + err = nft_delrule(&ctx, old_rule); if (err < 0) goto err_destroy_flow_rule; @@ -3645,21 +4417,22 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } kvfree(expr_info); - chain->use++; if (flow) nft_trans_flow_rule(trans) = flow; - if (nft_net->validate_state == NFT_VALIDATE_DO) + if (table->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); return 0; err_destroy_flow_rule: + nft_use_dec_restore(&chain->use); if (flow) nft_flow_rule_destroy(flow); err_release_rule: - nf_tables_rule_release(&ctx, rule); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); + nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { @@ -3682,12 +4455,10 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { - struct nft_rule *rule = nft_trans_rule(trans); - if (trans->msg_type == NFT_MSG_NEWRULE && - trans->ctx.chain == chain && + nft_trans_rule_chain(trans) == chain && id == nft_trans_rule_id(trans)) - return rule; + return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } @@ -3716,10 +4487,14 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) return -EOPNOTSUPP; } @@ -3727,8 +4502,12 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } @@ -3749,6 +4528,8 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; + if (nft_chain_binding(chain)) + continue; ctx.chain = chain; err = nft_delrule_by_chain(&ctx); @@ -3790,23 +4571,18 @@ static bool 
nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * -nft_select_set_ops(const struct nft_ctx *ctx, - const struct nlattr * const nla[], +nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; - u32 flags = 0; int i; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); - if (nla[NFTA_SET_FLAGS] != NULL) - flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - bops = NULL; best.size = ~0; best.lookup = ~0; @@ -3874,15 +4650,22 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, +}; + +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, - [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, + [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; -static struct nft_set *nft_set_lookup(const struct nft_table *table, +static struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -3890,7 +4673,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -3918,17 +4702,16 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net, { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); - struct nft_trans *trans; + struct nft_trans_set *trans; - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET) { - struct nft_set *set = nft_trans_set(trans); + /* it's likely the id we need is at the tail, not at the start */ + list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { + struct nft_set *set = trans->set; - if (id == nft_trans_set_id(trans) && - set->table == table && - nft_active_genmask(set, genmask)) - return set; - } + if (id == trans->set_id && + set->table == table && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -3941,7 +4724,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nft_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; @@ -3965,6 +4748,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; + if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) + return -EINVAL; + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; @@
-4017,7 +4803,7 @@ int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return -ERANGE; ms *= NSEC_PER_MSEC; - *result = nsecs_to_jiffies64(ms); + *result = nsecs_to_jiffies64(ms) ? : !!ms; return 0; } @@ -4053,6 +4839,35 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, return 0; } +static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) +{ + if (ops->usize) + return ops->usize(size); + + return size; +} + +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. + */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4064,9 +4879,10 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq = ctx->seq; int i; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, ctx->family, NFNETLINK_V0, + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -4077,6 +4893,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), NFTA_SET_PAD)) goto nla_put_failure; + + if (event == NFT_MSG_DELSET || + event == NFT_MSG_DESTROYSET) { + nlmsg_end(skb, nlh); + return 0; + } + if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; @@ -4117,7 +4940,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (!nest) goto nla_put_failure; if (set->size && - nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + nla_put_be32(skb, NFTA_SET_DESC_SIZE, + htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && @@ -4126,6 +4950,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) @@ -4200,7 +5027,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -4308,9 +5135,11 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); - if (IS_ERR(set)) + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) @@ -4327,10 +5156,6 @@ err_fill_set_info: 
return err; } -static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { - [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, -}; - static int nft_set_desc_concat_parse(const struct nlattr *attr, struct nft_set_desc *desc) { @@ -4361,8 +5186,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { + u32 len = 0, num_regs; struct nlattr *attr; - u32 num_regs = 0; int rem, err, i; nla_for_each_nested(attr, nla, rem) { @@ -4375,8 +5200,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); + if (len != desc->klen) + return -EINVAL; + + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; @@ -4483,6 +5312,15 @@ static bool nft_set_is_same(const struct nft_set *set, return true; } +static u32 nft_set_kernel_size(const struct nft_set_ops *ops, + const struct nft_set_desc *desc) +{ + if (ops->ksize) + return ops->ksize(desc->size); + + return desc->size; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -4538,6 +5376,12 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; + if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) == + (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == + (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; } desc.dtype = 0; @@ -4579,6 +5423,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4587,20 +5434,36 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } desc.policy = NFT_SET_POL_PERFORMANCE; - if (nla[NFTA_SET_POLICY] != NULL) + if (nla[NFTA_SET_POLICY] != NULL) { desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + switch (desc.policy) { + case NFT_SET_POL_PERFORMANCE: + case NFT_SET_POL_MEMORY: + break; + default: + return -EOPNOTSUPP; + } + } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; - if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT)) + if (desc.field_count > 1) { + if (!(flags & NFT_SET_CONCAT)) + return -EINVAL; + } else if (flags & NFT_SET_CONCAT) { return -EINVAL; + } } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } @@ -4617,7 +5480,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -4633,10 +5496,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if 
(info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; + if (desc.size) + desc.size = nft_set_kernel_size(set->ops, &desc); + err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -4655,10 +5524,13 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc); + ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); + if (desc.size) + desc.size = nft_set_kernel_size(ops, &desc); + udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); @@ -4669,9 +5541,15 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, alloc_size = sizeof(*set) + size + udlen; if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; + + if (!nft_use_inc(&table->use)) + return -EMFILE; + set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT); - if (!set) - return -ENOMEM; + if (!set) { + err = -ENOMEM; + goto err_alloc; + } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT); if (!name) { @@ -4692,6 +5570,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4722,33 +5601,31 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); - table->use++; + return 0; err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: - ops->destroy(set); + ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: kvfree(set); +err_alloc: + nft_use_dec_restore(&table->use); + return err; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { @@ -4756,11 +5633,19 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); - nft_set_elem_destroy(set, catchall->elem, true); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -4771,10 +5656,9 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); + set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, @@ -4804,10 +5688,14 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = 
nla[NFTA_SET_NAME]; - set = nft_set_lookup(table, attr, genmask); + set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } @@ -4831,9 +5719,9 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, static int nft_setelem_data_validate(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); @@ -4846,9 +5734,14 @@ static int nft_setelem_data_validate(const struct nft_ctx *ctx, static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - return nft_setelem_data_validate(ctx, set, elem); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + return nft_setelem_data_validate(ctx, set, elem_priv); } static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, @@ -4856,17 +5749,16 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - ret = nft_setelem_data_validate(ctx, set, &elem); + ret = nft_setelem_data_validate(ctx, set, catchall->elem); if (ret < 0) break; } @@ -4878,10 +5770,11 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; - struct nft_set_iter iter; - - if (set->use == UINT_MAX) - return -EOVERFLOW; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nf_tables_bind_check_setelem, + }; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -4896,12 +5789,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, goto bind; } - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_bind_check_setelem; - set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_bind_check(ctx, set); @@ -4910,10 +5797,12 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, return iter.err; } bind: + if (!nft_use_inc(&set->use)) + return -EMFILE; + binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); - set->use++; return 0; } @@ -4926,23 +5815,111 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv); + +static int nft_mapelem_activate(const 
struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); + nft_setelem_data_activate(ctx->net, set, elem_priv); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + nft_clear(ctx->net, ext); + nft_setelem_data_activate(ctx->net, set, catchall->elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + +void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) +{ + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + + nft_clear(ctx->net, set); + } + + nft_use_inc_restore(&set->use); +} +EXPORT_SYMBOL_GPL(nf_tables_activate_set); + void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); + else + list_del_rcu(&binding->list); + + nft_use_dec(&set->use); + break; case NFT_TRANS_PREPARE: - set->use--; + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_deactivate_next(ctx->net, set); + } + nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - set->use--; + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_use_dec(&set->use); fallthrough; default: nf_tables_unbind_set(ctx, set, binding, @@ -4977,12 +5954,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { - .len = sizeof(u64), - .align = __alignof__(u64), - }, - [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(u64), - .align = __alignof__(u64), + .len = sizeof(struct nft_timeout), + .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -5009,7 +5982,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -5017,13 +5990,14 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, - 
[NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy), [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_ext *ext) + const struct nft_set_ext *ext, + bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; @@ -5036,7 +6010,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; @@ -5047,7 +6021,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5060,9 +6034,10 @@ nla_put_failure: static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_elem_priv *elem_priv, + bool reset) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -5082,12 +6057,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), - set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, - set->dlen) < 0) + nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && - nft_set_elem_expr_dump(skb, set, ext)) + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5100,25 +6074,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); + u64 set_timeout = READ_ONCE(set->timeout); + __be64 msecs = 0; + + if (set_timeout != timeout) { + msecs = nf_jiffies64_to_msecs(timeout); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - u64 expires, now = get_jiffies_64(); + if (timeout > 0) { + u64 expires, now = get_jiffies_64(); - expires = *nft_set_ext_expiration(ext); - if (time_before64(now, expires)) - expires -= now; - else - expires = 0; + expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); + if (time_before64(now, expires)) + expires -= now; + else + expires = 0; - if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - nf_jiffies64_to_msecs(expires), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + nf_jiffies64_to_msecs(expires), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5142,30 +6123,50 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct 
sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset); +} + +static void audit_log_nft_set_reset(const struct nft_table *table, + unsigned int base_seq, + unsigned int nentries) +{ + char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); + + audit_log_nfcfg(buf, table->family, nentries, + AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC); + kfree(buf); } struct nft_set_dump_ctx { const struct nft_set *set; struct nft_ctx ctx; + bool reset; }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, - const struct nft_set *set) + const struct nft_set *set, bool reset, + unsigned int base_seq) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; @@ -5175,8 +6176,9 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, nft_set_elem_expired(ext)) continue; - elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem); + ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset); + if (reset && !ret) + audit_log_nft_set_reset(set->table, base_seq, 1); break; } @@ -5190,7 +6192,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; - struct nft_set_dump_args args; + struct nft_set_dump_args args = { + .cb = cb, + .skb = skb, + .reset = dump_ctx->reset, + .iter = { + .genmask = nft_genmask_cur(net), + .type = NFT_ITER_READ, + .skip = cb->args[0], + .fn = nf_tables_dump_setelem, + }, + }; bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; @@ -5199,7 +6211,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -5228,7 +6240,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -5241,22 +6253,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.iter.genmask = nft_genmask_cur(net); - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) - args.iter.err = nft_set_catchall_dump(net, skb, set); - rcu_read_unlock(); - + args.iter.err = nft_set_catchall_dump(net, skb, set, + dump_ctx->reset, cb->seq); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); + rcu_read_unlock(); + if (args.iter.err && 
args.iter.err != -EMSGSIZE) return args.iter.err; if (args.iter.count == cb->args[0]) @@ -5270,6 +6276,26 @@ nla_put_failure: return -ENOSPC; } +static int nf_tables_dumpreset_set(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + struct nft_set_dump_ctx *dump_ctx = cb->data; + int ret, skip = cb->args[0]; + + mutex_lock(&nft_net->commit_mutex); + + ret = nf_tables_dump_set(skb, cb); + + if (cb->args[0] > skip) + audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, + cb->args[0] - skip); + + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -5289,7 +6315,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_elem_priv *elem_priv, + bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; @@ -5297,7 +6324,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5310,7 +6337,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem_priv, reset); if (err < 0) goto nla_put_failure; @@ -5343,7 +6370,7 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } -static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, +static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set, struct nft_data *key, struct nlattr *attr) { struct nft_data_desc desc = { @@ -5396,7 +6423,7 @@ static void *nft_setelem_catchall_get(const struct net *net, return priv; } -static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, +static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_elem *elem, u32 flags) { void *priv; @@ -5415,8 +6442,8 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, return 0; } -static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) +static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set, + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; @@ -5460,7 +6487,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, elem.priv, + reset); if (err < 0) goto err_fill_setelem; @@ -5471,10 +6499,11 @@ err_fill_setelem: return err; } -/* called with rcu_read_lock held */ -static int nf_tables_getsetelem(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) +static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, + const struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[], + bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); @@ -5482,22 +6511,36 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct net *net = info->net; struct 
nft_table *table; struct nft_set *set; - struct nlattr *attr; - struct nft_ctx ctx; - int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, NETLINK_CB(skb).portid); + genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); - if (IS_ERR(set)) + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } - nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nft_ctx_init(&dump_ctx->ctx, net, skb, + info->nlh, family, table, NULL, nla); + dump_ctx->set = set; + dump_ctx->reset = reset; + return 0; +} + +/* called with rcu_read_lock held */ +static int nf_tables_getsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + struct nlattr *attr; + int rem, err = 0; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -5506,11 +6549,55 @@ static int nf_tables_getsetelem(struct sk_buff *skb, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; - struct nft_set_dump_ctx dump_ctx = { - .set = set, - .ctx = ctx, + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + c.data = &dump_ctx; + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); + break; + } + } + + return err; +} + +static int nf_tables_getsetelem_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + int rem, err = 0, nelems = 0; + struct nlattr *attr; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_set_start, + .dump = nf_tables_dumpreset_set, + .done = nf_tables_dump_set_done, + .module = THIS_MODULE, }; + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + return err; + c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } @@ -5518,20 +6605,38 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + rcu_read_lock(); + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + goto out_unlock; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } + nelems++; } + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); + +out_unlock: + rcu_read_unlock(); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); return err; } static void nf_tables_setelem_notify(const struct nft_ctx 
 static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 				     const struct nft_set *set,
-				     const struct nft_set_elem *elem,
+				     const struct nft_elem_priv *elem_priv,
 				     int event)
 {
 	struct nftables_pernet *nft_net;
@@ -5552,7 +6657,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 	flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
 
 	err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
-					  set, elem);
+					  set, elem_priv, false);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -5565,17 +6670,21 @@ err:
 	nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
-static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx,
 					      int msg_type,
 					      struct nft_set *set)
 {
+	struct nft_trans_elem *te;
 	struct nft_trans *trans;
 
-	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+	trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1));
 	if (trans == NULL)
 		return NULL;
 
-	nft_trans_elem_set(trans) = set;
+	te = nft_trans_container_elem(trans);
+	te->nelems = 1;
+	te->set = set;
+
 	return trans;
 }
 
@@ -5627,10 +6736,11 @@ static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
 	return 0;
 }
 
-void *nft_set_elem_init(const struct nft_set *set,
-			const struct nft_set_ext_tmpl *tmpl,
-			const u32 *key, const u32 *key_end,
-			const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+					const struct nft_set_ext_tmpl *tmpl,
+					const u32 *key, const u32 *key_end,
+					const u32 *data,
+					u64 timeout, u64 expiration, gfp_t gfp)
 {
 	struct nft_set_ext *ext;
 	void *elem;
@@ -5657,13 +6767,14 @@ void *nft_set_elem_init(const struct nft_set *set,
 			   nft_set_ext_data(ext), data, set->dlen) < 0)
 		goto err_ext_check;
 
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
-		*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+		nft_set_ext_timeout(ext)->timeout = timeout;
+
 		if (expiration == 0)
-			*nft_set_ext_expiration(ext) += timeout;
+			expiration = timeout;
+
+		nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration;
 	}
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
-		*nft_set_ext_timeout(ext) = timeout;
 
 	return elem;
 
@@ -5694,39 +6805,75 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
 	__nft_set_elem_expr_destroy(ctx, expr);
 }
 
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
-			  bool destroy_expr)
+/* Drop references and destroy. Called from gc, dynset and abort path. */
+static void __nft_set_elem_destroy(const struct nft_ctx *ctx,
+				   const struct nft_set *set,
+				   const struct nft_elem_priv *elem_priv,
+				   bool destroy_expr)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
-	struct nft_ctx ctx = {
-		.net	= read_pnet(&set->net),
-		.family	= set->table->family,
-	};
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
 	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
-		nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
-
+		nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
-		(*nft_set_ext_obj(ext))->use--;
-	kfree(elem);
+		nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+
+	kfree(elem_priv);
+}
+
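nft_trans_elem_alloc() above sizes the transaction container with struct_size(), so the flexible elems[] array is accounted for without hand-written arithmetic. A generic sketch of the pattern; the struct layout here is illustrative, not the kernel's exact definition:

	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct elem_ref {
		void *priv;
	};

	struct elem_container {
		unsigned int	nelems;
		struct elem_ref	elems[];	/* flexible array member */
	};

	static struct elem_container *container_alloc(unsigned int n)
	{
		struct elem_container *c;

		/* struct_size() computes sizeof(*c) + n * sizeof(c->elems[0])
		 * with overflow checking; on overflow it saturates to SIZE_MAX,
		 * which makes the allocation fail cleanly instead of being
		 * silently undersized.
		 */
		c = kzalloc(struct_size(c, elems, n), GFP_KERNEL);
		if (!c)
			return NULL;

		c->nelems = n;
		return c;
	}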
+/* Drop references and destroy. Called from gc and dynset. */
+void nft_set_elem_destroy(const struct nft_set *set,
+			  const struct nft_elem_priv *elem_priv,
+			  bool destroy_expr)
+{
+	struct nft_ctx ctx = {
+		.net	= read_pnet(&set->net),
+		.family	= set->table->family,
+	};
+
+	__nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr);
 }
 EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
 
-/* Only called from commit path, nft_setelem_data_deactivate() already deals
- * with the refcounting from the preparation phase.
+/* Drop references and destroy. Called from abort path. */
+static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te)
+{
+	int i;
+
+	for (i = 0; i < te->nelems; i++) {
+		/* skip update request, see nft_trans_elems_new_abort() */
+		if (!te->elems[i].priv)
+			continue;
+
+		__nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true);
+	}
+}
+
+/* Destroy element. References have been already dropped in the preparation
+ * path via nft_setelem_data_deactivate().
 */
-static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
-				       const struct nft_set *set, void *elem)
+void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+				const struct nft_set *set,
+				const struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
 		nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
 
-	kfree(elem);
+	kfree(elem_priv);
+}
+
+static void nft_trans_elems_destroy(const struct nft_ctx *ctx,
+				    const struct nft_trans_elem *te)
+{
+	int i;
+
+	for (i = 0; i < te->nelems; i++)
+		nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv);
 }
 
 int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
@@ -5740,7 +6887,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
 		if (!expr)
 			goto err_expr;
 
-		err = nft_expr_clone(expr, set->exprs[i]);
+		err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
 		if (err < 0) {
 			kfree(expr);
 			goto err_expr;
@@ -5779,7 +6926,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
 	for (i = 0; i < num_exprs; i++) {
 		expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
-		err = nft_expr_clone(expr, expr_array[i]);
+		err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
 		if (err < 0)
 			goto err_elem_expr_setup;
 
@@ -5809,7 +6956,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
 	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
 		ext = nft_set_elem_ext(set, catchall->elem);
 		if (nft_set_elem_active(ext, genmask) &&
-		    !nft_set_elem_expired(ext))
+		    !nft_set_elem_expired(ext) &&
+		    !nft_set_elem_is_dead(ext))
 			return ext;
 	}
 
@@ -5817,33 +6965,10 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
 }
 EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
 
-void *nft_set_catchall_gc(const struct nft_set *set)
-{
-	struct nft_set_elem_catchall *catchall, *next;
-	struct nft_set_ext *ext;
-	void *elem = NULL;
-
-	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
-		ext = nft_set_elem_ext(set, catchall->elem);
-
-		if (!nft_set_elem_expired(ext) ||
-		    nft_set_elem_mark_busy(ext))
-			continue;
-
-		elem = catchall->elem;
-		list_del_rcu(&catchall->list);
-		kfree_rcu(catchall, rcu);
-		break;
-	}
-
-	return elem;
-}
-EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
-
 static int nft_setelem_catchall_insert(const struct net *net,
 				       struct nft_set *set,
 				       const struct nft_set_elem *elem,
-				       struct nft_set_ext **pext)
+				       struct nft_elem_priv **priv)
 {
 	struct nft_set_elem_catchall *catchall;
 	u8 genmask = nft_genmask_next(net);
@@ -5852,12 +6977,12 @@ static int nft_setelem_catchall_insert(const struct net *net,
 	list_for_each_entry(catchall, &set->catchall_list, list) {
 		ext = nft_set_elem_ext(set, catchall->elem);
 		if (nft_set_elem_active(ext, genmask)) {
-			*pext = ext;
+			*priv = catchall->elem;
 			return -EEXIST;
 		}
 	}
 
-	catchall = kmalloc(sizeof(*catchall), GFP_KERNEL);
+	catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
 	if (!catchall)
 		return -ENOMEM;
 
@@ -5870,22 +6995,23 @@ static int nft_setelem_catchall_insert(const struct net *net,
 static int nft_setelem_insert(const struct net *net,
 			      struct nft_set *set,
 			      const struct nft_set_elem *elem,
-			      struct nft_set_ext **ext, unsigned int flags)
+			      struct nft_elem_priv **elem_priv,
+			      unsigned int flags)
 {
 	int ret;
 
 	if (flags & NFT_SET_ELEM_CATCHALL)
-		ret = nft_setelem_catchall_insert(net, set, elem, ext);
+		ret = nft_setelem_catchall_insert(net, set, elem, elem_priv);
 	else
-		ret = set->ops->insert(net, set, elem, ext);
+		ret = set->ops->insert(net, set, elem, elem_priv);
 
 	return ret;
 }
 
 static bool nft_setelem_is_catchall(const struct nft_set *set,
-				    const struct nft_set_elem *elem)
+				    const struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
 	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
@@ -5895,15 +7021,46 @@ static bool nft_setelem_is_catchall(const struct nft_set *set,
 }
 
 static void nft_setelem_activate(struct net *net, struct nft_set *set,
-				 struct nft_set_elem *elem)
+				 struct nft_elem_priv *elem_priv)
 {
-	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
-	if (nft_setelem_is_catchall(set, elem)) {
-		nft_set_elem_change_active(net, set, ext);
-		nft_set_elem_clear_busy(ext);
+	if (nft_setelem_is_catchall(set, elem_priv)) {
+		nft_clear(net, ext);
 	} else {
-		set->ops->activate(net, set, elem);
+		set->ops->activate(net, set, elem_priv);
+	}
+}
+
+static void nft_trans_elem_update(const struct nft_set *set,
+				  const struct nft_trans_one_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_elem_update *update = elem->update;
+
+	if (update->flags & NFT_TRANS_UPD_TIMEOUT)
+		WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout);
+
+	if (update->flags & NFT_TRANS_UPD_EXPIRATION)
+		WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration);
+}
+
+static void nft_trans_elems_add(const struct nft_ctx *ctx,
+				struct nft_trans_elem *te)
+{
+	int i;
+
+	for (i = 0; i < te->nelems; i++) {
+		struct nft_trans_one_elem *elem = &te->elems[i];
+
+		if (elem->update)
+			nft_trans_elem_update(te->set, elem);
+		else
+			nft_setelem_activate(ctx->net, te->set, elem->priv);
+
+		nf_tables_setelem_notify(ctx, te->set, elem->priv,
+					 NFT_MSG_NEWSETELEM);
+		kfree(elem->update);
 	}
 }
 
@@ -5916,8 +7073,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net,
 	list_for_each_entry(catchall, &set->catchall_list, list) {
 		ext = nft_set_elem_ext(set, catchall->elem);
-		if (!nft_is_active(net, ext) ||
-		    nft_set_elem_mark_busy(ext))
+		if (!nft_is_active_next(net, ext))
 			continue;
 
 		kfree(elem->priv);
@@ -5960,16 +7116,21 @@ static int nft_setelem_deactivate(const struct net *net,
 	return ret;
 }
 
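nft_trans_elem_update() above publishes new timeout/expiration values with WRITE_ONCE() because the packet path reads them locklessly under RCU. A reduced sketch of the paired accessors; the field names follow the diff, while the reader side shown here is an assumption about how such a field would be consumed:

	#include <linux/compiler.h>
	#include <linux/jiffies.h>

	struct elem_timeout {
		u64 timeout;
		u64 expiration;
	};

	/* commit path: serialized by the commit mutex */
	static void elem_update_timeout(struct elem_timeout *t, u64 timeout)
	{
		WRITE_ONCE(t->timeout, timeout);
		WRITE_ONCE(t->expiration, get_jiffies_64() + timeout);
	}

	/* datapath: runs under RCU and may race with the commit path,
	 * hence the READ_ONCE() to pair with the writer above.
	 */
	static bool elem_expired(const struct elem_timeout *t)
	{
		return time_after64(get_jiffies_64(), READ_ONCE(t->expiration));
	}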
+static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
+{
+	list_del_rcu(&catchall->list);
+	kfree_rcu(catchall, rcu);
+}
+
 static void nft_setelem_catchall_remove(const struct net *net,
 					const struct nft_set *set,
-					const struct nft_set_elem *elem)
+					struct nft_elem_priv *elem_priv)
 {
 	struct nft_set_elem_catchall *catchall, *next;
 
 	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
-		if (catchall->elem == elem->priv) {
-			list_del_rcu(&catchall->list);
-			kfree_rcu(catchall, rcu);
+		if (catchall->elem == elem_priv) {
+			nft_setelem_catchall_destroy(catchall);
 			break;
 		}
 	}
@@ -5977,12 +7138,32 @@ static void nft_setelem_catchall_remove(const struct net *net,
 
 static void nft_setelem_remove(const struct net *net,
 			       const struct nft_set *set,
-			       const struct nft_set_elem *elem)
+			       struct nft_elem_priv *elem_priv)
 {
-	if (nft_setelem_is_catchall(set, elem))
-		nft_setelem_catchall_remove(net, set, elem);
+	if (nft_setelem_is_catchall(set, elem_priv))
+		nft_setelem_catchall_remove(net, set, elem_priv);
 	else
-		set->ops->remove(net, set, elem);
+		set->ops->remove(net, set, elem_priv);
+}
+
+static void nft_trans_elems_remove(const struct nft_ctx *ctx,
+				   const struct nft_trans_elem *te)
+{
+	int i;
+
+	for (i = 0; i < te->nelems; i++) {
+		WARN_ON_ONCE(te->elems[i].update);
+
+		nf_tables_setelem_notify(ctx, te->set,
+					 te->elems[i].priv,
+					 te->nft_trans.msg_type);
+
+		nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+		if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) {
+			atomic_dec(&te->set->nelems);
+			te->set->ndeact--;
+		}
+	}
 }
 
 static bool nft_setelem_valid_key_end(const struct nft_set *set,
@@ -6004,6 +7185,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set,
 	return true;
 }
 
+static u32 nft_set_maxsize(const struct nft_set *set)
+{
+	u32 maxsize, delta;
+
+	if (!set->size)
+		return UINT_MAX;
+
+	if (set->ops->adjust_maxsize)
+		delta = set->ops->adjust_maxsize(set);
+	else
+		delta = 0;
+
+	if (check_add_overflow(set->size, set->ndeact, &maxsize))
+		return UINT_MAX;
+
+	if (check_add_overflow(maxsize, delta, &maxsize))
+		return UINT_MAX;
+
+	return maxsize;
+}
+
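nft_set_maxsize() above clamps rather than wraps: each addition goes through check_add_overflow(), falling back to UINT_MAX when the u32 arithmetic would overflow. The same saturating idiom in isolation:

	#include <linux/overflow.h>
	#include <linux/limits.h>

	/* Returns a + b, saturated to UINT_MAX instead of wrapping. */
	static u32 u32_add_sat(u32 a, u32 b)
	{
		u32 sum;

		/* check_add_overflow() stores the wrapped result in sum and
		 * returns true if the addition overflowed.
		 */
		if (check_add_overflow(a, b, &sum))
			return UINT_MAX;

		return sum;
	}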
 static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			    const struct nlattr *attr, u32 nlmsg_flags)
 {
@@ -6015,13 +7217,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_ext *ext, *ext2;
 	struct nft_set_elem elem;
 	struct nft_set_binding *binding;
+	struct nft_elem_priv *elem_priv;
 	struct nft_object *obj = NULL;
 	struct nft_userdata *udata;
 	struct nft_data_desc desc;
 	enum nft_registers dreg;
 	struct nft_trans *trans;
-	u64 timeout;
 	u64 expiration;
+	u64 timeout;
 	int err, i;
 	u8 ulen;
 
@@ -6036,7 +7239,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+	if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+	    (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
 		return -EINVAL;
 
 	if (flags != 0) {
@@ -6087,17 +7291,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			return err;
 	} else if (set->flags & NFT_SET_TIMEOUT &&
 		   !(flags & NFT_SET_ELEM_INTERVAL_END)) {
-		timeout = READ_ONCE(set->timeout);
+		timeout = set->timeout;
 	}
 
 	expiration = 0;
 	if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
 		if (!(set->flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
+		if (timeout == 0)
+			return -EOPNOTSUPP;
+
 		err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
 					    &expiration);
 		if (err)
 			return err;
+
+		if (expiration > timeout)
+			return -ERANGE;
 	}
 
 	if (nla[NFTA_SET_ELEM_EXPR]) {
@@ -6183,16 +7393,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err_parse_key_end;
 	}
 
-	if (timeout > 0) {
-		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+	if (set->flags & NFT_SET_TIMEOUT) {
+		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
 		if (err < 0)
 			goto err_parse_key_end;
-
-		if (timeout != READ_ONCE(set->timeout)) {
-			err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
-			if (err < 0)
-				goto err_parse_key_end;
-		}
 	}
 
 	if (num_exprs) {
@@ -6211,8 +7415,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 				     set->objtype, genmask);
 		if (IS_ERR(obj)) {
 			err = PTR_ERR(obj);
+			obj = NULL;
+			goto err_parse_key_end;
+		}
+
+		if (!nft_use_inc(&obj->use)) {
+			err = -EMFILE;
+			obj = NULL;
 			goto err_parse_key_end;
 		}
+
 		err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
 		if (err < 0)
 			goto err_parse_key_end;
@@ -6245,7 +7457,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		if (desc.type == NFT_DATA_VERDICT &&
 		    (elem.data.val.verdict.code == NFT_GOTO ||
 		     elem.data.val.verdict.code == NFT_JUMP))
-			nft_validate_state_update(ctx->net,
+			nft_validate_state_update(ctx->table,
 						  NFT_VALIDATE_NEED);
 	}
 
@@ -6281,19 +7493,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (flags)
 		*nft_set_ext_flags(ext) = flags;
 
+	if (obj)
+		*nft_set_ext_obj(ext) = obj;
+
 	if (ulen > 0) {
 		if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
 			err = -EINVAL;
-			goto err_elem_userdata;
+			goto err_elem_free;
 		}
 		udata = nft_set_ext_userdata(ext);
 		udata->len = ulen - 1;
 		nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
 	}
-	if (obj) {
-		*nft_set_ext_obj(ext) = obj;
-		obj->use++;
-	}
 	err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
 	if (err < 0)
 		goto err_elem_free;
@@ -6304,11 +7515,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err_elem_free;
 	}
 
-	ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+	ext->genmask = nft_genmask_cur(ctx->net);
 
-	err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
+	err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
 	if (err) {
 		if (err == -EEXIST) {
+			ext2 = nft_set_elem_ext(set, elem_priv);
 			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
 			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
 			    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
@@ -6322,8 +7534,40 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			     nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
 			     *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
 				goto err_element_clash;
-			else if (!(nlmsg_flags & NLM_F_EXCL))
+			else if (!(nlmsg_flags & NLM_F_EXCL)) {
 				err = 0;
+				if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) {
+					struct nft_elem_update update = { };
+
+					if (timeout != nft_set_ext_timeout(ext2)->timeout) {
+						update.timeout = timeout;
+						if (expiration == 0)
+							expiration = timeout;
+
+						update.flags |= NFT_TRANS_UPD_TIMEOUT;
+					}
+					if (expiration) {
+						update.expiration = expiration;
+						update.flags |= NFT_TRANS_UPD_EXPIRATION;
+					}
+
+					if (update.flags) {
+						struct nft_trans_one_elem *ue;
+
+						ue = &nft_trans_container_elem(trans)->elems[0];
+
+						ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL);
+						if (!ue->update) {
+							err = -ENOMEM;
+							goto err_element_clash;
+						}
+
+						ue->priv = elem_priv;
+						nft_trans_commit_list_add_elem(ctx->net, trans);
+						goto err_elem_free;
+					}
+				}
+			}
 		} else if (err == -ENOTEMPTY) {
 			/* ENOTEMPTY reports overlapping between this element
 			 * and an existing one.
@@ -6333,29 +7577,32 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err_element_clash;
 	}
 
-	if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size &&
-	    !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
-		err = -ENFILE;
-		goto err_set_full;
+	if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+		unsigned int max = nft_set_maxsize(set);
+
+		if (!atomic_add_unless(&set->nelems, 1, max)) {
+			err = -ENFILE;
+			goto err_set_full;
+		}
 	}
 
-	nft_trans_elem(trans) = elem;
-	nft_trans_commit_list_add_tail(ctx->net, trans);
+	nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+	nft_trans_commit_list_add_elem(ctx->net, trans);
 	return 0;
 
 err_set_full:
-	nft_setelem_remove(ctx->net, set, &elem);
+	nft_setelem_remove(ctx->net, set, elem.priv);
 err_element_clash:
 	kfree(trans);
 err_elem_free:
-	if (obj)
-		obj->use--;
-err_elem_userdata:
 	nf_tables_set_elem_destroy(ctx, set, elem.priv);
 err_parse_data:
 	if (nla[NFTA_SET_ELEM_DATA] != NULL)
 		nft_data_release(&elem.data.val, desc.type);
 err_parse_key_end:
+	if (obj)
+		nft_use_dec_restore(&obj->use);
+
 	nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
 err_parse_key:
 	nft_data_release(&elem.key.val, NFT_DATA_VALUE);
@@ -6370,7 +7617,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(info->net);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
 	u8 family = info->nfmsg->nfgen_family;
@@ -6393,10 +7639,13 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 	set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
 				    nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
-	if (IS_ERR(set))
+	if (IS_ERR(set)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
 		return PTR_ERR(set);
+	}
 
-	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+	if (!list_empty(&set->bindings) &&
+	    (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
 		return -EBUSY;
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -6409,7 +7658,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 		}
 	}
 
-	if (nft_net->validate_state == NFT_VALIDATE_DO)
+	if (table->validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
 	return 0;
@@ -6429,50 +7678,99 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
 {
 	struct nft_chain *chain;
-	struct nft_rule *rule;
 
 	if (type == NFT_DATA_VERDICT) {
 		switch (data->verdict.code) {
 		case NFT_JUMP:
 		case NFT_GOTO:
 			chain = data->verdict.chain;
-			chain->use++;
-
-			if (!nft_chain_is_bound(chain))
-				break;
-
-			chain->table->use++;
-			list_for_each_entry(rule, &chain->rules, list)
-				chain->use++;
-
-			nft_chain_add(chain->table, chain);
+			nft_use_inc_restore(&chain->use);
 			break;
 		}
 	}
 }
 
+static int nft_setelem_active_next(const struct net *net,
+				   const struct nft_set *set,
+				   struct nft_elem_priv *elem_priv)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+	u8 genmask = nft_genmask_next(net);
+
+	return nft_set_elem_active(ext, genmask);
+}
+
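Throughout these hunks, raw `use++`/`use--` bookkeeping is replaced by nft_use_inc()/nft_use_dec(), which refuse to increment past a ceiling so the counter cannot wrap (callers report -EMFILE on failure, as in nft_add_set_elem() above). A sketch of what such bounded helpers plausibly look like; the bound and exact semantics are assumptions, not the kernel's definition:

	#include <linux/bug.h>
	#include <linux/limits.h>

	#define NFT_USE_MAX	(INT_MAX - 1)	/* assumed ceiling */

	static inline bool nft_use_inc_sketch(u32 *use)
	{
		if (*use >= NFT_USE_MAX)
			return false;		/* caller reports -EMFILE */

		(*use)++;
		return true;
	}

	static inline void nft_use_dec_sketch(u32 *use)
	{
		WARN_ON_ONCE(*use == 0);	/* underflow would be a refcount bug */
		(*use)--;
	}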
 static void nft_setelem_data_activate(const struct net *net,
 				      const struct nft_set *set,
-				      struct nft_set_elem *elem)
+				      struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_hold(nft_set_ext_data(ext), set->dtype);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
-		(*nft_set_ext_obj(ext))->use++;
+		nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
 }
 
-static void nft_setelem_data_deactivate(const struct net *net,
-					const struct nft_set *set,
-					struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+				 const struct nft_set *set,
+				 struct nft_elem_priv *elem_priv)
 {
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
-		(*nft_set_ext_obj(ext))->use--;
+		nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+}
+
+/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem.
+ * No notifications and no ndeact changes.
+ *
+ * Returns true if set had been added to (i.e., elements need to be removed again).
+ */
+static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
+				      struct nft_trans_elem *te)
+{
+	bool removed = false;
+	int i;
+
+	for (i = 0; i < te->nelems; i++) {
+		if (te->elems[i].update) {
+			kfree(te->elems[i].update);
+			te->elems[i].update = NULL;
+			/* Update request, so do not release this element */
+			te->elems[i].priv = NULL;
+			continue;
+		}
+
+		if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
+			nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+
+		if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+			atomic_dec(&te->set->nelems);
+
+		removed = true;
+	}
+
+	return removed;
+}
+
+/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */
+static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
+					  const struct nft_trans_elem *te)
+{
+	int i;
+
+	for (i = 0; i < te->nelems; i++) {
+		if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) {
+			nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv);
+			nft_setelem_activate(ctx->net, te->set, te->elems[i].priv);
+		}
+
+		if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+			te->set->ndeact--;
+	}
 }
 
 static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
@@ -6552,10 +7850,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		goto fail_ops;
 
-	nft_setelem_data_deactivate(ctx->net, set, &elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem.priv);
 
-	nft_trans_elem(trans) = elem;
-	nft_trans_commit_list_add_tail(ctx->net, trans);
+	nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+	nft_trans_commit_list_add_elem(ctx->net, trans);
 	return 0;
 
 fail_ops:
@@ -6572,48 +7870,44 @@ fail_elem:
 static int nft_setelem_flush(const struct nft_ctx *ctx,
 			     struct nft_set *set,
 			     const struct nft_set_iter *iter,
-			     struct nft_set_elem *elem)
+			     struct nft_elem_priv *elem_priv)
 {
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
 	struct nft_trans *trans;
-	int err;
 
-	trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
-				    sizeof(struct nft_trans_elem), GFP_ATOMIC);
+	if (!nft_set_elem_active(ext, iter->genmask))
+		return 0;
+
+	trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+				struct_size_t(struct nft_trans_elem, elems, 1));
 	if (!trans)
 		return -ENOMEM;
 
-	if (!set->ops->flush(ctx->net, set, elem->priv)) {
-		err = -ENOENT;
-		goto err1;
-	}
+	set->ops->flush(ctx->net, set, elem_priv);
 	set->ndeact++;
 
-	nft_setelem_data_deactivate(ctx->net, set, elem);
+	nft_setelem_data_deactivate(ctx->net, set, elem_priv);
 	nft_trans_elem_set(trans) = set;
-	nft_trans_elem(trans) = *elem;
-	nft_trans_commit_list_add_tail(ctx->net, trans);
+	nft_trans_container_elem(trans)->nelems = 1;
+	nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+	nft_trans_commit_list_add_elem(ctx->net, trans);
 
 	return 0;
-err1:
-	kfree(trans);
-	return err;
 }
 
 static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
 				    struct nft_set *set,
-				    struct nft_set_elem *elem)
+				    struct nft_elem_priv *elem_priv)
 {
 	struct nft_trans *trans;
 
-	trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
-				    sizeof(struct nft_trans_elem), GFP_KERNEL);
+	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
 	if (!trans)
 		return -ENOMEM;
 
-	nft_setelem_data_deactivate(ctx->net, set, elem);
-	nft_trans_elem_set(trans) = set;
-	nft_trans_elem(trans) = *elem;
-	nft_trans_commit_list_add_tail(ctx->net, trans);
+	nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+	nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+	nft_trans_commit_list_add_elem(ctx->net, trans);
 
 	return 0;
 }
@@ -6623,20 +7917,19 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
 {
 	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_set_elem_catchall *catchall;
-	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	int ret = 0;
 
-	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+				lockdep_commit_lock_is_held(ctx->net)) {
 		ext = nft_set_elem_ext(set, catchall->elem);
-		if (!nft_set_elem_active(ext, genmask) ||
-		    nft_set_elem_mark_busy(ext))
+		if (!nft_set_elem_active(ext, genmask))
 			continue;
 
-		elem.priv = catchall->elem;
-		ret = __nft_set_catchall_flush(ctx, set, &elem);
+		ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
 		if (ret < 0)
 			break;
+		nft_set_elem_change_active(ctx->net, set, ext);
 	}
 
 	return ret;
@@ -6646,6 +7939,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
 {
 	struct nft_set_iter iter = {
 		.genmask	= genmask,
+		.type		= NFT_ITER_UPDATE,
 		.fn		= nft_setelem_flush,
 	};
 
@@ -6677,10 +7971,16 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
 		return PTR_ERR(table);
 	}
 
-	set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
-	if (IS_ERR(set))
+	set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+	if (IS_ERR(set)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
 		return PTR_ERR(set);
-	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+	}
+
+	if (nft_set_is_anonymous(set))
+		return -EOPNOTSUPP;
+
+	if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
 		return -EBUSY;
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -6690,35 +7990,17 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
 		err = nft_del_setelem(&ctx, set, attr);
+		if (err == -ENOENT &&
+		    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
+			continue;
+
 		if (err < 0) {
 			NL_SET_BAD_ATTR(extack, attr);
-			break;
+			return err;
 		}
 	}
 
-	return err;
-}
-
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
-	struct nft_set_gc_batch *gcb;
-	unsigned int i;
-
-	gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
-	for (i = 0; i < gcb->head.cnt; i++)
-		nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
-	kfree(gcb);
-}
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
-						gfp_t gfp)
-{
-	struct nft_set_gc_batch *gcb;
-
-	gcb = kzalloc(sizeof(*gcb), gfp);
-	if (gcb == NULL)
-		return gcb;
-	gcb->head.set = set;
-	return gcb;
+	return 0;
 }
 
 /*
@@ -6889,11 +8171,15 @@ nla_put_failure:
 	return -1;
 }
 
-static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
 {
 	const struct nft_object_type *type;
 
-	list_for_each_entry(type, &nf_tables_objects, list) {
+	list_for_each_entry_rcu(type, &nf_tables_objects, list) {
+		if (type->family != NFPROTO_UNSPEC &&
+		    type->family != family)
+			continue;
+
 		if (objtype == type->type)
 			return type;
 	}
@@ -6901,13 +8187,17 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
 }
 
 static const struct nft_object_type *
-nft_obj_type_get(struct net *net, u32 objtype)
+nft_obj_type_get(struct net *net, u32 objtype, u8 family)
 {
 	const struct nft_object_type *type;
 
-	type = __nft_obj_type_get(objtype);
-	if (type != NULL && try_module_get(type->owner))
+	rcu_read_lock();
+	type = __nft_obj_type_get(objtype, family);
+	if (type != NULL && try_module_get(type->owner)) {
+		rcu_read_unlock();
 		return type;
+	}
+	rcu_read_unlock();
 
 	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
@@ -6928,9 +8218,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
 	struct nft_trans *trans;
 	int err = -ENOMEM;
 
-	if (!try_module_get(type->owner))
-		return -ENOENT;
-
+	/* caller must have obtained type->owner reference. */
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
 				sizeof(struct nft_trans_obj));
 	if (!trans)
@@ -6998,17 +8286,29 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		type = __nft_obj_type_get(objtype);
+		if (!obj->ops->update)
+			return 0;
+
+		type = nft_obj_type_get(net, objtype, family);
+		if (WARN_ON_ONCE(IS_ERR(type)))
+			return PTR_ERR(type);
+
 		nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
+		/* type->owner reference is put when transaction object is released. */
 		return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
 	}
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
-	type = nft_obj_type_get(net, objtype);
-	if (IS_ERR(type))
-		return PTR_ERR(type);
+	if (!nft_use_inc(&table->use))
+		return -EMFILE;
+
+	type = nft_obj_type_get(net, objtype, family);
+	if (IS_ERR(type)) {
+		err = PTR_ERR(type);
+		goto err_type;
+	}
 
 	obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
 	if (IS_ERR(obj)) {
@@ -7025,7 +8325,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	if (nla[NFTA_OBJ_USERDATA]) {
-		obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+		obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
 		if (obj->udata == NULL)
 			goto err_userdata;
 
@@ -7042,7 +8342,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 		goto err_obj_ht;
 
 	list_add_tail_rcu(&obj->list, &table->objects);
-	table->use++;
+
 	return 0;
 err_obj_ht:
 	/* queued in transaction log */
@@ -7058,6 +8358,9 @@ err_strdup:
 	kfree(obj);
 err_init:
 	module_put(type->owner);
+err_type:
+	nft_use_dec_restore(&table->use);
+
 	return err;
 }
 
@@ -7068,21 +8371,29 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
 {
 	struct nlmsghdr *nlh;
 
-	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
-			   NFNETLINK_V0, nft_base_seq(net));
+	nlh = nfnl_msg_put(skb, portid, seq,
+			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
 	if (!nlh)
 		goto nla_put_failure;
 
 	if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
 	    nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
 	    nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
-	    nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
-	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
 	    nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
 			 NFTA_OBJ_PAD))
 		goto nla_put_failure;
 
+	if (event == NFT_MSG_DELOBJ ||
+	    event == NFT_MSG_DESTROYOBJ) {
+		nlmsg_end(skb, nlh);
+		return 0;
+	}
+
+	if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+	    nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+		goto nla_put_failure;
+
 	if (obj->udata &&
 	    nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
 		goto nla_put_failure;
@@ -7095,124 +8406,136 @@ nla_put_failure:
 	return -1;
 }
 
-struct nft_obj_filter {
+static void audit_log_obj_reset(const struct nft_table *table,
+				unsigned int base_seq, unsigned int nentries)
+{
+	char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+	audit_log_nfcfg(buf, table->family, nentries,
+			AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+	kfree(buf);
+}
+
+struct nft_obj_dump_ctx {
+	unsigned int	s_idx;
 	char		*table;
 	u32		type;
+	bool		reset;
 };
 
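The separately allocated filter is gone: dump state now lives directly in the scratch area the netlink core reserves inside struct netlink_callback, which removes a kzalloc/kfree pair per dump. A reduced sketch of the pattern, reusing the struct defined above; the start callback name is illustrative:

	static int my_dump_start(struct netlink_callback *cb)
	{
		struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;

		/* cb->ctx is a small per-dump scratch buffer owned by the
		 * netlink core; fail the build if the state outgrows it.
		 */
		BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

		ctx->s_idx = 0;		/* resume cursor for multi-part dumps */
		return 0;
	}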
 static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_table *table;
-	unsigned int idx = 0, s_idx = cb->args[0];
-	struct nft_obj_filter *filter = cb->data;
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	struct nftables_pernet *nft_net;
+	const struct nft_table *table;
+	unsigned int entries = 0;
 	struct nft_object *obj;
-	bool reset = false;
-
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
-		reset = true;
+	unsigned int idx = 0;
+	int rc = 0;
 
 	rcu_read_lock();
 	nft_net = nft_pernet(net);
-	cb->seq = READ_ONCE(nft_net->base_seq);
+	cb->seq = nft_base_seq(net);
 
 	list_for_each_entry_rcu(table, &nft_net->tables, list) {
 		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
+		entries = 0;
 		list_for_each_entry_rcu(obj, &table->objects, list) {
 			if (!nft_is_active(net, obj))
 				goto cont;
-			if (idx < s_idx)
+			if (idx < ctx->s_idx)
 				goto cont;
-			if (idx > s_idx)
-				memset(&cb->args[1], 0,
-				       sizeof(cb->args) - sizeof(cb->args[0]));
-			if (filter && filter->table &&
-			    strcmp(filter->table, table->name))
+			if (ctx->table && strcmp(ctx->table, table->name))
 				goto cont;
-			if (filter &&
-			    filter->type != NFT_OBJECT_UNSPEC &&
-			    obj->ops->type->type != filter->type)
+			if (ctx->type != NFT_OBJECT_UNSPEC &&
+			    obj->ops->type->type != ctx->type)
 				goto cont;
 
-			if (reset) {
-				char *buf = kasprintf(GFP_ATOMIC,
-						      "%s:%u",
-						      table->name,
-						      nft_net->base_seq);
-
-				audit_log_nfcfg(buf,
-						family,
-						obj->handle,
-						AUDIT_NFT_OP_OBJ_RESET,
-						GFP_ATOMIC);
-				kfree(buf);
-			}
-
-			if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
-						    cb->nlh->nlmsg_seq,
-						    NFT_MSG_NEWOBJ,
-						    NLM_F_MULTI | NLM_F_APPEND,
-						    table->family, table,
-						    obj, reset) < 0)
-				goto done;
+			rc = nf_tables_fill_obj_info(skb, net,
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq,
+						     NFT_MSG_NEWOBJ,
+						     NLM_F_MULTI | NLM_F_APPEND,
+						     table->family, table,
+						     obj, ctx->reset);
+			if (rc < 0)
+				break;
 
+			entries++;
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
 			idx++;
 		}
+		if (ctx->reset && entries)
+			audit_log_obj_reset(table, nft_base_seq(net), entries);
+		if (rc < 0)
+			break;
 	}
-done:
 	rcu_read_unlock();
 
-	cb->args[0] = idx;
+	ctx->s_idx = idx;
 	return skb->len;
 }
 
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+	int ret;
+
+	mutex_lock(&nft_net->commit_mutex);
+	ret = nf_tables_dump_obj(skb, cb);
+	mutex_unlock(&nft_net->commit_mutex);
+
+	return ret;
+}
+
 static int nf_tables_dump_obj_start(struct netlink_callback *cb)
 {
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 	const struct nlattr * const *nla = cb->data;
-	struct nft_obj_filter *filter = NULL;
 
-	if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
-		filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
-		if (!filter)
-			return -ENOMEM;
-
-		if (nla[NFTA_OBJ_TABLE]) {
-			filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
-			if (!filter->table) {
-				kfree(filter);
-				return -ENOMEM;
-			}
-		}
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
 
-		if (nla[NFTA_OBJ_TYPE])
-			filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+	if (nla[NFTA_OBJ_TABLE]) {
+		ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+		if (!ctx->table)
+			return -ENOMEM;
 	}
 
-	cb->data = filter;
+	if (nla[NFTA_OBJ_TYPE])
+		ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
 	return 0;
 }
 
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+	ctx->reset = true;
+
+	return nf_tables_dump_obj_start(cb);
+}
+
 static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
-	struct nft_obj_filter *filter = cb->data;
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 
-	if (filter) {
-		kfree(filter->table);
-		kfree(filter);
-	}
+	kfree(ctx->table);
 
 	return 0;
 }
 
-/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
-			    const struct nlattr * const nla[])
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
			const struct nlattr * const nla[], bool reset)
 {
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
@@ -7221,72 +8544,109 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 	struct net *net = info->net;
 	struct nft_object *obj;
 	struct sk_buff *skb2;
-	bool reset = false;
 	u32 objtype;
 	int err;
 
-	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
-		struct netlink_dump_control c = {
-			.start = nf_tables_dump_obj_start,
-			.dump = nf_tables_dump_obj,
-			.done = nf_tables_dump_obj_done,
-			.module = THIS_MODULE,
-			.data = (void *)nla,
-		};
-
-		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
-	}
-
 	if (!nla[NFTA_OBJ_NAME] ||
 	    !nla[NFTA_OBJ_TYPE])
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
 	if (IS_ERR(table)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
-		return PTR_ERR(table);
+		return ERR_CAST(table);
 	}
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 	obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
 	if (IS_ERR(obj)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
-		return PTR_ERR(obj);
+		return ERR_CAST(obj);
 	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+
+	err = nf_tables_fill_obj_info(skb2, net, portid,
+				      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+				      family, table, obj, reset);
+	if (err < 0) {
+		kfree_skb(skb2);
+		return ERR_PTR(err);
+	}
 
-	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
-		reset = true;
+	return skb2;
+}
 
-	if (reset) {
-		const struct nftables_pernet *nft_net;
-		char *buf;
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
+{
+	u32 portid = NETLINK_CB(skb).portid;
+	struct sk_buff *skb2;
 
-		nft_net = nft_pernet(net);
-		buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = nf_tables_dump_obj_start,
+			.dump = nf_tables_dump_obj,
+			.done = nf_tables_dump_obj_done,
+			.module = THIS_MODULE,
+			.data = (void *)nla,
+		};
 
-		audit_log_nfcfg(buf,
-				family,
-				obj->handle,
-				AUDIT_NFT_OP_OBJ_RESET,
-				GFP_ATOMIC);
-		kfree(buf);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
-	err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
-				      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
-				      family, table, obj, reset);
-	if (err < 0)
-		goto err_fill_obj_info;
+	skb2 = nf_tables_getobj_single(portid, info, nla, false);
+	if (IS_ERR(skb2))
+		return PTR_ERR(skb2);
 
-	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, info->net, portid);
+}
 
-err_fill_obj_info:
-	kfree_skb(skb2);
-	return err;
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const nla[])
+{
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	u32 portid = NETLINK_CB(skb).portid;
+	struct net *net = info->net;
+	struct sk_buff *skb2;
+	char *buf;
+
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = nf_tables_dumpreset_obj_start,
+			.dump = nf_tables_dumpreset_obj,
+			.done = nf_tables_dump_obj_done,
+			.module = THIS_MODULE,
+			.data = (void *)nla,
+		};
+
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+	}
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+	rcu_read_unlock();
+	mutex_lock(&nft_net->commit_mutex);
+	skb2 = nf_tables_getobj_single(portid, info, nla, true);
+	mutex_unlock(&nft_net->commit_mutex);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
+
+	if (IS_ERR(skb2))
+		return PTR_ERR(skb2);
+
+	buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+			nla_len(nla[NFTA_OBJ_TABLE]),
			(char *)nla_data(nla[NFTA_OBJ_TABLE]),
+			nft_base_seq(net));
+	audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+			AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+	kfree(buf);
+
+	return nfnetlink_unicast(skb2, net, portid);
 }
 
 static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
@@ -7334,6 +8694,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	if (IS_ERR(obj)) {
+		if (PTR_ERR(obj) == -ENOENT &&
+		    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
+			return 0;
+
 		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(obj);
 	}
@@ -7347,24 +8711,14 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
 	return nft_delobj(&ctx, obj);
 }
 
-void nft_obj_notify(struct net *net, const struct nft_table *table,
-		    struct nft_object *obj, u32 portid, u32 seq, int event,
-		    u16 flags, int family, int report, gfp_t gfp)
+static void
+__nft_obj_notify(struct net *net, const struct nft_table *table,
+		 struct nft_object *obj, u32 portid, u32 seq, int event,
+		 u16 flags, int family, int report, gfp_t gfp)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct sk_buff *skb;
 	int err;
-	char *buf = kasprintf(gfp, "%s:%u",
-			      table->name, nft_net->base_seq);
-
-	audit_log_nfcfg(buf,
-			family,
-			obj->handle,
-			event == NFT_MSG_NEWOBJ ?
-				AUDIT_NFT_OP_OBJ_REGISTER :
-				AUDIT_NFT_OP_OBJ_UNREGISTER,
-			gfp);
-	kfree(buf);
 
 	if (!report &&
 	    !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -7387,13 +8741,34 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
 err:
 	nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
+
+void nft_obj_notify(struct net *net, const struct nft_table *table,
+		    struct nft_object *obj, u32 portid, u32 seq, int event,
+		    u16 flags, int family, int report, gfp_t gfp)
+{
+	char *buf = kasprintf(gfp, "%s:%u",
+			      table->name, nft_base_seq(net));
+
+	audit_log_nfcfg(buf,
+			family,
+			obj->handle,
+			event == NFT_MSG_NEWOBJ ?
+				AUDIT_NFT_OP_OBJ_REGISTER :
+				AUDIT_NFT_OP_OBJ_UNREGISTER,
+			gfp);
+	kfree(buf);
+
+	__nft_obj_notify(net, table, obj, portid, seq, event,
+			 flags, family, report, gfp);
+}
 EXPORT_SYMBOL_GPL(nft_obj_notify);
 
 static void nf_tables_obj_notify(const struct nft_ctx *ctx,
 				 struct nft_object *obj, int event)
 {
-	nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
-		       ctx->flags, ctx->family, ctx->report, GFP_KERNEL);
+	__nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+			 ctx->seq, event, ctx->flags, ctx->family,
			 ctx->report, GFP_KERNEL);
 }
 
 /*
@@ -7425,12 +8800,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
 	[NFTA_FLOWTABLE_FLAGS]		= { .type = NLA_U32 },
 };
 
-struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
+					   const struct nft_table *table,
 					   const struct nlattr *nla, u8 genmask)
 {
 	struct nft_flowtable *flowtable;
 
-	list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+	list_for_each_entry_rcu(flowtable, &table->flowtables, list,
				lockdep_commit_lock_is_held(net)) {
 		if (!nla_strcmp(nla, flowtable->name) &&
 		    nft_active_genmask(flowtable, genmask))
 			return flowtable;
@@ -7444,10 +8821,11 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
 				    enum nft_trans_phase phase)
 {
 	switch (phase) {
+	case NFT_TRANS_PREPARE_ERROR:
 	case NFT_TRANS_PREPARE:
 	case NFT_TRANS_ABORT:
 	case NFT_TRANS_RELEASE:
-		flowtable->use--;
+		nft_use_dec(&flowtable->use);
 		fallthrough;
 	default:
 		return;
 	}
 
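nft_flowtable_lookup() above (like nft_set_catchall_flush() earlier) passes a lockdep expression to list_for_each_entry_rcu(): the list may legitimately be walked either under rcu_read_lock() or with the commit mutex held, and the extra argument tells RCU-lockdep which lock justifies the access. A simplified variant of the idiom, dropping the genmask handling of the real function:

	/* Walk a list protected by RCU on the read side and by the commit
	 * mutex on the update side; the fourth argument keeps RCU-lockdep
	 * quiet when we hold the mutex instead of rcu_read_lock().
	 */
	static struct nft_flowtable *
	flowtable_find(const struct net *net, const struct nft_table *table,
		       const char *name)
	{
		struct nft_flowtable *ft;

		list_for_each_entry_rcu(ft, &table->flowtables, list,
					lockdep_commit_lock_is_held(net)) {
			if (!strcmp(ft->name, name))
				return ft;
		}
		return NULL;
	}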
@@ -7482,26 +8860,31 @@ };
 
 static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
-				    const struct nlattr *attr,
+				    const struct nlattr * const nla[],
 				    struct nft_flowtable_hook *flowtable_hook,
-				    struct nft_flowtable *flowtable, bool add)
+				    struct nft_flowtable *flowtable,
+				    struct netlink_ext_ack *extack, bool add)
 {
 	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+	struct nf_hook_ops *ops;
 	struct nft_hook *hook;
 	int hooknum, priority;
 	int err;
 
 	INIT_LIST_HEAD(&flowtable_hook->list);
 
-	err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
+	err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
+					  nla[NFTA_FLOWTABLE_HOOK],
 					  nft_flowtable_hook_policy, NULL);
 	if (err < 0)
 		return err;
 
 	if (add) {
 		if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
-		    !tb[NFTA_FLOWTABLE_HOOK_PRIORITY])
-			return -EINVAL;
+		    !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
+			return -ENOENT;
+		}
 
 		hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
 		if (hooknum != NF_NETDEV_INGRESS)
@@ -7531,27 +8914,32 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
 	if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
 		err = nf_tables_parse_netdev_hooks(ctx->net,
 						   tb[NFTA_FLOWTABLE_HOOK_DEVS],
-						   &flowtable_hook->list);
+						   &flowtable_hook->list,
+						   extack);
 		if (err < 0)
 			return err;
 	}
 
 	list_for_each_entry(hook, &flowtable_hook->list, list) {
-		hook->ops.pf		= NFPROTO_NETDEV;
-		hook->ops.hooknum	= flowtable_hook->num;
-		hook->ops.priority	= flowtable_hook->priority;
-		hook->ops.priv		= &flowtable->data;
-		hook->ops.hook		= flowtable->data.type->hook;
+		list_for_each_entry(ops, &hook->ops_list, list) {
+			ops->pf			= NFPROTO_NETDEV;
+			ops->hooknum		= flowtable_hook->num;
+			ops->priority		= flowtable_hook->priority;
+			ops->priv		= &flowtable->data;
+			ops->hook		= flowtable->data.type->hook;
+			ops->hook_ops_type	= NF_HOOK_OP_NFT_FT;
+		}
 	}
 
 	return err;
 }
 
+/* call under rcu_read_lock */
 static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
 {
 	const struct nf_flowtable_type *type;
 
-	list_for_each_entry(type, &nf_tables_flowtables, list) {
+	list_for_each_entry_rcu(type, &nf_tables_flowtables, list) {
 		if (family == type->family)
 			return type;
 	}
@@ -7563,9 +8951,13 @@ nft_flowtable_type_get(struct net *net, u8 family)
 {
 	const struct nf_flowtable_type *type;
 
+	rcu_read_lock();
 	type = __nft_flowtable_type_get(family);
-	if (type != NULL && try_module_get(type->owner))
+	if (type != NULL && try_module_get(type->owner)) {
+		rcu_read_unlock();
 		return type;
+	}
+	rcu_read_unlock();
 
 	lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -7578,34 +8970,58 @@ nft_flowtable_type_get(struct net *net, u8 family)
 }
 
 /* Only called from error and netdev event paths. */
-static void nft_unregister_flowtable_hook(struct net *net,
-					  struct nft_flowtable *flowtable,
-					  struct nft_hook *hook)
+static void nft_unregister_flowtable_ops(struct net *net,
+					 struct nft_flowtable *flowtable,
+					 struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hook(net, &hook->ops);
-	flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+	nf_unregister_net_hook(net, ops);
+	flowtable->data.type->setup(&flowtable->data, ops->dev,
 				    FLOW_BLOCK_UNBIND);
 }
 
 static void __nft_unregister_flowtable_net_hooks(struct net *net,
+						 struct nft_flowtable *flowtable,
 						 struct list_head *hook_list,
 						 bool release_netdev)
 {
 	struct nft_hook *hook, *next;
+	struct nf_hook_ops *ops;
 
 	list_for_each_entry_safe(hook, next, hook_list, list) {
-		nf_unregister_net_hook(net, &hook->ops);
+		list_for_each_entry(ops, &hook->ops_list, list)
+			nft_unregister_flowtable_ops(net, flowtable, ops);
 		if (release_netdev) {
 			list_del(&hook->list);
-			kfree_rcu(hook, rcu);
+			nft_netdev_hook_free_rcu(hook);
 		}
 	}
 }
 
 static void nft_unregister_flowtable_net_hooks(struct net *net,
+					       struct nft_flowtable *flowtable,
 					       struct list_head *hook_list)
 {
-	__nft_unregister_flowtable_net_hooks(net, hook_list, false);
+	__nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
+}
+
+static int nft_register_flowtable_ops(struct net *net,
+				      struct nft_flowtable *flowtable,
+				      struct nf_hook_ops *ops)
+{
+	int err;
+
+	err = flowtable->data.type->setup(&flowtable->data,
+					  ops->dev, FLOW_BLOCK_BIND);
+	if (err < 0)
+		return err;
+
+	err = nf_register_net_hook(net, ops);
+	if (!err)
+		return 0;
+
+	flowtable->data.type->setup(&flowtable->data,
+				    ops->dev, FLOW_BLOCK_UNBIND);
+	return err;
 }
 
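The error path of nft_register_flowtable_net_hooks() in the next hunk only unwinds the ops that were actually registered, counting registrations up on success and back down during cleanup. The countdown idiom in isolation, with a hypothetical item type and stubbed register helpers:

	struct item {
		bool registered;
	};

	static int register_one(struct item *it)
	{
		it->registered = true;	/* stub: real code registers a hook */
		return 0;
	}

	static void unregister_one(struct item *it)
	{
		it->registered = false;
	}

	static int register_all(struct item *items, int n)
	{
		int i, err;

		for (i = 0; i < n; i++) {
			err = register_one(&items[i]);
			if (err)
				goto unwind;
		}
		return 0;

	unwind:
		while (--i >= 0)	/* undo only what succeeded */
			unregister_one(&items[i]);
		return err;
	}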
err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; } -static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +static void nft_hooks_destroy(struct list_head *hook_list) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; + struct nftables_pernet *nft_net; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; int err; - err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, false); + err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, + extack, false); if (err < 0) return err; list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWFLOWTABLE || + trans->table != ctx->table || + !nft_trans_flowtable_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) { + err = -EEXIST; + goto err_flowtable_update_hook; + } } } @@ -7736,10 +9161,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -7756,9 +9184,9 @@ static int nf_tables_newflowtable(struct sk_buff *skb, u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; struct nft_flowtable *flowtable; - struct nft_hook *hook, *next; struct net *net = info->net; struct nft_table *table; + struct nft_trans *trans; struct nft_ctx ctx; int err; @@ -7774,7 +9202,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -7790,14 +9218,19 @@ static int nf_tables_newflowtable(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - return nft_flowtable_update(&ctx, info->nlh, flowtable); + return nft_flowtable_update(&ctx, info->nlh, 
flowtable, extack); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); - if (!flowtable) - return -ENOMEM; + if (!flowtable) { + err = -ENOMEM; + goto flowtable_alloc; + } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); @@ -7830,38 +9263,37 @@ static int nf_tables_newflowtable(struct sk_buff *skb, if (err < 0) goto err3; - err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, true); + err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, + extack, true); if (err < 0) - goto err4; + goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; + trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto err_flowtable_trans; + } + + /* This must be LAST to ensure no packets are walking over this flowtable. */ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); - if (err < 0) { - nft_flowtable_hooks_destroy(&flowtable->hook_list); - goto err4; - } - - err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; + goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); - table->use++; return 0; -err5: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - nft_unregister_flowtable_hook(net, flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } -err4: + +err_flowtable_hooks: + nft_trans_destroy(trans); +err_flowtable_trans: + nft_hooks_destroy(&flowtable->hook_list); +err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); @@ -7869,6 +9301,9 @@ err2: kfree(flowtable->name); err1: kfree(flowtable); +flowtable_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -7878,12 +9313,13 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); - kfree(this); + nft_netdev_hook_free(this); } } static int nft_delflowtable_hook(struct nft_ctx *ctx, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; @@ -7892,8 +9328,8 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, struct nft_trans *trans; int err; - err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, flowtable, false); + err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, + extack, false); if (err < 0) return err; @@ -7960,10 +9396,14 @@ static int nf_tables_delflowtable(struct sk_buff *skb, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } @@ -7971,7 +9411,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if 
 
@@ -7878,12 +9313,13 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook
 
 	list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
 		list_del(&this->list);
-		kfree(this);
+		nft_netdev_hook_free(this);
 	}
 }
 
 static int nft_delflowtable_hook(struct nft_ctx *ctx,
-				 struct nft_flowtable *flowtable)
+				 struct nft_flowtable *flowtable,
+				 struct netlink_ext_ack *extack)
 {
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_flowtable_hook flowtable_hook;
@@ -7892,8 +9328,8 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
 	struct nft_trans *trans;
 	int err;
 
-	err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
-				       &flowtable_hook, flowtable, false);
+	err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+				       extack, false);
 	if (err < 0)
 		return err;
 
@@ -7960,10 +9396,14 @@ static int nf_tables_delflowtable(struct sk_buff *skb,
 		flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
 	} else {
 		attr = nla[NFTA_FLOWTABLE_NAME];
-		flowtable = nft_flowtable_lookup(table, attr, genmask);
+		flowtable = nft_flowtable_lookup(net, table, attr, genmask);
 	}
 
 	if (IS_ERR(flowtable)) {
+		if (PTR_ERR(flowtable) == -ENOENT &&
+		    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
+			return 0;
+
 		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(flowtable);
 	}
@@ -7971,7 +9411,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb,
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	if (nla[NFTA_FLOWTABLE_HOOK])
-		return nft_delflowtable_hook(&ctx, flowtable);
+		return nft_delflowtable_hook(&ctx, flowtable, extack);
 
 	if (flowtable->use > 0) {
 		NL_SET_BAD_ATTR(extack, attr);
@@ -7991,17 +9431,26 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	struct nft_hook *hook;
 	struct nlmsghdr *nlh;
 
-	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
-			   NFNETLINK_V0, nft_base_seq(net));
+	nlh = nfnl_msg_put(skb, portid, seq,
+			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
 	if (!nlh)
 		goto nla_put_failure;
 
 	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
 	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
-	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
 	    nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
-			 NFTA_FLOWTABLE_PAD) ||
+			 NFTA_FLOWTABLE_PAD))
+		goto nla_put_failure;
+
+	if (!hook_list &&
+	    (event == NFT_MSG_DELFLOWTABLE ||
+	     event == NFT_MSG_DESTROYFLOWTABLE)) {
+		nlmsg_end(skb, nlh);
+		return 0;
+	}
+
+	if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
 	    nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
 		goto nla_put_failure;
 
@@ -8016,8 +9465,12 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	if (!nest_devs)
 		goto nla_put_failure;
 
-	list_for_each_entry_rcu(hook, hook_list, list) {
-		if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
+	if (!hook_list)
+		hook_list = &flowtable->hook_list;
+
+	list_for_each_entry_rcu(hook, hook_list, list,
+				lockdep_commit_lock_is_held(net)) {
+		if (nft_nla_put_hook_dev(skb, hook))
 			goto nla_put_failure;
 	}
 	nla_nest_end(skb, nest_devs);
@@ -8049,7 +9502,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 
 	rcu_read_lock();
 	nft_net = nft_pernet(net);
-	cb->seq = READ_ONCE(nft_net->base_seq);
+	cb->seq = nft_base_seq(net);
 
 	list_for_each_entry_rcu(table, &nft_net->tables, list) {
 		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8072,8 +9525,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 						  NFT_MSG_NEWFLOWTABLE,
 						  NLM_F_MULTI | NLM_F_APPEND,
 						  table->family,
-						  flowtable,
-						  &flowtable->hook_list) < 0)
+						  flowtable, NULL) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
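The dump path above samples the generation counter once (cb->seq = nft_base_seq(net)) and nl_dump_check_consistent() stamps every message with it, so userspace can detect a ruleset change in the middle of a dump. A small stand-alone C model of that generation-check idea (dump_ctx, base_seq and dump_one are hypothetical names):

    /* Sketch of generation-counter dump consistency; plain C model. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct dump_ctx {
    	uint32_t seq;		/* generation sampled at dump start */
    };

    static uint32_t base_seq = 1;	/* bumped on every committed transaction */

    static bool dump_one(struct dump_ctx *cb, int obj)
    {
    	if (cb->seq != base_seq) {
    		puts("dump interrupted: generation changed");
    		return false;	/* userspace would restart the dump */
    	}
    	printf("obj %d (gen %u)\n", obj, cb->seq);
    	return true;
    }

    int main(void)
    {
    	struct dump_ctx cb = { .seq = base_seq };

    	dump_one(&cb, 1);
    	base_seq++;		/* concurrent commit */
    	dump_one(&cb, 2);
    	return 0;
    }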
@@ -8128,6 +9580,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 				  const struct nfnl_info *info,
 				  const struct nlattr * const nla[])
 {
+	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	u8 family = info->nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
@@ -8153,13 +9606,17 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 
 	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
 				 genmask, 0);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
 		return PTR_ERR(table);
+	}
 
-	flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+	flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
 					 genmask);
-	if (IS_ERR(flowtable))
+	if (IS_ERR(flowtable)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
 		return PTR_ERR(flowtable);
+	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
@@ -8168,7 +9625,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 	err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
 					    info->nlh->nlmsg_seq,
 					    NFT_MSG_NEWFLOWTABLE, 0, family,
-					    flowtable, &flowtable->hook_list);
+					    flowtable, NULL);
 	if (err < 0)
 		goto err_fill_flowtable_info;
 
@@ -8181,8 +9638,7 @@ err_fill_flowtable_info:
 
 static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 				       struct nft_flowtable *flowtable,
-				       struct list_head *hook_list,
-				       int event)
+				       struct list_head *hook_list, int event)
 {
 	struct nftables_pernet *nft_net = nft_pernet(ctx->net);
 	struct sk_buff *skb;
@@ -8220,10 +9676,8 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 
 	flowtable->data.type->free(&flowtable->data);
 	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
-		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
-					    FLOW_BLOCK_UNBIND);
 		list_del_rcu(&hook->list);
-		kfree(hook);
+		nft_netdev_hook_free_rcu(hook);
 	}
 	kfree(flowtable->name);
 	module_put(flowtable->data.type->owner);
@@ -8233,17 +9687,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nlmsghdr *nlh;
 	char buf[TASK_COMM_LEN];
 	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
 
 	nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
-			   NFNETLINK_V0, nft_base_seq(net));
+			   NFNETLINK_V0, nft_base_seq_be16(net));
 	if (!nlh)
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
+	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
 	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
 	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
 		goto nla_put_failure;
@@ -8256,46 +9709,132 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static void nft_flowtable_event(unsigned long event, struct net_device *dev,
-				struct nft_flowtable *flowtable)
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+				      const struct net_device *dev)
+{
+	struct nf_hook_ops *ops;
+
+	list_for_each_entry(ops, &hook->ops_list, list) {
+		if (ops->dev == dev)
+			return ops;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops);
+
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+					  const struct net_device *dev)
 {
+	struct nf_hook_ops *ops;
+
+	list_for_each_entry_rcu(ops, &hook->ops_list, list) {
+		if (ops->dev == dev)
+			return ops;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
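With the ops_list used above, one configured nft_hook (which may be a wildcard pattern such as "eth+") can own several per-device nf_hook_ops, and nft_hook_find_ops() resolves the one bound to a given device. A simplified model using an array instead of kernel lists (struct names are illustrative):

    /* Model: one hook pattern fanning out to per-device ops. */
    #include <stddef.h>
    #include <stdio.h>

    struct net_device { char name[16]; };

    struct hook_ops { struct net_device *dev; };

    struct nft_hook_model {
    	char ifname[16];	/* possibly a wildcard like "eth+" */
    	struct hook_ops ops[4];
    	int nops;
    };

    static struct hook_ops *hook_find_ops(struct nft_hook_model *h,
    				      const struct net_device *dev)
    {
    	for (int i = 0; i < h->nops; i++)
    		if (h->ops[i].dev == dev)	/* match by device pointer */
    			return &h->ops[i];
    	return NULL;
    }

    int main(void)
    {
    	struct net_device eth0 = { "eth0" }, eth1 = { "eth1" };
    	struct nft_hook_model h = { .ifname = "eth+" };

    	h.ops[h.nops++] = (struct hook_ops){ &eth0 };
    	printf("eth0 %sregistered, eth1 %sregistered\n",
    	       hook_find_ops(&h, &eth0) ? "" : "not ",
    	       hook_find_ops(&h, &eth1) ? "" : "not ");
    	return 0;
    }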
+
+static int nft_flowtable_event(unsigned long event, struct net_device *dev,
+			       struct nft_flowtable *flowtable, bool changename)
+{
+	struct nf_hook_ops *ops;
 	struct nft_hook *hook;
+	bool match;
 
 	list_for_each_entry(hook, &flowtable->hook_list, list) {
-		if (hook->ops.dev != dev)
-			continue;
+		ops = nft_hook_find_ops(hook, dev);
+		match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
 
-		/* flow_offload_netdev_event() cleans up entries for us. */
-		nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
-		list_del_rcu(&hook->list);
-		kfree_rcu(hook, rcu);
+		switch (event) {
+		case NETDEV_UNREGISTER:
+			/* NOP if not found or new name still matching */
+			if (!ops || (changename && match))
+				continue;
+
+			/* flow_offload_netdev_event() cleans up entries for us. */
+			nft_unregister_flowtable_ops(dev_net(dev),
+						     flowtable, ops);
+			list_del_rcu(&ops->list);
+			kfree_rcu(ops, rcu);
+			break;
+		case NETDEV_REGISTER:
+			/* NOP if not matching or already registered */
+			if (!match || (changename && ops))
+				continue;
+
+			ops = kzalloc(sizeof(struct nf_hook_ops),
+				      GFP_KERNEL_ACCOUNT);
+			if (!ops)
+				return 1;
+
+			ops->pf		= NFPROTO_NETDEV;
+			ops->hooknum	= flowtable->hooknum;
+			ops->priority	= flowtable->data.priority;
+			ops->priv	= &flowtable->data;
+			ops->hook	= flowtable->data.type->hook;
+			ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+			ops->dev	= dev;
+			if (nft_register_flowtable_ops(dev_net(dev),
+						       flowtable, ops)) {
+				kfree(ops);
+				return 1;
+			}
+			list_add_tail_rcu(&ops->list, &hook->ops_list);
+			break;
+		}
 		break;
 	}
+	return 0;
+}
+
+static int __nf_tables_flowtable_event(unsigned long event,
+				       struct net_device *dev,
+				       bool changename)
+{
+	struct nftables_pernet *nft_net = nft_pernet(dev_net(dev));
+	struct nft_flowtable *flowtable;
+	struct nft_table *table;
+
+	list_for_each_entry(table, &nft_net->tables, list) {
+		list_for_each_entry(flowtable, &table->flowtables, list) {
+			if (nft_flowtable_event(event, dev,
+						flowtable, changename))
+				return 1;
+		}
+	}
+	return 0;
 }
 
 static int nf_tables_flowtable_event(struct notifier_block *this,
 				     unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct nft_flowtable *flowtable;
 	struct nftables_pernet *nft_net;
-	struct nft_table *table;
+	int ret = NOTIFY_DONE;
 	struct net *net;
 
-	if (event != NETDEV_UNREGISTER)
-		return 0;
+	if (event != NETDEV_REGISTER &&
+	    event != NETDEV_UNREGISTER &&
+	    event != NETDEV_CHANGENAME)
+		return NOTIFY_DONE;
 
 	net = dev_net(dev);
 	nft_net = nft_pernet(net);
 	mutex_lock(&nft_net->commit_mutex);
-	list_for_each_entry(table, &nft_net->tables, list) {
-		list_for_each_entry(flowtable, &table->flowtables, list) {
-			nft_flowtable_event(event, dev, flowtable);
+
+	if (event == NETDEV_CHANGENAME) {
+		if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
+			ret = NOTIFY_BAD;
+			goto out_unlock;
 		}
+		__nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
+	} else if (__nf_tables_flowtable_event(event, dev, false)) {
+		ret = NOTIFY_BAD;
 	}
+
+out_unlock:
 	mutex_unlock(&nft_net->commit_mutex);
-
-	return NOTIFY_DONE;
+	return ret;
 }
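The notifier above decomposes NETDEV_CHANGENAME into a REGISTER pass for hooks whose pattern matches the new name, followed by an UNREGISTER pass that drops stale ops whose pattern no longer matches. A rough sketch of that two-pass matching decision, assuming (as the strncmp in the diff suggests) that a trailing '+' marks a prefix wildcard; function names are hypothetical:

    /* Model of the CHANGENAME register/unregister decomposition. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static bool hook_matches(const char *pattern, const char *name)
    {
    	size_t n = strlen(pattern);

    	if (n && pattern[n - 1] == '+')		/* "eth+" matches "eth3" */
    		return !strncmp(pattern, name, n - 1);
    	return !strcmp(pattern, name);
    }

    static void changename(const char *pattern, bool registered,
    		       const char *newname)
    {
    	bool match = hook_matches(pattern, newname);

    	/* pass 1: REGISTER for hooks that now match the new name */
    	if (match && !registered)
    		printf("register ops for %s\n", newname);
    	/* pass 2: UNREGISTER ops whose pattern no longer matches */
    	if (!match && registered)
    		printf("unregister stale ops (%s)\n", pattern);
    }

    int main(void)
    {
    	changename("eth+", false, "eth3");	/* gains a hook */
    	changename("dummy+", true, "eth3");	/* loses its hook */
    	return 0;
    }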
 
 static struct notifier_block nf_tables_flowtable_notifier = {
@@ -8373,6 +9912,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},
+	[NFT_MSG_DESTROYTABLE] = {
+		.call		= nf_tables_deltable,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_TABLE_MAX,
+		.policy		= nft_table_policy,
+	},
 	[NFT_MSG_NEWCHAIN] = {
 		.call		= nf_tables_newchain,
 		.type		= NFNL_CB_BATCH,
@@ -8391,6 +9936,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_CHAIN_MAX,
 		.policy		= nft_chain_policy,
 	},
+	[NFT_MSG_DESTROYCHAIN] = {
+		.call		= nf_tables_delchain,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_CHAIN_MAX,
+		.policy		= nft_chain_policy,
+	},
 	[NFT_MSG_NEWRULE] = {
 		.call		= nf_tables_newrule,
 		.type		= NFNL_CB_BATCH,
@@ -8404,7 +9955,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_GETRULE_RESET] = {
-		.call		= nf_tables_getrule,
+		.call		= nf_tables_getrule_reset,
 		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
@@ -8415,6 +9966,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
+	[NFT_MSG_DESTROYRULE] = {
+		.call		= nf_tables_delrule,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_RULE_MAX,
+		.policy		= nft_rule_policy,
+	},
 	[NFT_MSG_NEWSET] = {
 		.call		= nf_tables_newset,
 		.type		= NFNL_CB_BATCH,
@@ -8433,6 +9990,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_SET_MAX,
 		.policy		= nft_set_policy,
 	},
+	[NFT_MSG_DESTROYSET] = {
+		.call		= nf_tables_delset,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_SET_MAX,
+		.policy		= nft_set_policy,
+	},
 	[NFT_MSG_NEWSETELEM] = {
 		.call		= nf_tables_newsetelem,
 		.type		= NFNL_CB_BATCH,
@@ -8445,12 +10008,24 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
+	[NFT_MSG_GETSETELEM_RESET] = {
+		.call		= nf_tables_getsetelem_reset,
+		.type		= NFNL_CB_RCU,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
 	[NFT_MSG_DELSETELEM] = {
 		.call		= nf_tables_delsetelem,
 		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
+	[NFT_MSG_DESTROYSETELEM] = {
+		.call		= nf_tables_delsetelem,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
+		.policy		= nft_set_elem_list_policy,
+	},
 	[NFT_MSG_GETGEN] = {
 		.call		= nf_tables_getgen,
 		.type		= NFNL_CB_RCU,
@@ -8473,8 +10048,14 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
+	[NFT_MSG_DESTROYOBJ] = {
+		.call		= nf_tables_delobj,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_OBJ_MAX,
+		.policy		= nft_obj_policy,
+	},
 	[NFT_MSG_GETOBJ_RESET] = {
-		.call		= nf_tables_getobj,
+		.call		= nf_tables_getobj_reset,
 		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
@@ -8497,6 +10078,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.attr_count	= NFTA_FLOWTABLE_MAX,
 		.policy		= nft_flowtable_policy,
 	},
+	[NFT_MSG_DESTROYFLOWTABLE] = {
+		.call		= nf_tables_delflowtable,
+		.type		= NFNL_CB_BATCH,
+		.attr_count	= NFTA_FLOWTABLE_MAX,
+		.policy		= nft_flowtable_policy,
+	},
 };
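Note that every DESTROY* entry added above reuses the corresponding DEL* handler; the behavioural difference lives in the lookup paths (see the delflowtable hunk earlier), where a missing object turns DESTROY into a no-op instead of an -ENOENT failure. A minimal model of that error squashing:

    /* Model: DESTROY tolerates a missing object, DEL does not. */
    #include <errno.h>
    #include <stdio.h>

    enum msg { MSG_DEL, MSG_DESTROY };

    static int lookup(const char *name)
    {
    	(void)name;
    	return -ENOENT;		/* object not found */
    }

    static int del_object(enum msg type, const char *name)
    {
    	int err = lookup(name);

    	if (err == -ENOENT && type == MSG_DESTROY)
    		return 0;	/* deleting a non-existent object is fine */
    	return err;
    }

    int main(void)
    {
    	printf("del: %d, destroy: %d\n",
    	       del_object(MSG_DEL, "ft"), del_object(MSG_DESTROY, "ft"));
    	return 0;
    }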
 
 static int nf_tables_validate(struct net *net)
@@ -8504,18 +10091,20 @@ static int nf_tables_validate(struct net *net)
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nft_table *table;
 
-	switch (nft_net->validate_state) {
-	case NFT_VALIDATE_SKIP:
-		break;
-	case NFT_VALIDATE_NEED:
-		nft_validate_state_update(net, NFT_VALIDATE_DO);
-		fallthrough;
-	case NFT_VALIDATE_DO:
-		list_for_each_entry(table, &nft_net->tables, list) {
+	list_for_each_entry(table, &nft_net->tables, list) {
+		switch (table->validate_state) {
+		case NFT_VALIDATE_SKIP:
+			continue;
+		case NFT_VALIDATE_NEED:
+			nft_validate_state_update(table, NFT_VALIDATE_DO);
+			fallthrough;
+		case NFT_VALIDATE_DO:
 			if (nft_table_validate(net, table) < 0)
 				return -EAGAIN;
+
+			nft_validate_state_update(table, NFT_VALIDATE_SKIP);
+			break;
 		}
-		break;
 	}
 
 	return 0;
 }
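nf_tables_validate() now keeps a tri-state per table instead of one per-netns state. A compact model of the state machine, with table_validate() standing in for the real per-table ruleset walk:

    /* Model of the per-table SKIP/NEED/DO validation states. */
    #include <stdio.h>

    enum validate_state { VALIDATE_SKIP, VALIDATE_NEED, VALIDATE_DO };

    struct table { const char *name; enum validate_state state; };

    static int table_validate(struct table *t)
    {
    	(void)t;
    	return 0;	/* chain/loop checks would go here */
    }

    static int validate_all(struct table *tables, int n)
    {
    	for (int i = 0; i < n; i++) {
    		switch (tables[i].state) {
    		case VALIDATE_SKIP:
    			continue;	/* untouched since last commit */
    		case VALIDATE_NEED:
    			tables[i].state = VALIDATE_DO;
    			/* fall through */
    		case VALIDATE_DO:
    			if (table_validate(&tables[i]) < 0)
    				return -1;
    			tables[i].state = VALIDATE_SKIP;
    		}
    	}
    	return 0;
    }

    int main(void)
    {
    	struct table t[] = { {"filter", VALIDATE_NEED}, {"nat", VALIDATE_SKIP} };
    	return validate_all(t, 2);
    }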
 
@@ -8528,51 +10117,53 @@ static int nf_tables_validate(struct net *net)
  *
  * We defer the drop policy until the transaction has been finalized.
  */
-static void nft_chain_commit_drop_policy(struct nft_trans *trans)
+static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans)
 {
 	struct nft_base_chain *basechain;
 
-	if (nft_trans_chain_policy(trans) != NF_DROP)
+	if (trans->policy != NF_DROP)
 		return;
 
-	if (!nft_is_base_chain(trans->ctx.chain))
+	if (!nft_is_base_chain(trans->chain))
 		return;
 
-	basechain = nft_base_chain(trans->ctx.chain);
+	basechain = nft_base_chain(trans->chain);
 	basechain->policy = NF_DROP;
 }
 
-static void nft_chain_commit_update(struct nft_trans *trans)
+static void nft_chain_commit_update(struct nft_trans_chain *trans)
 {
+	struct nft_table *table = trans->nft_trans_binding.nft_trans.table;
 	struct nft_base_chain *basechain;
 
-	if (nft_trans_chain_name(trans)) {
-		rhltable_remove(&trans->ctx.table->chains_ht,
-				&trans->ctx.chain->rhlhead,
+	if (trans->name) {
+		rhltable_remove(&table->chains_ht,
+				&trans->chain->rhlhead,
 				nft_chain_ht_params);
-		swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
-		rhltable_insert_key(&trans->ctx.table->chains_ht,
-				    trans->ctx.chain->name,
-				    &trans->ctx.chain->rhlhead,
+		swap(trans->chain->name, trans->name);
+		rhltable_insert_key(&table->chains_ht,
				    trans->chain->name,
+				    &trans->chain->rhlhead,
 				    nft_chain_ht_params);
 	}
 
-	if (!nft_is_base_chain(trans->ctx.chain))
+	if (!nft_is_base_chain(trans->chain))
 		return;
 
 	nft_chain_stats_replace(trans);
 
-	basechain = nft_base_chain(trans->ctx.chain);
+	basechain = nft_base_chain(trans->chain);
 
-	switch (nft_trans_chain_policy(trans)) {
+	switch (trans->policy) {
 	case NF_DROP:
 	case NF_ACCEPT:
-		basechain->policy = nft_trans_chain_policy(trans);
+		basechain->policy = trans->policy;
 		break;
 	}
 }
 
-static void nft_obj_commit_update(struct nft_trans *trans)
+static void nft_obj_commit_update(const struct nft_ctx *ctx,
+				  struct nft_trans *trans)
 {
 	struct nft_object *newobj;
 	struct nft_object *obj;
@@ -8580,60 +10171,76 @@ static void nft_obj_commit_update(struct nft_trans *trans)
 	obj = nft_trans_obj(trans);
 	newobj = nft_trans_obj_newobj(trans);
 
-	if (obj->ops->update)
-		obj->ops->update(obj, newobj);
+	if (WARN_ON_ONCE(!obj->ops->update))
+		return;
 
-	nft_obj_destroy(&trans->ctx, newobj);
+	obj->ops->update(obj, newobj);
+	nft_obj_destroy(ctx, newobj);
 }
 
 static void nft_commit_release(struct nft_trans *trans)
 {
+	struct nft_ctx ctx = {
+		.net = trans->net,
+	};
+
+	nft_ctx_update(&ctx, trans);
+
 	switch (trans->msg_type) {
 	case NFT_MSG_DELTABLE:
-		nf_tables_table_destroy(&trans->ctx);
+	case NFT_MSG_DESTROYTABLE:
+		nf_tables_table_destroy(trans->table);
 		break;
 	case NFT_MSG_NEWCHAIN:
 		free_percpu(nft_trans_chain_stats(trans));
 		kfree(nft_trans_chain_name(trans));
 		break;
 	case NFT_MSG_DELCHAIN:
-		nf_tables_chain_destroy(&trans->ctx);
+	case NFT_MSG_DESTROYCHAIN:
+		if (nft_trans_chain_update(trans))
+			nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+		else
+			nf_tables_chain_destroy(nft_trans_chain(trans));
 		break;
 	case NFT_MSG_DELRULE:
-		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+	case NFT_MSG_DESTROYRULE:
+		nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
 		break;
 	case NFT_MSG_DELSET:
-		nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+	case NFT_MSG_DESTROYSET:
+		nft_set_destroy(&ctx, nft_trans_set(trans));
 		break;
 	case NFT_MSG_DELSETELEM:
-		nf_tables_set_elem_destroy(&trans->ctx,
-					   nft_trans_elem_set(trans),
-					   nft_trans_elem(trans).priv);
+	case NFT_MSG_DESTROYSETELEM:
+		nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans));
 		break;
 	case NFT_MSG_DELOBJ:
-		nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+	case NFT_MSG_DESTROYOBJ:
+		nft_obj_destroy(&ctx, nft_trans_obj(trans));
 		break;
 	case NFT_MSG_DELFLOWTABLE:
+	case NFT_MSG_DESTROYFLOWTABLE:
 		if (nft_trans_flowtable_update(trans))
-			nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+			nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
 		else
 			nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
 		break;
 	}
 
 	if (trans->put_net)
-		put_net(trans->ctx.net);
+		put_net(trans->net);
 
 	kfree(trans);
 }
 
 static void nf_tables_trans_destroy_work(struct work_struct *w)
 {
+	struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
 	struct nft_trans *trans, *next;
 	LIST_HEAD(head);
 
 	spin_lock(&nf_tables_destroy_list_lock);
-	list_splice_init(&nf_tables_destroy_list, &head);
+	list_splice_init(&nft_net->destroy_list, &head);
 	spin_unlock(&nf_tables_destroy_list_lock);
 
 	if (list_empty(&head))
@@ -8642,14 +10249,16 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
 
 	synchronize_rcu();
 
 	list_for_each_entry_safe(trans, next, &head, list) {
-		list_del(&trans->list);
+		nft_trans_list_del(trans);
 		nft_commit_release(trans);
 	}
 }
 
-void nf_tables_trans_destroy_flush_work(void)
+void nf_tables_trans_destroy_flush_work(struct net *net)
 {
-	flush_work(&trans_destroy_work);
+	struct nftables_pernet *nft_net = nft_pernet(net);
+
+	flush_work(&nft_net->destroy_work);
 }
 EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
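nf_tables_trans_destroy_work() above splices the (now per-netns) destroy list under a spinlock and frees the transactions only after an RCU grace period. A single-threaded sketch of that splice-then-free shape — a plain function stands in for the scheduled work item, and a comment marks where synchronize_rcu() sits:

    /* Model of deferred transaction release via a destroy list. */
    #include <stdio.h>
    #include <stdlib.h>

    struct trans { struct trans *next; int id; };

    static struct trans *destroy_list;	/* per-netns in the diff */

    static void queue_destroy(struct trans *t)
    {
    	/* under nf_tables_destroy_list_lock in the kernel */
    	t->next = destroy_list;
    	destroy_list = t;
    }

    static void destroy_work_fn(void)
    {
    	struct trans *head = destroy_list;	/* splice under the lock */
    	destroy_list = NULL;

    	/* synchronize_rcu() here: no reader can still see these */
    	while (head) {
    		struct trans *next = head->next;
    		printf("release trans %d\n", head->id);
    		free(head);
    		head = next;
    	}
    }

    int main(void)
    {
    	for (int i = 0; i < 3; i++) {
    		struct trans *t = malloc(sizeof(*t));
    		if (!t)
    			return 1;
    		t->id = i;
    		queue_destroy(t);
    	}
    	destroy_work_fn();	/* scheduled via schedule_work() in the diff */
    	return 0;
    }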
 
@@ -8680,9 +10289,8 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
 			return -ENOMEM;
 		}
 	}
-	data_size += offsetof(struct nft_rule_dp, data);	/* last rule */
 
-	chain->blob_next = nf_tables_chain_alloc_rules(data_size);
+	chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
 	if (!chain->blob_next)
 		return -ENOMEM;
 
@@ -8709,7 +10317,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
 				continue;
 			}
 
-			if (WARN_ON_ONCE(data + expr->ops->size > data_boundary))
+			if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
 				return -ENOMEM;
 
 			memcpy(data + size, expr, expr->ops->size);
@@ -8727,12 +10335,11 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
 		chain->blob_next->size += (unsigned long)(data - (void *)prule);
 	}
 
-	prule = (struct nft_rule_dp *)data;
-	data += offsetof(struct nft_rule_dp, data);
 	if (WARN_ON_ONCE(data > data_boundary))
 		return -ENOMEM;
 
-	nft_last_rule(chain->blob_next, prule);
+	prule = (struct nft_rule_dp *)data;
+	nft_last_rule(chain, prule);
 
 	return 0;
 }
@@ -8743,32 +10350,32 @@ static void nf_tables_commit_chain_prepare_cancel(struct net *net)
 	struct nft_trans *trans, *next;
 
 	list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
-		struct nft_chain *chain = trans->ctx.chain;
-
 		if (trans->msg_type == NFT_MSG_NEWRULE ||
 		    trans->msg_type == NFT_MSG_DELRULE) {
+			struct nft_chain *chain = nft_trans_rule_chain(trans);
+
 			kvfree(chain->blob_next);
 			chain->blob_next = NULL;
 		}
 	}
 }
 
-static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+static void __nf_tables_commit_chain_free_rules(struct rcu_head *h)
 {
-	struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+	struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h);
 
-	kvfree(o->blob);
+	kvfree(l->blob);
 }
 
 static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
 {
-	struct nft_rules_old *old;
+	struct nft_rule_dp_last *last;
 
-	/* rcu_head is after end marker */
-	old = (void *)blob + sizeof(*blob) + blob->size;
-	old->blob = blob;
+	/* last rule trailer is after end marker */
+	last = (void *)blob + sizeof(*blob) + blob->size;
+	last->blob = blob;
 
-	call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+	call_rcu(&last->h, __nf_tables_commit_chain_free_rules);
 }
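The nft_rule_dp_last rename above keeps the old trick: the bookkeeping needed to free a rule blob (the blob back-pointer, plus an rcu_head in the kernel) lives in a trailer placed directly after the blob's end marker, so freeing the old generation needs no extra allocation. An illustrative layout model (alignment handling omitted for brevity):

    /* Model: free-time metadata stored in a trailer after the blob. */
    #include <stdlib.h>

    struct blob { size_t size; unsigned char data[]; };
    struct blob_trailer { struct blob *blob; };	/* + rcu_head in the kernel */

    static struct blob *blob_alloc(size_t rules_size)
    {
    	struct blob *b = malloc(sizeof(*b) + rules_size +
    				sizeof(struct blob_trailer));
    	if (b)
    		b->size = rules_size;
    	return b;
    }

    static struct blob_trailer *blob_trailer(struct blob *b)
    {
    	/* trailer sits right past the payload / end marker */
    	return (struct blob_trailer *)(b->data + b->size);
    }

    int main(void)
    {
    	struct blob *b = blob_alloc(64);

    	if (!b)
    		return 1;
    	blob_trailer(b)->blob = b;	/* what call_rcu() would later free */
    	free(blob_trailer(b)->blob);
    	return 0;
    }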
 
 static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
@@ -8834,6 +10441,243 @@ void nft_chain_del(struct nft_chain *chain)
 	list_del_rcu(&chain->list);
 }
 
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+					struct nft_trans_gc *trans)
+{
+	struct nft_elem_priv **priv = trans->priv;
+	unsigned int i;
+
+	for (i = 0; i < trans->count; i++) {
+		nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]);
+		nft_setelem_remove(ctx->net, trans->set, priv[i]);
+	}
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+	nft_set_put(trans->set);
+	put_net(trans->net);
+	kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+	struct nft_elem_priv *elem_priv;
+	struct nft_trans_gc *trans;
+	struct nft_ctx ctx = {};
+	unsigned int i;
+
+	trans = container_of(rcu, struct nft_trans_gc, rcu);
+	ctx.net = read_pnet(&trans->set->net);
+
+	for (i = 0; i < trans->count; i++) {
+		elem_priv = trans->priv[i];
+		if (!nft_setelem_is_catchall(trans->set, elem_priv))
+			atomic_dec(&trans->set->nelems);
+
+		nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
+	}
+
+	nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+	struct nftables_pernet *nft_net;
+	struct nft_ctx ctx = {};
+
+	nft_net = nft_pernet(trans->net);
+
+	mutex_lock(&nft_net->commit_mutex);
+
+	/* Check for race with transaction, otherwise this batch refers to
+	 * stale objects that might not be there anymore. Skip transaction if
+	 * set has been destroyed from control plane transaction in case gc
+	 * worker loses race.
+	 */
+	if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+		mutex_unlock(&nft_net->commit_mutex);
+		return false;
+	}
+
+	ctx.net = trans->net;
+	ctx.table = trans->set->table;
+
+	nft_trans_gc_setelem_remove(&ctx, trans);
+	mutex_unlock(&nft_net->commit_mutex);
+
+	return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+	struct nft_trans_gc *trans, *next;
+	LIST_HEAD(trans_gc_list);
+
+	spin_lock(&nf_tables_gc_list_lock);
+	list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+	spin_unlock(&nf_tables_gc_list_lock);
+
+	list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+		list_del(&trans->list);
+		if (!nft_trans_gc_work_done(trans)) {
+			nft_trans_gc_destroy(trans);
+			continue;
+		}
+		call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+	}
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+					unsigned int gc_seq, gfp_t gfp)
+{
+	struct net *net = read_pnet(&set->net);
+	struct nft_trans_gc *trans;
+
+	trans = kzalloc(sizeof(*trans), gfp);
+	if (!trans)
+		return NULL;
+
+	trans->net = maybe_get_net(net);
+	if (!trans->net) {
+		kfree(trans);
+		return NULL;
+	}
+
+	refcount_inc(&set->refs);
+	trans->set = set;
+	trans->seq = gc_seq;
+
+	return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+	trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+	spin_lock(&nf_tables_gc_list_lock);
+	list_add_tail(&trans->list, &nf_tables_gc_list);
+	spin_unlock(&nf_tables_gc_list_lock);
+
+	schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+	return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+					      unsigned int gc_seq, gfp_t gfp)
+{
+	struct nft_set *set;
+
+	if (nft_trans_gc_space(gc))
+		return gc;
+
+	set = gc->set;
+	nft_trans_gc_queue_work(gc);
+
+	return nft_trans_gc_alloc(set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+	if (trans->count == 0) {
+		nft_trans_gc_destroy(trans);
+		return;
+	}
+
+	nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+	struct nft_set *set;
+
+	if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+		return NULL;
+
+	if (nft_trans_gc_space(gc))
+		return gc;
+
+	set = gc->set;
+	call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+	return nft_trans_gc_alloc(set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+	WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+	if (trans->count == 0) {
+		nft_trans_gc_destroy(trans);
+		return;
+	}
+
+	call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
+						 unsigned int gc_seq)
+{
+	struct nft_set_elem_catchall *catchall;
+	const struct nft_set *set = gc->set;
+	struct nft_set_ext *ext;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+
+		if (!nft_set_elem_expired(ext))
+			continue;
+		if (nft_set_elem_is_dead(ext))
+			goto dead_elem;
+
+		nft_set_elem_dead(ext);
+dead_elem:
+		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+		if (!gc)
+			return NULL;
+
+		nft_trans_gc_elem_add(gc, catchall->elem);
+	}
+
+	return gc;
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
+{
+	struct nft_set_elem_catchall *catchall, *next;
+	u64 tstamp = nft_net_tstamp(gc->net);
+	const struct nft_set *set = gc->set;
+	struct nft_elem_priv *elem_priv;
+	struct nft_set_ext *ext;
+
+	WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));
+
+	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+
+		if (!__nft_set_elem_expired(ext, tstamp))
+			continue;
+
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+		if (!gc)
+			return NULL;
+
+		elem_priv = catchall->elem;
+		nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
+		nft_setelem_catchall_destroy(catchall);
+		nft_trans_gc_elem_add(gc, elem_priv);
+	}
+
+	return gc;
+}
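The helpers above collect expired elements into fixed-capacity GC batches: when a batch is full it is queued for the worker (async) or handed to call_rcu (sync), and a fresh batch is allocated. A sketch of that fill/flush cycle; the capacity constant and the printf stand in for the real queueing and RCU machinery:

    /* Model of fixed-capacity GC batching. */
    #include <stdio.h>
    #include <stdlib.h>

    #define GC_BATCHCOUNT 4		/* illustrative; kernel value differs */

    struct gc_batch { unsigned int count; void *priv[GC_BATCHCOUNT]; };

    static struct gc_batch *gc_alloc(void)
    {
    	return calloc(1, sizeof(struct gc_batch));
    }

    static int gc_space(const struct gc_batch *gc)
    {
    	return GC_BATCHCOUNT - gc->count;
    }

    static struct gc_batch *gc_queue(struct gc_batch *gc)
    {
    	if (gc_space(gc))
    		return gc;		/* still room, keep filling */
    	printf("flushing batch of %u\n", gc->count);
    	free(gc);			/* stand-in for queue + RCU free */
    	return gc_alloc();
    }

    int main(void)
    {
    	struct gc_batch *gc = gc_alloc();

    	for (int i = 0; i < 10 && gc; i++) {
    		gc = gc_queue(gc);	/* flush first if full */
    		if (gc)
    			gc->priv[gc->count++] = NULL;	/* expired element */
    	}
    	free(gc);
    	return 0;
    }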
 
 static void nf_tables_module_autoload_cleanup(struct net *net)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
@@ -8867,16 +10711,16 @@ static void nf_tables_commit_release(struct net *net)
 
 	trans = list_last_entry(&nft_net->commit_list,
 				struct nft_trans, list);
-	get_net(trans->ctx.net);
+	get_net(trans->net);
 	WARN_ON_ONCE(trans->put_net);
 
 	trans->put_net = true;
 	spin_lock(&nf_tables_destroy_list_lock);
-	list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
+	list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
 	spin_unlock(&nf_tables_destroy_list_lock);
 
 	nf_tables_module_autoload_cleanup(net);
-	schedule_work(&trans_destroy_work);
+	schedule_work(&nft_net->destroy_work);
 
 	mutex_unlock(&nft_net->commit_mutex);
 }
@@ -8944,9 +10788,24 @@ static void nf_tables_commit_audit_free(struct list_head *adl)
 	}
 }
 
+/* nft audit emits the number of elements that get added/removed/updated,
+ * so NEW/DELSETELEM needs to increment based on the total elem count.
+ */
+static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans)
+{
+	switch (trans->msg_type) {
+	case NFT_MSG_NEWSETELEM:
+	case NFT_MSG_DELSETELEM:
+		return nft_trans_container_elem(trans)->nelems;
+	}
+
+	return 1;
+}
+
 static void nf_tables_commit_audit_collect(struct list_head *adl,
-					   struct nft_table *table, u32 op)
+					   const struct nft_trans *trans, u32 op)
 {
+	const struct nft_table *table = trans->table;
 	struct nft_audit_data *adp;
 
 	list_for_each_entry(adp, adl, list) {
@@ -8956,7 +10815,7 @@ static void nf_tables_commit_audit_collect(struct list_head *adl,
 	WARN_ONCE(1, "table=%s not expected in commit list", table->name);
 	return;
 found:
-	adp->entries++;
+	adp->entries += nf_tables_commit_audit_entrycount(trans);
 	if (!adp->op || adp->op > op)
 		adp->op = op;
 }
@@ -8978,14 +10837,48 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
 	}
 }
 
+static void nft_set_commit_update(struct list_head *set_update_list)
+{
+	struct nft_set *set, *next;
+
+	list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+		list_del_init(&set->pending_update);
+
+		if (!set->ops->commit || set->dead)
+			continue;
+
+		set->ops->commit(set);
+	}
+}
+
+static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net)
+{
+	unsigned int gc_seq;
+
+	/* Bump gc counter, it becomes odd, this is the busy mark. */
+	gc_seq = READ_ONCE(nft_net->gc_seq);
+	WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
+	return gc_seq;
+}
+
+static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq)
+{
+	WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+}
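nft_gc_seq_begin()/nft_gc_seq_end() implement a seqcount-style busy mark: the commit path bumps the counter to an odd value and back to even, and nft_trans_gc_work_done() earlier discards any batch whose sampled sequence is stale. A compressed model of that handshake:

    /* Model of the odd/even GC sequence busy mark. */
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int gc_seq;

    static unsigned int gc_seq_begin(void) { return ++gc_seq; }  /* odd: busy */
    static void gc_seq_end(void) { ++gc_seq; }                   /* even again */

    static bool gc_batch_still_valid(unsigned int sampled)
    {
    	/* checked under the commit mutex in the kernel */
    	return sampled == gc_seq;
    }

    int main(void)
    {
    	unsigned int sampled = gc_seq;		/* worker samples: 0 */
    	unsigned int busy = gc_seq_begin();	/* commit: 1 (odd) */

    	gc_seq_end();				/* commit done: 2 */
    	printf("batch@%u valid: %d (busy mark was %u)\n",
    	       sampled, gc_batch_still_valid(sampled), busy);
    	return 0;
    }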
 
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
+	const struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	struct nft_trans_binding *trans_binding;
 	struct nft_trans *trans, *next;
+	unsigned int base_seq, gc_seq;
+	LIST_HEAD(set_update_list);
 	struct nft_trans_elem *te;
 	struct nft_chain *chain;
 	struct nft_table *table;
-	unsigned int base_seq;
+	struct nft_ctx ctx;
 	LIST_HEAD(adl);
 	int err;
 
@@ -8994,9 +10887,38 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		return 0;
 	}
 
+	nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL);
+
+	list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) {
+		trans = &trans_binding->nft_trans;
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWSET:
+			if (!nft_trans_set_update(trans) &&
+			    nft_set_is_anonymous(nft_trans_set(trans)) &&
+			    !nft_trans_set_bound(trans)) {
+				pr_warn_once("nftables ruleset with unbound set\n");
+				return -EINVAL;
+			}
+			break;
+		case NFT_MSG_NEWCHAIN:
+			if (!nft_trans_chain_update(trans) &&
+			    nft_chain_binding(nft_trans_chain(trans)) &&
+			    !nft_trans_chain_bound(trans)) {
+				pr_warn_once("nftables ruleset with unbound chain\n");
+				return -EINVAL;
+			}
+			break;
+		default:
+			WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type);
+			break;
+		}
+	}
+
 	/* 0. Validate ruleset, otherwise roll back for error reporting. */
-	if (nf_tables_validate(net) < 0)
+	if (nf_tables_validate(net) < 0) {
+		nft_net->validate_state = NFT_VALIDATE_DO;
 		return -EAGAIN;
+	}
 
 	err = nft_flow_rule_offload_commit(net);
 	if (err < 0)
@@ -9004,9 +10926,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 
 	/* 1. Allocate space for next generation rules_gen_X[] */
 	list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+		struct nft_table *table = trans->table;
 		int ret;
 
-		ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table);
+		ret = nf_tables_commit_audit_alloc(&adl, table);
 		if (ret) {
 			nf_tables_commit_chain_prepare_cancel(net);
 			nf_tables_commit_audit_free(&adl);
@@ -9014,7 +10937,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		}
 		if (trans->msg_type == NFT_MSG_NEWRULE ||
 		    trans->msg_type == NFT_MSG_DELRULE) {
-			chain = trans->ctx.chain;
+			chain = nft_trans_rule_chain(trans);
 
 			ret = nf_tables_commit_chain_prepare(net, chain);
 			if (ret < 0) {
@@ -9035,86 +10958,109 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	 * Bump generation counter, invalidate any dump in progress.
 	 * Cannot fail after this point.
 	 */
-	base_seq = READ_ONCE(nft_net->base_seq);
+	base_seq = nft_base_seq(net);
 	while (++base_seq == 0)
 		;
 
-	WRITE_ONCE(nft_net->base_seq, base_seq);
+	/* pairs with smp_load_acquire in nft_lookup_eval */
+	smp_store_release(&net->nft.base_seq, base_seq);
+
+	gc_seq = nft_gc_seq_begin(nft_net);
 
 	/* step 3. Start new generation, rules_gen_X now in use. */
 	net->nft.gencursor = nft_gencursor_next(net);
 
 	list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
-		nf_tables_commit_audit_collect(&adl, trans->ctx.table,
-					       trans->msg_type);
+		struct nft_table *table = trans->table;
+
+		nft_ctx_update(&ctx, trans);
+
+		nf_tables_commit_audit_collect(&adl, trans, trans->msg_type);
 		switch (trans->msg_type) {
 		case NFT_MSG_NEWTABLE:
 			if (nft_trans_table_update(trans)) {
-				if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+				if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
 					nft_trans_destroy(trans);
 					break;
 				}
-				if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT)
-					nf_tables_table_disable(net, trans->ctx.table);
+				if (table->flags & NFT_TABLE_F_DORMANT)
+					nf_tables_table_disable(net, table);
 
-				trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
+				table->flags &= ~__NFT_TABLE_F_UPDATE;
 			} else {
-				nft_clear(net, trans->ctx.table);
+				nft_clear(net, table);
 			}
-			nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+			nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELTABLE:
-			list_del_rcu(&trans->ctx.table->list);
-			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+		case NFT_MSG_DESTROYTABLE:
+			list_del_rcu(&table->list);
+			nf_tables_table_notify(&ctx, trans->msg_type);
 			break;
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
-				nft_chain_commit_update(trans);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nft_chain_commit_update(nft_trans_container_chain(trans));
+				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
+						       &nft_trans_chain_hooks(trans));
+				list_splice(&nft_trans_chain_hooks(trans),
+					    &nft_trans_basechain(trans)->hook_list);
 				/* trans destroyed after rcu grace period */
 			} else {
-				nft_chain_commit_drop_policy(trans);
-				nft_clear(net, trans->ctx.chain);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
+				nft_clear(net, nft_trans_chain(trans));
+				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
 				nft_trans_destroy(trans);
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
-			nft_chain_del(trans->ctx.chain);
-			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
-			nf_tables_unregister_hook(trans->ctx.net,
-						  trans->ctx.table,
-						  trans->ctx.chain);
+		case NFT_MSG_DESTROYCHAIN:
+			if (nft_trans_chain_update(trans)) {
+				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+						       &nft_trans_chain_hooks(trans));
+				if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+					nft_netdev_unregister_hooks(net,
+								    &nft_trans_chain_hooks(trans),
+								    true);
+				}
+			} else {
+				nft_chain_del(nft_trans_chain(trans));
+				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+						       NULL);
+				nf_tables_unregister_hook(ctx.net, ctx.table,
							  nft_trans_chain(trans));
+			}
 			break;
 		case NFT_MSG_NEWRULE:
-			nft_clear(trans->ctx.net, nft_trans_rule(trans));
-			nf_tables_rule_notify(&trans->ctx,
-					      nft_trans_rule(trans),
+			nft_clear(net, nft_trans_rule(trans));
+			nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
 					      NFT_MSG_NEWRULE);
-			if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+			if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
 				nft_flow_rule_destroy(nft_trans_flow_rule(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELRULE:
+		case NFT_MSG_DESTROYRULE:
 			list_del_rcu(&nft_trans_rule(trans)->list);
-			nf_tables_rule_notify(&trans->ctx,
-					      nft_trans_rule(trans),
-					      NFT_MSG_DELRULE);
-			nft_rule_expr_deactivate(&trans->ctx,
-						 nft_trans_rule(trans),
+			nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
+					      trans->msg_type);
+			nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans),
						 NFT_TRANS_COMMIT);
 
-			if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+			if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
 				nft_flow_rule_destroy(nft_trans_flow_rule(trans));
 			break;
 		case NFT_MSG_NEWSET:
+			list_del(&nft_trans_container_set(trans)->list_trans_newset);
 			if (nft_trans_set_update(trans)) {
 				struct nft_set *set = nft_trans_set(trans);
 
 				WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
 				WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));
+
+				if (nft_trans_set_size(trans))
+					WRITE_ONCE(set->size, nft_trans_set_size(trans));
 			} else {
 				nft_clear(net, nft_trans_set(trans));
 				/* This avoids hitting -EBUSY when deleting the table
@@ -9122,62 +11068,68 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 				 */
 				if (nft_set_is_anonymous(nft_trans_set(trans)) &&
 				    !list_empty(&nft_trans_set(trans)->bindings))
-					trans->ctx.table->use--;
+					nft_use_dec(&table->use);
 			}
-			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+			nf_tables_set_notify(&ctx, nft_trans_set(trans),
 					     NFT_MSG_NEWSET, GFP_KERNEL);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELSET:
+		case NFT_MSG_DESTROYSET:
+			nft_trans_set(trans)->dead = 1;
 			list_del_rcu(&nft_trans_set(trans)->list);
-			nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
-					     NFT_MSG_DELSET, GFP_KERNEL);
+			nf_tables_set_notify(&ctx, nft_trans_set(trans),
					     trans->msg_type, GFP_KERNEL);
 			break;
 		case NFT_MSG_NEWSETELEM:
-			te = (struct nft_trans_elem *)trans->data;
+			te = nft_trans_container_elem(trans);
+
+			nft_trans_elems_add(&ctx, te);
 
-			nft_setelem_activate(net, te->set, &te->elem);
-			nf_tables_setelem_notify(&trans->ctx, te->set,
-						 &te->elem,
-						 NFT_MSG_NEWSETELEM);
+			if (te->set->ops->commit &&
+			    list_empty(&te->set->pending_update)) {
+				list_add_tail(&te->set->pending_update,
+					      &set_update_list);
+			}
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELSETELEM:
-			te = (struct nft_trans_elem *)trans->data;
-
-			nf_tables_setelem_notify(&trans->ctx, te->set,
-						 &te->elem,
-						 NFT_MSG_DELSETELEM);
-			nft_setelem_remove(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem)) {
-				atomic_dec(&te->set->nelems);
-				te->set->ndeact--;
+		case NFT_MSG_DESTROYSETELEM:
+			te = nft_trans_container_elem(trans);
+
+			nft_trans_elems_remove(&ctx, te);
+
+			if (te->set->ops->commit &&
+			    list_empty(&te->set->pending_update)) {
+				list_add_tail(&te->set->pending_update,
+					      &set_update_list);
 			}
 			break;
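Both element cases above queue the set on set_update_list at most once (guarded by list_empty(&set->pending_update)), and nft_set_commit_update() later invokes each set's ->commit() exactly once per transaction. A stand-alone model of that deduplicated batching (pipapo_commit is a made-up callback name):

    /* Model: queue each set once, run ->commit() once per transaction. */
    #include <stdbool.h>
    #include <stdio.h>

    struct set {
    	const char *name;
    	bool pending;	/* stands in for list_empty(&set->pending_update) */
    	void (*commit)(struct set *);
    };

    static void pipapo_commit(struct set *s) { printf("commit %s\n", s->name); }

    static struct set *queue[8];
    static int nqueued;

    static void queue_set_update(struct set *s)
    {
    	if (!s->commit || s->pending)
    		return;		/* no callback, or already queued */
    	s->pending = true;
    	queue[nqueued++] = s;
    }

    static void set_commit_update(void)
    {
    	for (int i = 0; i < nqueued; i++) {
    		queue[i]->pending = false;
    		queue[i]->commit(queue[i]);
    	}
    	nqueued = 0;
    }

    int main(void)
    {
    	struct set s = { "s", false, pipapo_commit };

    	queue_set_update(&s);	/* first NEWSETELEM */
    	queue_set_update(&s);	/* second element: no duplicate */
    	set_commit_update();	/* prints once */
    	return 0;
    }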
 		case NFT_MSG_NEWOBJ:
 			if (nft_trans_obj_update(trans)) {
-				nft_obj_commit_update(trans);
-				nf_tables_obj_notify(&trans->ctx,
+				nft_obj_commit_update(&ctx, trans);
+				nf_tables_obj_notify(&ctx,
 						     nft_trans_obj(trans),
 						     NFT_MSG_NEWOBJ);
 			} else {
 				nft_clear(net, nft_trans_obj(trans));
-				nf_tables_obj_notify(&trans->ctx,
+				nf_tables_obj_notify(&ctx,
 						     nft_trans_obj(trans),
 						     NFT_MSG_NEWOBJ);
 				nft_trans_destroy(trans);
 			}
 			break;
 		case NFT_MSG_DELOBJ:
+		case NFT_MSG_DESTROYOBJ:
 			nft_obj_del(nft_trans_obj(trans));
-			nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
-					     NFT_MSG_DELOBJ);
+			nf_tables_obj_notify(&ctx, nft_trans_obj(trans),
					     trans->msg_type);
 			break;
 		case NFT_MSG_NEWFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
 				nft_trans_flowtable(trans)->data.flags =
 					nft_trans_flowtable_flags(trans);
-				nf_tables_flowtable_notify(&trans->ctx,
+				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
 							   &nft_trans_flowtable_hooks(trans),
 							   NFT_MSG_NEWFLOWTABLE);
 				list_splice(&nft_trans_flowtable_hooks(trans),
@@ -9185,37 +11137,45 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 					    &nft_trans_flowtable(trans)->hook_list);
 			} else {
 				nft_clear(net, nft_trans_flowtable(trans));
-				nf_tables_flowtable_notify(&trans->ctx,
+				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
-							   &nft_trans_flowtable(trans)->hook_list,
+							   NULL,
 							   NFT_MSG_NEWFLOWTABLE);
 			}
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELFLOWTABLE:
+		case NFT_MSG_DESTROYFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
-				nf_tables_flowtable_notify(&trans->ctx,
+				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
 							   &nft_trans_flowtable_hooks(trans),
-							   NFT_MSG_DELFLOWTABLE);
+							   trans->msg_type);
 				nft_unregister_flowtable_net_hooks(net,
+								   nft_trans_flowtable(trans),
 								   &nft_trans_flowtable_hooks(trans));
 			} else {
 				list_del_rcu(&nft_trans_flowtable(trans)->list);
-				nf_tables_flowtable_notify(&trans->ctx,
+				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
-							   &nft_trans_flowtable(trans)->hook_list,
-							   NFT_MSG_DELFLOWTABLE);
+							   NULL,
+							   trans->msg_type);
 				nft_unregister_flowtable_net_hooks(net,
+								   nft_trans_flowtable(trans),
 								   &nft_trans_flowtable(trans)->hook_list);
 			}
 			break;
 		}
 	}
 
+	nft_set_commit_update(&set_update_list);
+
 	nft_commit_notify(net, NETLINK_CB(skb).portid);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
-	nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+	nf_tables_commit_audit_log(&adl, nft_base_seq(net));
+
+	nft_gc_seq_end(nft_net, gc_seq);
+	nft_net->validate_state = NFT_VALIDATE_SKIP;
 	nf_tables_commit_release(net);
 
 	return 0;
@@ -9239,29 +11199,35 @@ static void nf_tables_module_autoload(struct net *net)
 
 static void nf_tables_abort_release(struct nft_trans *trans)
 {
+	struct nft_ctx ctx = { };
+
+	nft_ctx_update(&ctx, trans);
+
 	switch (trans->msg_type) {
 	case NFT_MSG_NEWTABLE:
-		nf_tables_table_destroy(&trans->ctx);
+		nf_tables_table_destroy(trans->table);
 		break;
 	case NFT_MSG_NEWCHAIN:
-		nf_tables_chain_destroy(&trans->ctx);
+		if (nft_trans_chain_update(trans))
+			nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+		else
+			nf_tables_chain_destroy(nft_trans_chain(trans));
 		break;
 	case NFT_MSG_NEWRULE:
-		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+		nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
 		break;
 	case NFT_MSG_NEWSET:
-		nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+		nft_set_destroy(&ctx, nft_trans_set(trans));
 		break;
 	case NFT_MSG_NEWSETELEM:
-		nft_set_elem_destroy(nft_trans_elem_set(trans),
-				     nft_trans_elem(trans).priv, true);
+		nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans));
 		break;
 	case NFT_MSG_NEWOBJ:
-		nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+		nft_obj_destroy(&ctx, nft_trans_obj(trans));
 		break;
 	case NFT_MSG_NEWFLOWTABLE:
 		if (nft_trans_flowtable_update(trans))
-			nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+			nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
 		else
 			nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
 		break;
@@ -9269,96 +11235,145 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 	kfree(trans);
 }
 
+static void nft_set_abort_update(struct list_head *set_update_list)
+{
+	struct nft_set *set, *next;
+
+	list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+		list_del_init(&set->pending_update);
+
+		if (!set->ops->abort)
+			continue;
+
+		set->ops->abort(set);
+	}
+}
+
 static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nft_trans *trans, *next;
+	LIST_HEAD(set_update_list);
 	struct nft_trans_elem *te;
+	struct nft_ctx ctx = {
+		.net = net,
+	};
+	int err = 0;
 
 	if (action == NFNL_ABORT_VALIDATE &&
 	    nf_tables_validate(net) < 0)
-		return -EAGAIN;
+		err = -EAGAIN;
 
 	list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
 					 list) {
+		struct nft_table *table = trans->table;
+
+		nft_ctx_update(&ctx, trans);
+
 		switch (trans->msg_type) {
 		case NFT_MSG_NEWTABLE:
 			if (nft_trans_table_update(trans)) {
-				if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
+				if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
 					nft_trans_destroy(trans);
 					break;
 				}
-				if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) {
-					nf_tables_table_disable(net, trans->ctx.table);
-					trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
-				} else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
-					trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT;
+				if (table->flags & __NFT_TABLE_F_WAS_DORMANT) {
+					nf_tables_table_disable(net, table);
+					table->flags |= NFT_TABLE_F_DORMANT;
+				} else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
+					table->flags &= ~NFT_TABLE_F_DORMANT;
 				}
-				trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
+				if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
+					table->flags &= ~NFT_TABLE_F_OWNER;
+					table->nlpid = 0;
+				}
+				table->flags &= ~__NFT_TABLE_F_UPDATE;
 				nft_trans_destroy(trans);
 			} else {
-				list_del_rcu(&trans->ctx.table->list);
+				list_del_rcu(&table->list);
 			}
 			break;
 		case NFT_MSG_DELTABLE:
-			nft_clear(trans->ctx.net, trans->ctx.table);
+		case NFT_MSG_DESTROYTABLE:
+			nft_clear(trans->net, table);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
+				if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+					nft_netdev_unregister_hooks(net,
+								    &nft_trans_chain_hooks(trans),
+								    true);
+				}
 				free_percpu(nft_trans_chain_stats(trans));
 				kfree(nft_trans_chain_name(trans));
 				nft_trans_destroy(trans);
 			} else {
-				if (nft_chain_is_bound(trans->ctx.chain)) {
+				if (nft_trans_chain_bound(trans)) {
 					nft_trans_destroy(trans);
 					break;
 				}
-				trans->ctx.table->use--;
-				nft_chain_del(trans->ctx.chain);
-				nf_tables_unregister_hook(trans->ctx.net,
-							  trans->ctx.table,
-							  trans->ctx.chain);
+				nft_use_dec_restore(&table->use);
+				nft_chain_del(nft_trans_chain(trans));
+				nf_tables_unregister_hook(trans->net, table,
							  nft_trans_chain(trans));
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
-			trans->ctx.table->use++;
-			nft_clear(trans->ctx.net, trans->ctx.chain);
+		case NFT_MSG_DESTROYCHAIN:
+			if (nft_trans_chain_update(trans)) {
+				list_splice(&nft_trans_chain_hooks(trans),
+					    &nft_trans_basechain(trans)->hook_list);
+			} else {
+				nft_use_inc_restore(&table->use);
+				nft_clear(trans->net, nft_trans_chain(trans));
+			}
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWRULE:
-			trans->ctx.chain->use--;
+			if (nft_trans_rule_bound(trans)) {
+				nft_trans_destroy(trans);
+				break;
+			}
+			nft_use_dec_restore(&nft_trans_rule_chain(trans)->use);
 			list_del_rcu(&nft_trans_rule(trans)->list);
-			nft_rule_expr_deactivate(&trans->ctx,
+			nft_rule_expr_deactivate(&ctx,
 						 nft_trans_rule(trans),
 						 NFT_TRANS_ABORT);
-			if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+			if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
 				nft_flow_rule_destroy(nft_trans_flow_rule(trans));
 			break;
 		case NFT_MSG_DELRULE:
-			trans->ctx.chain->use++;
-			nft_clear(trans->ctx.net, nft_trans_rule(trans));
-			nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
-			if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+		case NFT_MSG_DESTROYRULE:
+			nft_use_inc_restore(&nft_trans_rule_chain(trans)->use);
+			nft_clear(trans->net, nft_trans_rule(trans));
+			nft_rule_expr_activate(&ctx, nft_trans_rule(trans));
+			if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
 				nft_flow_rule_destroy(nft_trans_flow_rule(trans));
 
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSET:
+			list_del(&nft_trans_container_set(trans)->list_trans_newset);
 			if (nft_trans_set_update(trans)) {
 				nft_trans_destroy(trans);
 				break;
 			}
-			trans->ctx.table->use--;
+			nft_use_dec_restore(&table->use);
 			if (nft_trans_set_bound(trans)) {
 				nft_trans_destroy(trans);
 				break;
 			}
+			nft_trans_set(trans)->dead = 1;
 			list_del_rcu(&nft_trans_set(trans)->list);
 			break;
 		case NFT_MSG_DELSET:
-			trans->ctx.table->use++;
-			nft_clear(trans->ctx.net, nft_trans_set(trans));
+		case NFT_MSG_DESTROYSET:
+			nft_use_inc_restore(&table->use);
+			nft_clear(trans->net, nft_trans_set(trans));
+			if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+				nft_map_activate(&ctx, nft_trans_set(trans));
+
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSETELEM:
@@ -9366,85 +11381,108 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 				nft_trans_destroy(trans);
 				break;
 			}
-			te = (struct nft_trans_elem *)trans->data;
-			nft_setelem_remove(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem))
-				atomic_dec(&te->set->nelems);
+			te = nft_trans_container_elem(trans);
+			if (!nft_trans_elems_new_abort(&ctx, te)) {
+				nft_trans_destroy(trans);
+				break;
+			}
+
+			if (te->set->ops->abort &&
+			    list_empty(&te->set->pending_update)) {
+				list_add_tail(&te->set->pending_update,
+					      &set_update_list);
+			}
 			break;
 		case NFT_MSG_DELSETELEM:
-			te = (struct nft_trans_elem *)trans->data;
+		case NFT_MSG_DESTROYSETELEM:
+			te = nft_trans_container_elem(trans);
 
-			nft_setelem_data_activate(net, te->set, &te->elem);
-			nft_setelem_activate(net, te->set, &te->elem);
-			if (!nft_setelem_is_catchall(te->set, &te->elem))
-				te->set->ndeact--;
+			nft_trans_elems_destroy_abort(&ctx, te);
 
+			if (te->set->ops->abort &&
+			    list_empty(&te->set->pending_update)) {
+				list_add_tail(&te->set->pending_update,
+					      &set_update_list);
+			}
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWOBJ:
 			if (nft_trans_obj_update(trans)) {
-				nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans));
+				nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans));
 				nft_trans_destroy(trans);
 			} else {
-				trans->ctx.table->use--;
+				nft_use_dec_restore(&table->use);
 				nft_obj_del(nft_trans_obj(trans));
 			}
 			break;
 		case NFT_MSG_DELOBJ:
-			trans->ctx.table->use++;
-			nft_clear(trans->ctx.net, nft_trans_obj(trans));
+		case NFT_MSG_DESTROYOBJ:
+			nft_use_inc_restore(&table->use);
+			nft_clear(trans->net, nft_trans_obj(trans));
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
 				nft_unregister_flowtable_net_hooks(net,
+								   nft_trans_flowtable(trans),
 								   &nft_trans_flowtable_hooks(trans));
 			} else {
-				trans->ctx.table->use--;
+				nft_use_dec_restore(&table->use);
 				list_del_rcu(&nft_trans_flowtable(trans)->list);
 				nft_unregister_flowtable_net_hooks(net,
+								   nft_trans_flowtable(trans),
 								   &nft_trans_flowtable(trans)->hook_list);
 			}
 			break;
 		case NFT_MSG_DELFLOWTABLE:
+		case NFT_MSG_DESTROYFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
 				list_splice(&nft_trans_flowtable_hooks(trans),
 					    &nft_trans_flowtable(trans)->hook_list);
 			} else {
-				trans->ctx.table->use++;
-				nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+				nft_use_inc_restore(&table->use);
+				nft_clear(trans->net, nft_trans_flowtable(trans));
 			}
 			nft_trans_destroy(trans);
 			break;
 		}
 	}
 
+	WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+	nft_set_abort_update(&set_update_list);
+
 	synchronize_rcu();
 
 	list_for_each_entry_safe_reverse(trans, next,
 					 &nft_net->commit_list, list) {
-		list_del(&trans->list);
+		nft_trans_list_del(trans);
 		nf_tables_abort_release(trans);
 	}
 
-	if (action == NFNL_ABORT_AUTOLOAD)
-		nf_tables_module_autoload(net);
-	else
-		nf_tables_module_autoload_cleanup(net);
-
-	return 0;
-}
-
-static void nf_tables_cleanup(struct net *net)
-{
-	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+	return err;
 }
 
 static int nf_tables_abort(struct net *net, struct sk_buff *skb,
 			   enum nfnl_abort_action action)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
-	int ret = __nf_tables_abort(net, action);
+	unsigned int gc_seq;
+	int ret;
+
+	gc_seq = nft_gc_seq_begin(nft_net);
+	ret = __nf_tables_abort(net, action);
+	nft_gc_seq_end(nft_net, gc_seq);
+
+	WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+
+	/* module autoload needs to happen after GC sequence update because it
	 * temporarily releases and grabs mutex again.
	 */
+	if (action == NFNL_ABORT_AUTOLOAD)
+		nf_tables_module_autoload(net);
+	else
+		nf_tables_module_autoload_cleanup(net);
 
 	mutex_unlock(&nft_net->commit_mutex);
 
@@ -9457,8 +11495,9 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
 	bool genid_ok;
 
 	mutex_lock(&nft_net->commit_mutex);
+	nft_net->tstamp = get_jiffies_64();
 
-	genid_ok = genid == 0 || nft_net->base_seq == genid;
+	genid_ok = genid == 0 || nft_base_seq(net) == genid;
 	if (!genid_ok)
 		mutex_unlock(&nft_net->commit_mutex);
 
@@ -9473,7 +11512,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
-	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
 	.owner		= THIS_MODULE,
 };
@@ -9510,143 +11548,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
 }
 EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
 
-/*
- * Loop detection - walk through the ruleset beginning at the destination chain
- * of a new jump until either the source chain is reached (loop) or all
- * reachable chains have been traversed.
- *
- * The loop check is performed whenever a new jump verdict is added to an
- * expression or verdict map or a verdict map is bound to a new chain.
- */
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
-				 const struct nft_chain *chain);
-
-static int nft_check_loops(const struct nft_ctx *ctx,
-			   const struct nft_set_ext *ext)
-{
-	const struct nft_data *data;
-	int ret;
-
-	data = nft_set_ext_data(ext);
-	switch (data->verdict.code) {
-	case NFT_JUMP:
-	case NFT_GOTO:
-		ret = nf_tables_check_loops(ctx, data->verdict.chain);
-		break;
-	default:
-		ret = 0;
-		break;
-	}
-
-	return ret;
-}
-
-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
-					struct nft_set *set,
-					const struct nft_set_iter *iter,
-					struct nft_set_elem *elem)
-{
-	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
-
-	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
-	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
-		return 0;
-
-	return nft_check_loops(ctx, ext);
-}
-
-static int nft_set_catchall_loops(const struct nft_ctx *ctx,
-				  struct nft_set *set)
-{
-	u8 genmask = nft_genmask_next(ctx->net);
-	struct nft_set_elem_catchall *catchall;
-	struct nft_set_ext *ext;
-	int ret = 0;
-
-	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
-		ext = nft_set_elem_ext(set, catchall->elem);
-		if (!nft_set_elem_active(ext, genmask))
-			continue;
-
-		ret = nft_check_loops(ctx, ext);
-		if (ret < 0)
-			return ret;
-	}
-
-	return ret;
-}
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
-				 const struct nft_chain *chain)
-{
-	const struct nft_rule *rule;
-	const struct nft_expr *expr, *last;
-	struct nft_set *set;
-	struct nft_set_binding *binding;
-	struct nft_set_iter iter;
-
-	if (ctx->chain == chain)
-		return -ELOOP;
-
-	list_for_each_entry(rule, &chain->rules, list) {
-		nft_rule_for_each_expr(expr, last, rule) {
-			struct nft_immediate_expr *priv;
-			const struct nft_data *data;
-			int err;
-
-			if (strcmp(expr->ops->type->name, "immediate"))
-				continue;
-
-			priv = nft_expr_priv(expr);
-			if (priv->dreg != NFT_REG_VERDICT)
-				continue;
-
-			data = &priv->data;
-			switch (data->verdict.code) {
-			case NFT_JUMP:
-			case NFT_GOTO:
-				err = nf_tables_check_loops(ctx,
-							    data->verdict.chain);
-				if (err < 0)
-					return err;
-				break;
-			default:
-				break;
-			}
-		}
-	}
-
-	list_for_each_entry(set, &ctx->table->sets, list) {
-		if (!nft_is_active_next(ctx->net, set))
-			continue;
-		if (!(set->flags & NFT_SET_MAP) ||
-		    set->dtype != NFT_DATA_VERDICT)
-			continue;
-
-		list_for_each_entry(binding, &set->bindings, list) {
-			if (!(binding->flags & NFT_SET_MAP) ||
-			    binding->chain != chain)
-				continue;
-
-			iter.genmask	= nft_genmask_next(ctx->net);
-			iter.skip	= 0;
-			iter.count	= 0;
-			iter.err	= 0;
-			iter.fn		= nf_tables_loop_check_setelem;
-
-			set->ops->walk(ctx, set, &iter);
-			if (!iter.err)
-				iter.err = nft_set_catchall_loops(ctx, set);
-
-			if (iter.err < 0)
-				return iter.err;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * nft_parse_u32_check - fetch u32 attribute and check for maximum value
  *
@@ -9725,10 +11626,11 @@ static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
 	return 0;
 }
 
-int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
+int nft_parse_register_load(const struct nft_ctx *ctx,
+			    const struct nlattr *attr, u8 *sreg, u32 len)
 {
-	u32 reg;
-	int err;
+	int err, invalid_reg;
+	u32 reg, next_register;
 
 	err = nft_parse_register(attr, &reg);
 	if (err < 0)
@@ -9738,11 +11640,36 @@ int nft_parse_register_load(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
+	next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
+
+	/* Can't happen: nft_validate_register_load() should have failed */
+	if (WARN_ON_ONCE(next_register > NFT_REG32_NUM))
+		return -EINVAL;
+
+	/* find first register that did not see an earlier store. */
+	invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg);
+
+	/* invalid register within the range that we're loading from? */
+	if (invalid_reg < next_register)
+		return -ENODATA;
+
 	*sreg = reg;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nft_parse_register_load);
 
+static void nft_saw_register_store(const struct nft_ctx *__ctx,
+				   int reg, unsigned int len)
+{
+	unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+	struct nft_ctx *ctx = (struct nft_ctx *)__ctx;
+
+	if (WARN_ON_ONCE(len == 0 || reg < 0))
+		return;
+
+	bitmap_set(ctx->reg_inited, reg, registers);
+}
+
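nft_saw_register_store() above records which 32-bit registers have been written, and nft_parse_register_load() now rejects a read covering any register that never saw a store (-ENODATA). A self-contained model with a 32-bit mask standing in for the kernel bitmap:

    /* Model of register initialisation tracking for load validation. */
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define REG32_NUM  32
    #define REG32_SIZE 4

    static uint32_t reg_inited;	/* bit n set: register n saw a store */

    static void saw_store(unsigned int reg, unsigned int len)
    {
    	unsigned int n = (len + REG32_SIZE - 1) / REG32_SIZE;

    	while (n--)
    		reg_inited |= 1u << (reg + n);
    }

    static int parse_load(unsigned int reg, unsigned int len)
    {
    	unsigned int next = reg + (len + REG32_SIZE - 1) / REG32_SIZE;

    	for (unsigned int i = reg; i < next; i++)
    		if (!(reg_inited & (1u << i)))
    			return -ENODATA;	/* uninitialised register */
    	return 0;
    }

    int main(void)
    {
    	saw_store(8, 4);			/* one 32-bit store to reg 8 */
    	printf("load reg8 len4: %d\n", parse_load(8, 4));	/* 0 */
    	printf("load reg8 len8: %d\n", parse_load(8, 8));	/* -ENODATA */
    	return 0;
    }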
nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) -{ - struct nft_rule *rule, *nr; - - if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); - ctx->chain->use--; - nf_tables_rule_release(ctx, rule); - } - nft_chain_del(ctx->chain); - ctx->table->use--; - nf_tables_chain_destroy(ctx); - - return 0; -} -EXPORT_SYMBOL_GPL(__nft_release_basechain); - static void __nft_release_hook(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable; @@ -10081,7 +11982,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } @@ -10113,35 +12015,40 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { + if (nft_chain_binding(chain)) + continue; + ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); - chain->use--; + nft_use_dec(&chain->use); nf_tables_rule_release(&ctx, rule); } } list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { list_del(&flowtable->list); - table->use--; + nft_use_dec(&table->use); nf_tables_flowtable_destroy(flowtable); } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); - table->use--; + nft_use_dec(&table->use); + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); - table->use--; + nft_use_dec(&table->use); nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; nft_chain_del(chain); - table->use--; - nf_tables_chain_destroy(&ctx); + nft_use_dec(&table->use); + nf_tables_chain_destroy(chain); } - nf_tables_table_destroy(&ctx); + nf_tables_table_destroy(table); } static void __nft_release_tables(struct net *net) @@ -10168,6 +12075,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, struct net *net = n->net; unsigned int deleted; bool restart = false; + unsigned int gc_seq; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; @@ -10175,12 +12083,18 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, nft_net = nft_pernet(net); deleted = 0; mutex_lock(&nft_net->commit_mutex); - if (!list_empty(&nf_tables_destroy_list)) - rcu_barrier(); + + gc_seq = nft_gc_seq_begin(nft_net); + + nf_tables_trans_destroy_flush_work(net); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { + if (table->flags & NFT_TABLE_F_PERSIST) { + table->flags &= ~NFT_TABLE_F_OWNER; + continue; + } __nft_release_hook(net, table); list_del_rcu(&table->list); to_delete[deleted++] = table; @@ -10197,6 +12111,8 @@ again: if (restart) goto again; } + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; @@ -10212,11 +12128,16 @@ static int __net_init 
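The plain chain->use++/-- updates replaced throughout this file become nft_use_inc()/nft_use_dec(), which refuse to wrap rather than silently overflowing, so hitting the cap surfaces as -EMFILE instead of a corrupted count. A sketch of the pattern; the cap below is an assumption for illustration, not the kernel's constant:

    #include <stdbool.h>
    #include <stdint.h>

    #define USE_MAX UINT32_MAX      /* assumed cap, not the kernel value */

    static bool use_inc(uint32_t *use)
    {
            if (*use == USE_MAX)
                    return false;   /* caller turns this into -EMFILE */
            (*use)++;
            return true;
    }

    static void use_dec(uint32_t *use)
    {
            if (*use == 0)
                    return;         /* underflow would be a refcount bug */
            (*use)--;
    }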
nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->destroy_list); + INIT_LIST_HEAD(&nft_net->commit_set_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; + nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; + INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); return 0; } @@ -10233,22 +12154,41 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); + unsigned int gc_seq; mutex_lock(&nft_net->commit_mutex); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + + gc_seq = nft_gc_seq_begin(nft_net); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); + + cancel_work_sync(&nft_net->destroy_work); __nft_release_tables(net); + + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->destroy_list)); +} + +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); } static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -10257,6 +12197,14 @@ static int __init nf_tables_module_init(void) { int err; + BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0); + err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; @@ -10320,7 +12268,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); - cancel_work_sync(&trans_destroy_work); + cancel_work_sync(&trans_gc_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); @@ -10331,4 +12279,5 @@ module_exit(nf_tables_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Framework for packet filtering and classification"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 709a736c301c..6557a4018c09 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,39 +21,54 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> -static noinline void __nft_trace_packet(struct nft_traceinfo *info, - const struct nft_chain *chain, +#ifdef CONFIG_MITIGATION_RETPOLINE +static struct 
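The BUILD_BUG_ON() checks added to nf_tables_module_init() pin down a layout assumption: with the generic transaction header at offset zero, a struct nft_trans pointer can be cast straight to the specialized transaction type. A reduced illustration with stand-in types:

    #include <assert.h>
    #include <stddef.h>

    struct trans { int msg_type; };

    struct trans_rule {
            struct trans t;         /* must stay the first member */
            void *rule;
    };

    /* compile-time equivalent of the BUILD_BUG_ON() checks */
    static_assert(offsetof(struct trans_rule, t) == 0,
                  "generic transaction header must sit at offset zero");

    static inline struct trans_rule *to_trans_rule(struct trans *t)
    {
            return (struct trans_rule *)t;  /* valid only because offset is 0 */
    }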
static_key_false nf_tables_skip_direct_calls; + +static inline bool nf_skip_indirect_calls(void) +{ + return static_branch_likely(&nf_tables_skip_direct_calls); +} + +static inline void __init nf_skip_indirect_calls_enable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&nf_tables_skip_direct_calls); +} +#else +static inline void nf_skip_indirect_calls_enable(void) { } +#endif /* CONFIG_MITIGATION_RETPOLINE */ + +static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, + struct nft_traceinfo *info, enum nft_trace_types type) { if (!info->trace || !info->nf_trace) return; - info->chain = chain; info->type = type; - nft_trace_notify(info); + nft_trace_notify(pkt, verdict, rule, info); } static inline void nft_trace_packet(const struct nft_pktinfo *pkt, + struct nft_verdict *verdict, struct nft_traceinfo *info, - const struct nft_chain *chain, const struct nft_rule_dp *rule, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { info->nf_trace = pkt->skb->nf_trace; - info->rule = rule; - __nft_trace_packet(info, chain, type); + __nft_trace_packet(pkt, verdict, rule, info, type); } } static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt, struct nft_traceinfo *info) { - if (static_branch_unlikely(&nft_trace_enabled)) { - if (info->trace) - info->nf_trace = pkt->skb->nf_trace; - } + if (static_branch_unlikely(&nft_trace_enabled)) + info->nf_trace = pkt->skb->nf_trace; } static void nft_bitwise_fast_eval(const struct nft_expr *expr, @@ -90,13 +105,14 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } -static noinline void __nft_trace_verdict(struct nft_traceinfo *info, - const struct nft_chain *chain, +static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info, + const struct nft_rule_dp *rule, const struct nft_regs *regs) { enum nft_trace_types type; - switch (regs->verdict.code) { + switch (regs->verdict.code & NF_VERDICT_MASK) { case NFT_CONTINUE: case NFT_RETURN: type = NFT_TRACETYPE_RETURN; @@ -109,22 +125,20 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, type = NFT_TRACETYPE_RULE; if (info->trace) - info->nf_trace = info->pkt->skb->nf_trace; + info->nf_trace = pkt->skb->nf_trace; break; } - __nft_trace_packet(info, chain, type); + __nft_trace_packet(pkt, ®s->verdict, rule, info, type); } -static inline void nft_trace_verdict(struct nft_traceinfo *info, - const struct nft_chain *chain, +static inline void nft_trace_verdict(const struct nft_pktinfo *pkt, + struct nft_traceinfo *info, const struct nft_rule_dp *rule, const struct nft_regs *regs) { - if (static_branch_unlikely(&nft_trace_enabled)) { - info->rule = rule; - __nft_trace_verdict(info, chain, regs); - } + if (static_branch_unlikely(&nft_trace_enabled)) + __nft_trace_verdict(pkt, info, rule, regs); } static bool nft_payload_fast_eval(const struct nft_expr *expr, @@ -141,7 +155,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) return false; - ptr = skb_network_header(skb) + nft_thoff(pkt); + ptr = skb->data + nft_thoff(pkt); } ptr += priv->offset; @@ -183,17 +197,20 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, } struct nft_jumpstack { - const struct nft_chain *chain; const struct nft_rule_dp *rule; - const struct nft_rule_dp *last_rule; }; static void 
expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { -#ifdef CONFIG_RETPOLINE - unsigned long e = (unsigned long)expr->ops->eval; +#ifdef CONFIG_MITIGATION_RETPOLINE + unsigned long e; + + if (nf_skip_indirect_calls()) + goto indirect_call; + + e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) @@ -203,21 +220,26 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); +#if IS_ENABLED(CONFIG_NFT_CT) + X(e, nft_ct_get_fast_eval); +#endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); + X(e, nft_objref_eval); + X(e, nft_objref_map_eval); #undef X -#endif /* CONFIG_RETPOLINE */ +indirect_call: +#endif /* CONFIG_MITIGATION_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } #define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0] #define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size #define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen] -#define nft_rule_next(rule) (void *)rule + sizeof(*rule) + rule->dlen #define nft_rule_dp_for_each_expr(expr, last, rule) \ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \ @@ -228,10 +250,10 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; - const struct nft_rule_dp *rule, *last_rule; const struct net *net = nft_net(pkt); const struct nft_expr *expr, *last; - struct nft_regs regs = {}; + const struct nft_rule_dp *rule; + struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); @@ -240,7 +262,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) info.trace = false; if (static_branch_unlikely(&nft_trace_enabled)) - nft_trace_init(&info, pkt, ®s.verdict, basechain); + nft_trace_init(&info, pkt, basechain); do_chain: if (genbit) blob = rcu_dereference(chain->blob_gen_1); @@ -248,10 +270,9 @@ do_chain: blob = rcu_dereference(chain->blob_gen_0); rule = (struct nft_rule_dp *)blob->data; - last_rule = (void *)blob->data + blob->size; next_rule: regs.verdict.code = NFT_CONTINUE; - for (; rule < last_rule; rule = nft_rule_next(rule)) { + for (; !rule->is_last ; rule = nft_rule_next(rule)) { nft_rule_dp_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); @@ -273,30 +294,29 @@ next_rule: nft_trace_copy_nftrace(pkt, &info); continue; case NFT_CONTINUE: - nft_trace_packet(pkt, &info, chain, rule, + nft_trace_packet(pkt, ®s.verdict, &info, rule, NFT_TRACETYPE_RULE); continue; } break; } - nft_trace_verdict(&info, chain, rule, ®s); + nft_trace_verdict(pkt, &info, rule, ®s); switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: - case NF_DROP: case NF_QUEUE: case NF_STOLEN: return regs.verdict.code; + case NF_DROP: + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); } switch (regs.verdict.code) { case NFT_JUMP: if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) return NF_DROP; - jumpstack[stackptr].chain = chain; jumpstack[stackptr].rule = nft_rule_next(rule); - jumpstack[stackptr].last_rule = last_rule; stackptr++; fallthrough; case NFT_GOTO: @@ -311,17 +331,18 @@ next_rule: if (stackptr > 0) { stackptr--; - chain = jumpstack[stackptr].chain; rule = 
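expr_call_ops_eval() sidesteps retpoline-penalized indirect calls: it compares ops->eval against the built-in evaluators and calls a match directly, while anything unknown (a module expression, say) still takes the indirect path, and nf_skip_indirect_calls() bypasses the comparisons entirely on CPUs without retpolines. The shape of the trick, reduced to plain C:

    typedef void (*eval_fn)(int pkt);

    static void eval_payload(int pkt) { (void)pkt; }
    static void eval_cmp(int pkt)     { (void)pkt; }

    static void call_eval(eval_fn fn, int pkt)
    {
    #define X(f) do { if (fn == (f)) { (f)(pkt); return; } } while (0)
            X(eval_payload);        /* direct, branch-predictable call */
            X(eval_cmp);
    #undef X
            fn(pkt);                /* indirect fallback */
    }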
jumpstack[stackptr].rule; - last_rule = jumpstack[stackptr].last_rule; goto next_rule; } - nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY); + nft_trace_packet(pkt, ®s.verdict, &info, NULL, NFT_TRACETYPE_POLICY); if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); + if (nft_base_chain(basechain)->policy == NF_DROP) + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); + return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); @@ -369,6 +390,8 @@ int __init nf_tables_core_module_init(void) goto err; } + nf_skip_indirect_calls_enable(); + return 0; err: diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 910ef881c3b8..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -35,12 +35,12 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } @@ -59,7 +59,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, .mask = match->mask.basic.n_proto, }; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; @@ -70,8 +70,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); + } else if (match->dissector.used_keys & + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; @@ -80,7 +81,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } @@ -219,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -226,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, 
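The jump stack shrinks to a single saved rule pointer because rule blobs now end in a pseudo-rule with is_last set: the evaluation loop needs no separate last_rule bound, and the owning chain can be recovered from the trailer (see nft_trace_get_chain() further down). A sketch of such a loop, assuming fixed-size rules for brevity (real rules are variable length and advance via nft_rule_next()):

    struct rule_dp {
            unsigned int is_last:1;
            unsigned int handle:31;
    };

    static void walk(const struct rule_dp *rule)
    {
            /* no explicit end pointer: the trailer terminates the loop */
            for (; !rule->is_last; rule++) {
                    /* evaluate the rule's expressions here */
            }
    }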
&hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -454,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -512,38 +520,38 @@ static void nft_flow_rule_offload_abort(struct net *net, int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); @@ -563,46 +571,46 @@ int nft_flow_rule_offload_commit(struct net *net) u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = 
nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) { + if (trans->flags & NLM_F_REPLACE || + !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; @@ -637,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 1163ba9c1401..a88abae5a9de 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -90,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + /* Kernel implementation detail, withhold this from userspace for now */ + status &= ~IPS_NAT_CLASH; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -124,9 +171,11 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, } static int nf_trace_fill_rule_info(struct sk_buff *nlskb, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, const struct nft_traceinfo *info) { - if (!info->rule || info->rule->is_last) + if (!rule || 
rule->is_last) return 0; /* a continue verdict with ->type == RETURN means that this is @@ -135,15 +184,16 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, * Since no rule matched, the ->rule pointer is invalid. */ if (info->type == NFT_TRACETYPE_RETURN && - info->verdict->code == NFT_CONTINUE) + verdict->code == NFT_CONTINUE) return 0; return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, - cpu_to_be64(info->rule->handle), + cpu_to_be64(rule->handle), NFTA_TRACE_PAD); } -static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) +static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict, + struct nft_traceinfo *info) { switch (info->type) { case NFT_TRACETYPE_RETURN: @@ -153,7 +203,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) return false; } - switch (info->verdict->code) { + switch (verdict->code) { case NFT_JUMP: case NFT_GOTO: break; @@ -164,9 +214,31 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) return true; } -void nft_trace_notify(struct nft_traceinfo *info) +static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule, + const struct nft_traceinfo *info) +{ + const struct nft_rule_dp_last *last; + + if (!rule) + return &info->basechain->chain; + + while (!rule->is_last) + rule = nft_rule_next(rule); + + last = (const struct nft_rule_dp_last *)rule; + + if (WARN_ON_ONCE(!last->chain)) + return &info->basechain->chain; + + return last->chain; +} + +void nft_trace_notify(const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, + struct nft_traceinfo *info) { - const struct nft_pktinfo *pkt = info->pkt; + const struct nft_chain *chain; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; @@ -176,14 +248,20 @@ void nft_trace_notify(struct nft_traceinfo *info) if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) return; + chain = nft_trace_get_chain(rule, info); + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + - nla_total_size(strlen(info->chain->table->name)) + - nla_total_size(strlen(info->chain->name)) + + nla_total_size(strlen(chain->table->name)) + + nla_total_size(strlen(chain->name)) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -195,8 +273,8 @@ void nft_trace_notify(struct nft_traceinfo *info) nla_total_size(sizeof(u32)) + /* nfproto */ nla_total_size(sizeof(u32)); /* policy */ - if (nft_trace_have_verdict_chain(info)) - size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */ + if (nft_trace_have_verdict_chain(verdict, info)) + size += nla_total_size(strlen(verdict->chain->name)); /* jump target */ skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) @@ -217,13 +295,13 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid)) goto nla_put_failure; - if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) + if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name)) goto nla_put_failure; 
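nft_trace_get_chain() above leans on that same trailer: the is_last pseudo-rule is really a larger struct (nft_rule_dp_last) carrying a chain pointer, so the owning chain can be found from any rule by walking forward. A sketch, again with fixed-size rules and a string standing in for the chain pointer:

    struct rule_dp { unsigned int is_last:1; };

    struct rule_dp_last {
            struct rule_dp end;     /* end.is_last == 1 */
            const char *chain;      /* stashed behind the terminator */
    };

    static const char *chain_of(const struct rule_dp *rule)
    {
            while (!rule->is_last)
                    rule++;         /* walk to the blob trailer */

            return ((const struct rule_dp_last *)rule)->chain;
    }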
- if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name)) + if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name)) goto nla_put_failure; - if (nf_trace_fill_rule_info(skb, info)) + if (nf_trace_fill_rule_info(skb, verdict, rule, info)) goto nla_put_failure; switch (info->type) { @@ -231,17 +309,21 @@ void nft_trace_notify(struct nft_traceinfo *info) case __NFT_TRACETYPE_MAX: break; case NFT_TRACETYPE_RETURN: - case NFT_TRACETYPE_RULE: - if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) + case NFT_TRACETYPE_RULE: { + unsigned int v; + + if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict)) goto nla_put_failure; /* pkt->skb undefined iff NF_STOLEN, disable dump */ - if (info->verdict->code == NF_STOLEN) + v = verdict->code & NF_VERDICT_MASK; + if (v == NF_STOLEN) info->packet_dumped = true; else mark = pkt->skb->mark; break; + } case NFT_TRACETYPE_POLICY: mark = pkt->skb->mark; @@ -260,6 +342,10 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } @@ -273,7 +359,6 @@ void nft_trace_notify(struct nft_traceinfo *info) } void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, - const struct nft_verdict *verdict, const struct nft_chain *chain) { static siphash_key_t trace_key __read_mostly; @@ -283,13 +368,11 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, info->trace = true; info->nf_trace = pkt->skb->nf_trace; info->packet_dumped = false; - info->pkt = pkt; - info->verdict = verdict; net_get_random_once(&trace_key, sizeof(trace_key)); info->skbid = (u32)siphash_3u32(hash32_ptr(skb), - skb_get_hash(skb), + skb_get_hash_net(nft_net(pkt), skb), skb->skb_iif, &trace_key); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 6d18fb346868..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -29,6 +29,7 @@ #include <net/netlink.h> #include <net/netns/generic.h> +#include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -375,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -385,6 +387,7 @@ replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); @@ -401,31 +404,36 @@ replay_abort: { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!try_module_get(ss->owner)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); - return kfree_skb(skb); + return consume_skb(skb); } nfnl_unlock(subsys_id); + if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); + } + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -511,7 +519,7 @@ replay_abort: err = 
nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, - ss->cb[cb_id].policy, NULL); + ss->cb[cb_id].policy, &extack); if (err < 0) goto ack; @@ -532,7 +540,8 @@ ack: * processed, this avoids that the same error is * reported several times when replaying the batch. */ - if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) { + if (err == -ENOMEM || + nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. @@ -560,7 +569,7 @@ done: if (status & NFNL_BATCH_REPLAY) { ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { @@ -571,6 +580,9 @@ done: } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } else if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); } } else { enum nfnl_abort_action abort_action; @@ -583,17 +595,15 @@ done: err = ss->abort(net, oskb, abort_action); if (err == -EAGAIN) { nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); status |= NFNL_BATCH_FAILURE; goto replay_abort; } } - if (ss->cleanup) - ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); } @@ -685,12 +695,12 @@ static void nfnetlink_bind_event(struct net *net, unsigned int group) group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); - v = READ_ONCE(net->ct.ctnetlink_has_listener); + v = READ_ONCE(nf_ctnetlink_has_listener); if ((v & group_bit) == 0) { v |= group_bit; /* read concurrently without nfnl_grp_active_lock held. */ - WRITE_ONCE(net->ct.ctnetlink_has_listener, v); + WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); @@ -744,12 +754,12 @@ static void nfnetlink_unbind(struct net *net, int group) spin_lock(&nfnl_grp_active_lock); if (!nfnetlink_has_listeners(net, group)) { - u8 v = READ_ONCE(net->ct.ctnetlink_has_listener); + u8 v = READ_ONCE(nf_ctnetlink_has_listener); v &= ~group_bit; /* read concurrently without nfnl_grp_active_lock held. 
*/ - WRITE_ONCE(net->ct.ctnetlink_has_listener, v); + WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index f466af4f8531..38d75484e531 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -366,8 +366,7 @@ static int cttimeout_default_set(struct sk_buff *skb, __u8 l4num; int ret; - if (!cda[CTA_TIMEOUT_L3PROTO] || - !cda[CTA_TIMEOUT_L4PROTO] || + if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; @@ -462,11 +461,6 @@ static int cttimeout_default_get(struct sk_buff *skb, case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; - case IPPROTO_DCCP: -#ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(info->net)->dccp_timeout; -#endif - break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 8120aadf6a0f..92d869317cba 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -5,6 +5,7 @@ * Author: Florian Westphal <fw@strlen.de> */ +#include <linux/bpf.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/kernel.h> @@ -57,60 +58,136 @@ struct nfnl_dump_hook_data { u8 hook; }; -static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, - const struct nfnl_dump_hook_data *ctx, - unsigned int seq, - const struct nf_hook_ops *ops) +static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t) { - struct net *net = sock_net(nlskb->sk); - struct nlattr *nest, *nest2; - struct nft_chain *chain; - int ret = 0; + struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + int ret; - if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES) - return 0; + if (!nest) + return NULL; - chain = ops->priv; - if (WARN_ON_ONCE(!chain)) + ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t)); + if (ret == 0) + return nest; + + nla_nest_cancel(nlskb, nest); + return NULL; +} + +static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + const struct bpf_prog *prog) +{ + struct nlattr *nest, *nest2; + int ret; + + if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)) return 0; - if (!nft_is_active(net, chain)) + if (WARN_ON_ONCE(!prog)) return 0; - nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF); if (!nest) return -EMSGSIZE; - ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, - htonl(NFNL_HOOK_TYPE_NFTABLES)); - if (ret) - goto cancel_nest; - nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); if (!nest2) goto cancel_nest; - ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name); - if (ret) - goto cancel_nest; - - ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name); - if (ret) - goto cancel_nest; - - ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family); + ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id)); if (ret) goto cancel_nest; nla_nest_end(nlskb, nest2); nla_nest_end(nlskb, nest); - return ret; + return 0; cancel_nest: nla_nest_cancel(nlskb, nest); return -EMSGSIZE; } +static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname, + const char *name, u8 family) +{ + struct nlattr *nest; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest || + nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) || + 
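The switch from net->ct.ctnetlink_has_listener to the global nf_ctnetlink_has_listener keeps the same publication scheme: writers serialize on nfnl_grp_active_lock, while the packet path reads the bitmask locklessly because it is only a best-effort hint. The pattern in a userspace sketch, with a mutex standing in for the spinlock and a C11 atomic for READ_ONCE/WRITE_ONCE:

    #include <pthread.h>
    #include <stdint.h>

    static pthread_mutex_t grp_lock = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic uint8_t has_listener;

    static void bind_group(unsigned int group)
    {
            uint8_t bit = 1u << group;

            pthread_mutex_lock(&grp_lock);
            if (!(has_listener & bit))
                    has_listener |= bit;    /* writers update under the lock */
            pthread_mutex_unlock(&grp_lock);
    }

    /* hot path: lockless hint, may be momentarily stale */
    static int maybe_has_listener(unsigned int group)
    {
            return has_listener & (1u << group);
    }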
nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) || + nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) { + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; + } + nla_nest_end(nlskb, nest); + return 0; +} + +static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nft_chain *chain) +{ + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; + + if (WARN_ON_ONCE(!chain)) + return 0; + + if (!nft_is_active(net, chain)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name, + chain->name, chain->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; +} + +static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nf_flowtable *nf_ft) +{ + struct nft_flowtable *ft = + container_of(nf_ft, struct nft_flowtable, data); + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; + + if (WARN_ON_ONCE(!nf_ft)) + return 0; + + if (!nft_is_active(net, ft)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name, + ft->name, ft->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; +} + static int nfnl_hook_dump_one(struct sk_buff *nlskb, const struct nfnl_dump_hook_data *ctx, const struct nf_hook_ops *ops, @@ -171,7 +248,23 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, if (ret) goto nla_put_failure; - ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops); + switch (ops->hook_ops_type) { + case NF_HOOK_OP_NF_TABLES: + ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_BPF: + ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_NFT_FT: + ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv); + break; + case NF_HOOK_OP_UNDEFINED: + break; + default: + WARN_ON_ONCE(1); + break; + } + if (ret) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d97eb280cb2e..bfcb9cd335bf 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -103,9 +103,9 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num) } static struct nfulnl_instance * -__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) +__instance_lookup(const struct nfnl_log_net *log, u16 group_num) { - struct hlist_head *head; + const struct hlist_head *head; struct nfulnl_instance *inst; head = &log->instance_table[instance_hashfn(group_num)]; @@ -123,15 +123,25 @@ instance_get(struct nfulnl_instance *inst) } static struct nfulnl_instance * -instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) +instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num) { struct nfulnl_instance *inst; - rcu_read_lock_bh(); inst = __instance_lookup(log, group_num); if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; - rcu_read_unlock_bh(); + + return inst; +} + +static struct nfulnl_instance * +instance_lookup_get(const struct nfnl_log_net *log, u16 group_num) +{ + struct nfulnl_instance *inst; + + rcu_read_lock(); + inst = instance_lookup_get_rcu(log, group_num); + rcu_read_unlock(); return inst; } @@ 
-371,7 +381,7 @@ static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ - if (del_timer(&inst->timer)) + if (timer_delete(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); @@ -380,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst) static void nfulnl_timer(struct timer_list *t) { - struct nfulnl_instance *inst = from_timer(inst, t, timer); + struct nfulnl_instance *inst = timer_container_of(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) @@ -460,7 +470,6 @@ __build_packet_message(struct nfnl_log_net *log, sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - ktime_t tstamp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), @@ -499,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { - struct net_device *physindev; + int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -507,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log, htonl(indev->ifindex))) goto nla_put_failure; - physindev = nf_bridge_get_physindev(skb); - if (physindev && + physinif = nf_bridge_get_physinif(skb); + if (physinif && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(physindev->ifindex))) + htonl(physinif))) goto nla_put_failure; } #endif @@ -589,10 +598,9 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - tstamp = skb_tstamp_cond(skb, false); - if (hooknum <= NF_INET_FORWARD && tstamp) { + if (hooknum <= NF_INET_FORWARD) { + struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -690,15 +698,15 @@ nfulnl_log_packet(struct net *net, unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; + enum ip_conntrack_info ctinfo = 0; struct nf_conn *ct = NULL; - enum ip_conntrack_info ctinfo; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; else li = &default_loginfo; - inst = instance_lookup_get(log, li->u.ulog.group); + inst = instance_lookup_get_rcu(log, li->u.ulog.group); if (!inst) return; @@ -1030,7 +1038,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st) struct hlist_head *head = &log->instance_table[st->bucket]; if (!hlist_empty(head)) - return rcu_dereference_bh(hlist_first_rcu(head)); + return rcu_dereference(hlist_first_rcu(head)); } return NULL; } @@ -1038,7 +1046,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st) static struct hlist_node *get_next(struct net *net, struct iter_state *st, struct hlist_node *h) { - h = rcu_dereference_bh(hlist_next_rcu(h)); + h = rcu_dereference(hlist_next_rcu(h)); while (!h) { struct nfnl_log_net *log; struct hlist_head *head; @@ -1048,7 +1056,7 @@ static struct hlist_node *get_next(struct net *net, struct iter_state *st, log = nfnl_log_pernet(net); head = &log->instance_table[st->bucket]; - h = rcu_dereference_bh(hlist_first_rcu(head)); + h = rcu_dereference(hlist_first_rcu(head)); } return h; } @@ -1066,9 +1074,9 @@ static struct hlist_node *get_idx(struct net *net, struct iter_state *st, } static void *seq_start(struct seq_file *s, loff_t *pos) - __acquires(rcu_bh) + __acquires(rcu) { - rcu_read_lock_bh(); + 
rcu_read_lock(); return get_idx(seq_file_net(s), s->private, *pos); } @@ -1079,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(rcu_bh) + __releases(rcu) { - rcu_read_unlock_bh(); + rcu_read_unlock(); } static int seq_show(struct seq_file *s, void *v) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ee6840bd5933..c0fc431991e8 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -315,6 +315,14 @@ static int nfnl_osf_add_callback(struct sk_buff *skb, f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + if (f->opt_num > ARRAY_SIZE(f->opt)) + return -EINVAL; + + if (!memchr(f->genre, 0, MAXGENRELEN) || + !memchr(f->subtype, 0, MAXGENRELEN) || + !memchr(f->version, 0, MAXGENRELEN)) + return -EINVAL; + kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; @@ -439,3 +447,5 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Passive OS fingerprint matching"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 87a9009d5234..8b7b39d8a109 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -29,6 +29,8 @@ #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> +#include <linux/cgroup-defs.h> +#include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> @@ -167,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } @@ -223,22 +227,174 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) return entry; } +static unsigned int nf_iterate(struct sk_buff *skb, + struct nf_hook_state *state, + const struct nf_hook_entries *hooks, + unsigned int *index) +{ + const struct nf_hook_entry *hook; + unsigned int verdict, i = *index; + + while (i < hooks->num_hook_entries) { + hook = &hooks->hooks[i]; +repeat: + verdict = nf_hook_entry_hookfn(hook, skb, state); + if (verdict != NF_ACCEPT) { + *index = i; + if (verdict != NF_REPEAT) + return verdict; + goto repeat; + } + i++; + } + + *index = i; + return NF_ACCEPT; +} + +static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) +{ + switch (pf) { +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + case NFPROTO_BRIDGE: + return rcu_dereference(net->nf.hooks_bridge[hooknum]); +#endif + case NFPROTO_IPV4: + return rcu_dereference(net->nf.hooks_ipv4[hooknum]); + case NFPROTO_IPV6: + return rcu_dereference(net->nf.hooks_ipv6[hooknum]); + default: + WARN_ON_ONCE(1); + return NULL; + } + + return NULL; +} + +static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +{ +#ifdef CONFIG_INET + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) + return ip_route_me_harder(entry->state.net, entry->state.sk, + skb, RTN_UNSPEC); + } +#endif + return 0; +} + +static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) +{ + const struct nf_ipv6_ops 
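The checks added to nfnl_osf_add_callback() treat the fingerprint from userspace as untrusted input: opt_num is bounded by the option array and each fixed-width string must contain a NUL before any string helper ever touches it. A reduced model of that validation (field sizes here are placeholders):

    #include <string.h>

    #define GENRELEN 32             /* stands in for MAXGENRELEN */
    #define OPTSLOTS 16             /* stands in for ARRAY_SIZE(f->opt) */

    struct finger {
            unsigned int opt_num;
            char genre[GENRELEN];
            char version[GENRELEN];
            int opt[OPTSLOTS];
    };

    static int finger_valid(const struct finger *f)
    {
            if (f->opt_num > OPTSLOTS)
                    return 0;       /* would index past f->opt */
            if (!memchr(f->genre, '\0', sizeof(f->genre)) ||
                !memchr(f->version, '\0', sizeof(f->version)))
                    return 0;       /* unterminated string */
            return 1;
    }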
*v6ops; + int ret = 0; + + switch (entry->state.pf) { + case AF_INET: + ret = nf_ip_reroute(skb, entry); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + ret = v6ops->reroute(skb, entry); + break; + } + return ret; +} + +/* caller must hold rcu read-side lock */ +static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + const struct nf_hook_entry *hook_entry; + const struct nf_hook_entries *hooks; + struct sk_buff *skb = entry->skb; + const struct net *net; + unsigned int i; + int err; + u8 pf; + + net = entry->state.net; + pf = entry->state.pf; + + hooks = nf_hook_entries_head(net, pf, entry->state.hook); + + i = entry->hook_index; + if (!hooks || i >= hooks->num_hook_entries) { + kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); + nf_queue_entry_free(entry); + return; + } + + hook_entry = &hooks->hooks[i]; + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) + verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); + + if (verdict == NF_ACCEPT) { + if (nf_reroute(skb, entry) < 0) + verdict = NF_DROP; + } + + if (verdict == NF_ACCEPT) { +next_hook: + ++i; + verdict = nf_iterate(skb, &entry->state, hooks, &i); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_STOP: + local_bh_disable(); + entry->state.okfn(entry->state.net, entry->state.sk, skb); + local_bh_enable(); + break; + case NF_QUEUE: + err = nf_queue(skb, &entry->state, i, verdict); + if (err == 1) + goto next_hook; + break; + case NF_STOLEN: + break; + default: + kfree_skb(skb); + } + + nf_queue_entry_free(entry); +} + static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; - int err; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { + unsigned int ct_verdict = verdict; + rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - if (ct_hook) { - err = ct_hook->update(entry->state.net, entry->skb); - if (err < 0) - verdict = NF_DROP; - } + if (ct_hook) + ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); + + switch (ct_verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + /* follow userspace verdict, could be REPEAT */ + break; + case NF_STOLEN: + nf_queue_entry_free(entry); + return; + default: + verdict = ct_verdict & NF_VERDICT_MASK; + break; + } } nf_reinject(entry, verdict); } @@ -301,18 +457,31 @@ nla_put_failure: return -1; } -static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) { - u32 seclen = 0; +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + if (sk && sk_fullsock(sk)) { + u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data); + + if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid))) + return -1; + } +#endif + return 0; +} + +static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) +{ + int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) - security_secid_to_secctx(skb->secmark, secdata, &seclen); - + seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; @@ -371,6 +540,14 @@ nla_put_failure: return -1; } +static int nf_queue_checksum_help(struct sk_buff *entskb) +{ + if (skb_csum_is_sctp(entskb)) + return skb_crc32c_csum_help(entskb); + + return skb_checksum_help(entskb); +} + static struct sk_buff * 
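Several switches in this file mask with NF_VERDICT_MASK before comparing, and the reason is worth spelling out: a verdict word carries payload in its upper bits (an errno for NF_DROP, a queue number for NF_QUEUE), so only the low byte names the verdict itself. Roughly:

    #define NF_VERDICT_MASK 0x000000ff      /* low byte selects the verdict */
    #define NF_DROP   0
    #define NF_ACCEPT 1
    #define NF_QUEUE  3

    static unsigned int verdict_of(unsigned int v)
    {
            return v & NF_VERDICT_MASK;     /* ignore errno/queue payload */
    }

    static int is_queue(unsigned int v)
    {
            return verdict_of(v) == NF_QUEUE;
    }

The same masking is why the trace code earlier in this diff switches on regs->verdict.code & NF_VERDICT_MASK rather than on the raw value.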
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -390,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; - char *secdata = NULL; - u32 seclen = 0; + struct lsm_context ctx = { NULL, 0, 0 }; + int seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) @@ -406,6 +583,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + + nla_total_size(sizeof(u_int32_t)) /* classid */ +#endif + nla_total_size(sizeof(u_int32_t)); /* cap_len */ tstamp = skb_tstamp_cond(entskb, false); @@ -430,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) + nf_queue_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); @@ -462,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { - seclen = nfqnl_get_sk_secctx(entskb, &secdata); + seclen = nfqnl_get_sk_secctx(entskb, &ctx); + if (seclen < 0) + return NULL; if (seclen) size += nla_total_size(seclen); } @@ -599,7 +781,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; - if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) @@ -627,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return skb; nla_put_failure: @@ -636,8 +821,8 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return NULL; } @@ -645,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + unsigned long status; + unsigned int use; - if (ct && ((ct->status & flags) == IPS_DYING)) + if (!ct) + return false; + + status = READ_ONCE(ct->status); + if ((status & flags) == IPS_DYING) return true; + + if (status & IPS_CONFIRMED) + return false; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * This can't be solved with serialization here because one clone could + * have been queued for local delivery. + */ + use = refcount_read(&ct->ct_general.use); + if (likely(use == 1)) + return false; + + /* Can't decrement further?
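refcount_dec_not_one() answers that question just below: it drops one reference unless the count is exactly one, so the queue path can release its share of a cloned, unconfirmed entry without ever being the one to free it. A userspace model of the decision, with C11 atomics standing in for refcount_t:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* true if the entry was shared and our reference was released */
    static bool shared_then_release(atomic_uint *use)
    {
            unsigned int v = atomic_load(use);

            while (v != 1) {
                    /* decrement unless that would reach zero (dec_not_one) */
                    if (atomic_compare_exchange_weak(use, &v, v - 1))
                            return true;
            }
            return false;   /* count is exactly 1: exclusive ownership */
    }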
Exclusive ownership. */ + if (!refcount_dec_not_one(&ct->ct_general.use)) + return false; + + skb_set_nfct(entry->skb, 0); + /* No nf_ct_put(): we already decremented .use and it cannot + * drop down to 0. + */ + return true; #endif return false; } @@ -808,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 84eae7cabc67..d550910aabec 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -17,6 +17,7 @@ struct nft_bitwise { u8 sreg; + u8 sreg2; u8 dreg; enum nft_bitwise_ops op:8; u8 len; @@ -25,8 +26,8 @@ struct nft_bitwise { struct nft_data data; }; -static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, - const struct nft_bitwise *priv) +static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) { unsigned int i; @@ -60,38 +61,82 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, } } +static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] & src2[i]; +} + +static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] | src2[i]; +} + +static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] ^ src2[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg]; + const u32 *src = ®s->data[priv->sreg], *src2; u32 *dst = ®s->data[priv->dreg]; - switch (priv->op) { - case NFT_BITWISE_BOOL: - nft_bitwise_eval_bool(dst, src, priv); - break; - case NFT_BITWISE_LSHIFT: + if (priv->op == NFT_BITWISE_MASK_XOR) { + nft_bitwise_eval_mask_xor(dst, src, priv); + return; + } + if (priv->op == NFT_BITWISE_LSHIFT) { nft_bitwise_eval_lshift(dst, src, priv); - break; - case NFT_BITWISE_RSHIFT: + return; + } + if (priv->op == NFT_BITWISE_RSHIFT) { nft_bitwise_eval_rshift(dst, src, priv); - break; + return; + } + + src2 = priv->sreg2 ? 
&regs->data[priv->sreg2] : priv->data.data; + + if (priv->op == NFT_BITWISE_AND) { + nft_bitwise_eval_and(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_OR) { + nft_bitwise_eval_or(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_XOR) { + nft_bitwise_eval_xor(dst, src, src2, priv); + return; } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_SREG2] = { .type = NLA_U32 }, [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, - [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init_bool(struct nft_bitwise *priv, - const struct nlattr *const tb[]) +static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { struct nft_data_desc mask = { .type = NFT_DATA_VALUE, @@ -105,7 +150,8 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, }; int err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, int err; if (tb[NFTA_BITWISE_MASK] || - tb[NFTA_BITWISE_XOR]) + tb[NFTA_BITWISE_XOR] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_DATA]) @@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, return 0; } +static int nft_bitwise_init_bool(const struct nft_ctx *ctx, + struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) || + (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2])) + return -EINVAL; + + if (tb[NFTA_BITWISE_DATA]) { + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = priv->len, + }; + + err = nft_data_init(NULL, &priv->data, &desc, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + } else { + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2], + &priv->sreg2, priv->len); + if (err < 0) + return err; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -171,7 +253,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, priv->len); if (err < 0) return err; @@ -185,32 +267,40 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { - case NFT_BITWISE_BOOL: + case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: break; default: return -EOPNOTSUPP; } } else { - priv->op = NFT_BITWISE_BOOL; + priv->op = NFT_BITWISE_MASK_XOR; } switch(priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_init_bool(priv, tb); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_init_mask_xor(priv, tb); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_init_shift(priv, tb); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case 
NFT_BITWISE_XOR: + err = nft_bitwise_init_bool(ctx, priv, tb); + break; } return err; } -static int nft_bitwise_dump_bool(struct sk_buff *skb, - const struct nft_bitwise *priv) +static int nft_bitwise_dump_mask_xor(struct sk_buff *skb, + const struct nft_bitwise *priv) { if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) @@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (priv->sreg2) { + if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2)) + return -1; + } else { + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + } + + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -248,13 +353,18 @@ static int nft_bitwise_dump, return -1; switch (priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_dump_bool(skb, priv); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_dump_mask_xor(skb, priv); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_dump_shift(skb, priv); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_dump_bool(skb, priv); + break; } return err; @@ -269,7 +379,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; - if (priv->op != NFT_BITWISE_BOOL) + if (priv->op != NFT_BITWISE_MASK_XOR) return -EOPNOTSUPP; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || @@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, track->regs[priv->dreg].bitwise && track->regs[priv->dreg].bitwise->ops == expr->ops && priv->sreg == bitwise->sreg && + priv->sreg2 == bitwise->sreg2 && priv->dreg == bitwise->dreg && priv->op == bitwise->op && priv->len == bitwise->len && @@ -323,7 +434,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, dreg = priv->dreg; regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE); for (i = 0; i < regcount; i++, dreg++) - track->regs[priv->dreg].bitwise = expr; + track->regs[dreg].bitwise = expr; return false; } @@ -365,7 +476,7 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); int err; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, sizeof(u32)); if (err < 0) return err; @@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -406,7 +518,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb, return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32)))) return -1; - if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL))) + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR))) return -1; data.data[0] = priv->mask; @@ -501,7 +613,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx, return &nft_bitwise_ops; if (tb[NFTA_BITWISE_OP] && - ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL) + ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR) return &nft_bitwise_ops; return &nft_bitwise_fast_ops;
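NFT_BITWISE_AND, NFT_BITWISE_OR and NFT_BITWISE_XOR thus take their second operand either from another register (NFTA_BITWISE_SREG2) or from an immediate (NFTA_BITWISE_DATA), and all three evaluate word-wise over 32-bit registers. A self-contained userspace sketch of that evaluation loop (illustrative only; the constants mimic the kernel definitions above):

#include <stdint.h>
#include <stdio.h>

#define NFT_REG32_SIZE		4
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* dst = src & src2 over len bytes, rounded up to whole 32-bit words,
 * modelled on nft_bitwise_eval_and() above. */
static void eval_and(uint32_t *dst, const uint32_t *src,
		     const uint32_t *src2, unsigned int len)
{
	unsigned int i, n;

	for (i = 0, n = DIV_ROUND_UP(len, sizeof(uint32_t)); i < n; i++)
		dst[i] = src[i] & src2[i];
}

int main(void)
{
	uint32_t src[2] = { 0xffff00ff, 0x12345678 };
	uint32_t imm[2] = { 0x0000ffff, 0xffffffff };	/* immediate operand */
	uint32_t dst[2];

	eval_and(dst, src, imm, 8);	/* len in bytes */
	printf("%08x %08x\n", dst[0], dst[1]);
	return 0;
}

diff --git a/net/netfilter/nft_byteorder.c 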
b/net/netfilter/nft_byteorder.c index b66647a5a171..af9206a3afd1 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -5,7 +5,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -30,21 +30,22 @@ void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = &regs->data[priv->sreg]; u32 *dst = &regs->data[priv->dreg]; - union { u32 u32; u16 u16; } *s, *d; + u16 *s16, *d16; unsigned int i; - s = (void *)src; - d = (void *)dst; + s16 = (void *)src; + d16 = (void *)dst; switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], + nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; @@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr, for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } @@ -62,11 +63,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = ntohl((__force __be32)s[i].u32); + dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = (__force __u32)htonl(s[i].u32); + dst[i] = (__force __u32)htonl(src[i]); break; } break; @@ -74,11 +75,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = ntohs((__force __be16)s[i].u16); + d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = (__force __u16)htons(s[i].u16); + d16[i] = (__force __u16)htons(s16[i]); break; } break; @@ -88,9 +89,9 @@ static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, @@ -138,7 +139,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c3563f0be269..b16185e9a6dd 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,33 +318,85 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - 
struct nft_hook *hook, *found = NULL; - int n = 0; - - if (event != NETDEV_UNREGISTER) - return; + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; + struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev == dev) - found = hook; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - n++; - } - if (!found) - return; - - if (n > 1) { - nf_unregister_net_hook(ctx->net, &found->ops); - list_del_rcu(&found->list); - kfree_rcu(found, rcu); - return; + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), ops); + + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + break; } + return 0; +} + +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nft_base_chain *basechain; + struct nftables_pernet *nft_net; + struct nft_chain *chain; + struct nft_table *table; + + nft_net = nft_pernet(dev_net(dev)); + list_for_each_entry(table, &nft_net->tables, list) { + if (table->family != NFPROTO_NETDEV && + table->family != NFPROTO_INET) + continue; - __nft_release_basechain(ctx); + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; + + basechain = nft_base_chain(chain); + if (table->family == NFPROTO_INET && + basechain->ops.hooknum != NF_INET_INGRESS) + continue; + + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; + } + } + return 0; } static int nf_tables_netdev_event(struct notifier_block *this, @@ -352,38 +404,28 @@ { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; - struct nft_table *table; - struct nft_chain *chain, *nr; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; + int ret = NOTIFY_DONE; - if (event != NETDEV_UNREGISTER && + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && event != NETDEV_CHANGENAME) return NOTIFY_DONE; - if (!check_net(ctx.net)) - return NOTIFY_DONE; - - nft_net = nft_pernet(ctx.net); + nft_net = nft_pernet(dev_net(dev)); mutex_lock(&nft_net->commit_mutex); - list_for_each_entry(table, &nft_net->tables, list) { - if (table->family != NFPROTO_NETDEV) - continue; - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; - - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; } +out_unlock: mutex_unlock(&nft_net->commit_mutex); - - return NOTIFY_DONE; + return ret; }
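A NETDEV_CHANGENAME is thus handled as a registration pass under the new name followed by an unregistration pass for hooks the name no longer matches, so wildcard hook specifications (hook->ifname/hook->ifnamelen) follow a device across renames. A simplified userspace model of nft_netdev_event()'s NOP rules (types and names invented for illustration, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum ev { EV_REGISTER, EV_UNREGISTER };

/* Mirror the "NOP if ..." tests above: has_ops says whether the hook
 * already carries nf_hook_ops for this device, name_matches whether the
 * (possibly new) device name matches the hook spec. */
static const char *hook_action(enum ev e, bool has_ops, bool name_matches,
			       bool changename)
{
	switch (e) {
	case EV_UNREGISTER:
		if (!has_ops || (changename && name_matches))
			return "nop";
		return "unhook";
	case EV_REGISTER:
		if (!name_matches || (changename && has_ops))
			return "nop";
		return "hook";
	}
	return "nop";
}

int main(void)
{
	/* device renamed to "lan0"; the hook spec is "lan*" */
	puts(hook_action(EV_REGISTER, false, true, true));	/* hook */
	puts(hook_action(EV_UNREGISTER, true, true, true));	/* nop */
	return 0;
}

static struct notifier_block 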
nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index 98e4946100c5..40e230d8b712 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -137,6 +137,7 @@ module_init(nft_chain_nat_init); module_exit(nft_chain_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nftables network address translation support"); #ifdef CONFIG_NF_TABLES_IPV4 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); #endif diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 6eb21a4f5698..2605f43737bc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -83,7 +83,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -162,7 +162,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, memcpy(key + reg->offset, data, reg->len); memcpy(mask + reg->offset, datamask, reg->len); - flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && @@ -222,7 +222,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -323,7 +323,7 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5284cd2ad532..72711d62fddf 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -135,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -200,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 l4proto; u32 flags; int err; @@ -212,16 +213,22 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); - if (flags & ~NFT_RULE_COMPAT_F_MASK) + if (flags & NFT_RULE_COMPAT_F_UNUSED || + flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; - *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + if (l4proto > U16_MAX) + return -EINVAL; + + *proto = l4proto; + return 0; } -static void nft_compat_wait_for_destructors(void) +static void nft_compat_wait_for_destructors(struct net *net) { /* xtables matches or targets can have side effects, e.g. * creation/destruction of /proc files. @@ -229,7 +236,7 @@ static void nft_compat_wait_for_destructors(void) * work queue. 
If we have pending invocations we thus * need to wait for those to finish. */ - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(net); } static int @@ -255,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { @@ -343,13 +350,28 @@ nla_put_failure: } static int nft_target_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -413,7 +435,7 @@ static void nft_match_eval(const struct nft_expr *expr, static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -493,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); return xt_check_match(&par, size, proto, inv); } @@ -513,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *m = expr->ops->data; int ret; - priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; @@ -588,13 +610,28 @@ static int nft_match_large_dump(struct sk_buff *skb, } static int nft_match_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + + ret = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING)); + if (ret) + return ret; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -712,7 +749,7 @@ out_put: static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, - [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; @@ -771,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), 
GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -861,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index de9d1980df69..657764774a2d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { - regs->verdict.code = NF_DROP; - return; + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } - count = priv->list->count; + count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -137,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + WRITE_ONCE(priv->limit, newpriv->limit); + WRITE_ONCE(priv->invert, newpriv->invert); +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -166,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { @@ -210,12 +215,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx, nft_connlimit_do_destroy(ctx, priv); } -static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); if (!priv_dst->list) return -ENOMEM; @@ -238,13 +243,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool 
ret; - - local_bh_disable(); - ret = nf_conncount_gc_list(net, priv->list); - local_bh_enable(); - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index dccc68a5135a..cc7325329496 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -17,6 +17,11 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_counter { + u64_stats_t bytes; + u64_stats_t packets; +}; + +struct nft_counter_tot { s64 bytes; s64 packets; }; @@ -25,25 +30,24 @@ struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; -static DEFINE_PER_CPU(seqcount_t, nft_counter_seq); +static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); - - write_seqcount_begin(myseq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - this_cpu->bytes += pkt->skb->len; - this_cpu->packets++; + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->bytes, pkt->skb->len); + u64_stats_inc(&this_cpu->packets); + u64_stats_update_end(nft_sync); - write_seqcount_end(myseq); local_bh_enable(); } @@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[], if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); + this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + u64_stats_set(&this_cpu->packets, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + u64_stats_set(&this_cpu->bytes, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } - preempt_enable(); + priv->counter = cpu_stats; return 0; } @@ -104,35 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx, } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - this_cpu->packets -= total->packets; - this_cpu->bytes -= total->bytes; + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, -total->packets); + u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { struct nft_counter *this_cpu; - const seqcount_t *myseq; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - myseq = per_cpu_ptr(&nft_counter_seq, cpu); + struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); + this_cpu = per_cpu_ptr(priv->counter, cpu); do { - seq = read_seqcount_begin(myseq); - 
bytes = this_cpu->bytes; - packets = this_cpu->packets; - } while (read_seqcount_retry(myseq, seq)); + seq = u64_stats_fetch_begin(nft_sync); + bytes = u64_stats_read(&this_cpu->bytes); + packets = u64_stats_read(&this_cpu->packets); + } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; @@ -143,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -226,25 +235,23 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC); + cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - this_cpu->packets = total.packets; - this_cpu->bytes = total.bytes; - preempt_enable(); + this_cpu = raw_cpu_ptr(cpu_stats); + u64_stats_set(&this_cpu->packets, total.packets); + u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; @@ -262,18 +269,18 @@ static void nft_counter_offload_stats(struct nft_expr *expr, const struct flow_stats *stats) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; - preempt_disable(); + local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - write_seqcount_begin(myseq); - this_cpu->packets += stats->pkts; - this_cpu->bytes += stats->bytes; - write_seqcount_end(myseq); - preempt_enable(); + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, stats->pkts); + u64_stats_add(&this_cpu->bytes, stats->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } void nft_counter_init_seqcount(void) @@ -281,7 +288,7 @@ void nft_counter_init_seqcount(void) int cpu; for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); + u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); } struct nft_expr_type nft_counter_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c68e2151defe..6f2ae7cad731 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -12,7 +12,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -22,16 +22,7 @@ #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> - -struct nft_ct { - enum nft_ct_keys key:8; - enum ip_conntrack_dir dir:8; - u8 len; - union { - u8 dreg; - u8 sreg; - }; -}; +#include <net/netfilter/nf_conntrack_seqadj.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; 
@@ -118,7 +109,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, helper = rcu_dereference(help->helper); if (helper == NULL) goto err; - strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); + strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { @@ -240,6 +231,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(&regs->data[priv->sreg]); struct nf_conn *ct; + int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ @@ -260,10 +252,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(refcount_read(&ct->ct_general.use) == 1)) { - refcount_inc(&ct->ct_general.use); + __refcount_inc(&ct->ct_general.use, &oldcnt); + if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { + refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ @@ -272,6 +265,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -342,7 +336,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, - [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; @@ -378,6 +372,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } @@ -385,6 +380,14 @@ } #endif +static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (priv->key == NFT_CT_LABELS) + nf_connlabels_put(ctx->net); +#endif +} + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -419,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; + + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; break; #endif case NFT_CT_HELPER: @@ -484,6 +491,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; #endif case NFT_CT_ID: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); break; default: @@ -497,7 +507,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err; } } @@ -505,11 +516,11 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) - return err; + goto err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) - return err; + goto err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || @@ -517,6 +528,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, nf_ct_set_acct(ctx->net, true); return 0; +err: + __nft_ct_get_destroy(ctx, priv); + return err; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) @@ -611,7 +625,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = 
nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; @@ -629,6 +643,9 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + __nft_ct_get_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } @@ -759,6 +776,18 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } +#ifdef CONFIG_MITIGATION_RETPOLINE +static const struct nft_expr_ops nft_ct_get_fast_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_fast_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_get_destroy, + .dump = nft_ct_get_dump, + .reduce = nft_ct_set_reduce, +}; +#endif + static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -791,8 +820,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); - if (tb[NFTA_CT_DREG]) + if (tb[NFTA_CT_DREG]) { +#ifdef CONFIG_MITIGATION_RETPOLINE + u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + + switch (k) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: + case NFT_CT_MARK: + case NFT_CT_SECMARK: + return &nft_ct_get_fast_ops; + } +#endif return &nft_ct_get_ops; + } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -909,7 +951,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, */ values = nf_ct_timeout_data(timeout); if (values) - nf_ct_refresh(ct, pkt->skb, values[0]); + nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, @@ -1151,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); + + if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct)) + if (!nfct_seqadj_ext_add(ct)) + regs->verdict.code = NF_DROP; } } @@ -1233,7 +1279,30 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + switch (priv->l3num) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) + break; + + return -EINVAL; + case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ + default: + return -EAFNOSUPPORT; + } + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + switch (priv->l4proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_DCCP: + case IPPROTO_SCTP: + break; + default: + return -EOPNOTSUPP; + } + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c new file mode 100644 index 000000000000..e684c8a91848 --- /dev/null +++ b/net/netfilter/nft_ct_fast.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +#if IS_ENABLED(CONFIG_NFT_CT) +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack.h> + +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + 
unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + + switch (priv->key) { + case NFT_CT_STATE: + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_INVALID_BIT; + *dest = state; + return; + default: + break; + } + + if (!ct) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_CT_DIRECTION: + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); + return; + case NFT_CT_STATUS: + *dest = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + *dest = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + *dest = ct->secmark; + return; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} +EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval); +#endif diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index e5739a59ebf1..0573f96ce079 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -40,7 +40,7 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 274579b1696e..7807d8129664 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, for (i = 0; i < priv->num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) return -1; elem_expr->size += priv->expr_array[i]->ops->size; @@ -44,33 +44,34 @@ return 0; } -static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, - struct nft_regs *regs) +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; + void *elem_priv; u64 timeout; - void *elem; if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; - timeout = priv->timeout ? : set->timeout; - elem = nft_set_elem_init(set, &priv->tmpl, - &regs->data[priv->sreg_key], NULL, - &regs->data[priv->sreg_data], - timeout, 0, GFP_ATOMIC); - if (IS_ERR(elem)) + timeout = priv->timeout ? : READ_ONCE(set->timeout); + elem_priv = nft_set_elem_init(set, &priv->tmpl, + &regs->data[priv->sreg_key], NULL, + &regs->data[priv->sreg_data], + timeout, 0, GFP_ATOMIC); + if (IS_ERR(elem_priv)) goto err1; - ext = nft_set_elem_ext(set, elem); + ext = nft_set_elem_ext(set, elem_priv); if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; - return elem; + return elem_priv; err2: - nft_set_elem_destroy(set, elem, false); + nft_set_elem_destroy(set, elem_priv, false); err1: if (set->size) atomic_dec(&set->nelems); @@ -90,12 +91,13 @@ void nft_dynset_eval(const struct nft_expr *expr, return; } - if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, - expr, regs, &ext)) { + ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs); + if (ext) { if (priv->op == NFT_DYNSET_OP_UPDATE && - nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - timeout = priv->timeout ? 
: set->timeout; - *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; + nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { + timeout = priv->timeout ? : READ_ONCE(set->timeout); + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout); } nft_set_elem_update_expr(ext, regs, pkt); @@ -148,7 +150,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, - [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, @@ -191,6 +193,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_OBJECT) + return -EOPNOTSUPP; + if (set->ops->update == NULL) return -EOPNOTSUPP; @@ -211,7 +216,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return err; } - err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, set->klen); if (err < 0) return err; @@ -222,7 +227,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA], + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA], &priv->sreg_data, set->dlen); if (err < 0) return err; @@ -276,10 +281,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->expr_array[i] = dynset_expr; priv->num_exprs++; - if (set->num_exprs && - dynset_expr->ops != set->exprs[i]->ops) { - err = -EOPNOTSUPP; - goto err_expr_free; + if (set->num_exprs) { + if (i >= set->num_exprs) { + err = -EINVAL; + goto err_expr_free; + } + if (dynset_expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_expr_free; + } } i++; } @@ -303,12 +313,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (priv->num_exprs) nft_dynset_ext_add_expr(priv); - if (set->flags & NFT_SET_TIMEOUT) { - if (timeout || set->timeout) { - nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); - nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); - } - } + if (set->flags & NFT_SET_TIMEOUT && + (timeout || READ_ONCE(set->timeout))) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); priv->timeout = timeout; @@ -342,7 +349,7 @@ static void nft_dynset_activate(const struct nft_ctx *ctx, { struct nft_dynset *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_dynset_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a54a7f772cec..7eedf4e3ae9c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -5,11 +5,12 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/dccp.h> #include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -34,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset) return opt[offset + 1]; } +static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, 
unsigned int len) +{ + if (len % NFT_REG32_SIZE) + dest[len / NFT_REG32_SIZE] = 0; + + return skb_copy_bits(skb, offset, dest, len); +} + static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -55,8 +64,7 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -77,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; - unsigned int start; bool found = false; __be32 info; int optlen; @@ -85,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; - start = sizeof(struct iphdr); optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) @@ -95,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, /* Copy the options since __ip_options_compile() modifies * the options. */ - if (skb_copy_bits(skb, start, opt->__data, optlen)) + if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; @@ -110,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, found = target == IPOPT_SSRR ? opt->is_strictroute : !opt->is_strictroute; if (found) - *offset = opt->srr + start; + *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; - *offset = opt->rr + start; + *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; - *offset = opt->router_alert + start; + *offset = opt->router_alert; found = true; break; default: @@ -152,8 +158,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -207,9 +212,10 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, offset = i + priv->offset; if (priv->flags & NFT_EXTHDR_F_PRESENT) { - *dest = 1; + nft_reg_store8(dest, 1); } else { - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, priv->len); } @@ -237,7 +243,12 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (!tcph) goto err; + if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) + goto err; + + tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { __be16 v16; @@ -252,15 +263,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; - if (skb_ensure_writable(pkt->skb, - nft_thoff(pkt) + i + priv->len)) - goto err; - - tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, - &tcphdr_len); - if (!tcph) - goto err; - offset = i + priv->offset; switch (priv->len) { @@ -324,9 +326,9 @@ static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto drop; - opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); - if (!opt) - goto 
err; + tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { unsigned int j; @@ -391,9 +393,8 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, offset + ntohs(sch->length) > pkt->skb->len) break; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset + priv->offset, - dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) break; return; } @@ -406,13 +407,91 @@ err: regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NFT_EXTHDR_DCCP +static void nft_exthdr_dccp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int thoff, dataoff, optoff, optlen, i; + u32 *dest = &regs->data[priv->dreg]; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) + goto err; + + thoff = nft_thoff(pkt); + + dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); + if (!dh) + goto err; + + dataoff = dh->dccph_doff * sizeof(u32); + optoff = __dccp_hdr_len(dh); + if (dataoff <= optoff) + goto err; + + optlen = dataoff - optoff; + + for (i = 0; i < optlen; ) { + /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in + * length; the remaining options are at least 2B long. In + * all cases, the first byte contains the option type. In + * multi-byte options, the second byte contains the option + * length, which must be at least two: 1 for the type plus 1 for + * the length plus 0-253 for any following option data. We + * aren't interested in the option data, only the type and the + * length, so we don't need to read more than two bytes at a + * time. 
+ */ + unsigned int buflen = optlen - i; + u8 buf[2], *bufp; + u8 type, len; + + if (buflen > sizeof(buf)) + buflen = sizeof(buf); + + bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, + &buf); + if (!bufp) + goto err; + + type = bufp[0]; + + if (type == priv->type) { + nft_reg_store8(dest, 1); + return; + } + + if (type <= DCCPO_MAX_RESERVED) { + i++; + continue; + } + + if (buflen < 2) + goto err; + + len = bufp[1]; + + if (len < 2) + goto err; + + i += len; + } + +err: + *dest = 0; +} +#endif + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, - [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, - [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; @@ -509,7 +588,7 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->flags = flags; priv->op = op; - return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg, + return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg, priv->len); } @@ -557,6 +636,24 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +#ifdef CONFIG_NFT_EXTHDR_DCCP +static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) + return -EOPNOTSUPP; + + return 0; +} +#endif + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -686,6 +783,17 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +#ifdef CONFIG_NFT_EXTHDR_DCCP +static const struct nft_expr_ops nft_exthdr_dccp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_dccp_eval, + .init = nft_exthdr_dccp_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; +#endif + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -720,6 +828,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; +#ifdef CONFIG_NFT_EXTHDR_DCCP + case NFT_EXTHDR_OP_DCCP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_dccp_ops; + break; +#endif } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 6e049fd48760..96e02a83c045 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -14,19 +14,19 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) + const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { [NFTA_FIB_DREG] = { .type = NLA_U32 }, [NFTA_FIB_RESULT] = { .type = NLA_U32 }, - [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL), }; EXPORT_SYMBOL(nft_fib_policy); -#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ - NFTA_FIB_F_PRESENT) - -int 
nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_fib *priv = nft_expr_priv(expr); unsigned int hooks; @@ -34,11 +34,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); - if (priv->flags & NFTA_FIB_F_IIF) { - hooks |= (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD); - } + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) @@ -77,7 +75,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); - if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + if (priv->flags == 0) return -EINVAL; if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == @@ -144,13 +142,17 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, switch (priv->result) { case NFT_FIB_RESULT_OIF: index = dev ? dev->ifindex : 0; - *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; + if (priv->flags & NFTA_FIB_F_PRESENT) + nft_reg_store8(dreg, !!index); + else + *dreg = index; + break; case NFT_FIB_RESULT_OIFNAME: if (priv->flags & NFTA_FIB_F_PRESENT) - *dreg = !!dev; + nft_reg_store8(dreg, !!dev); else - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); @@ -203,4 +205,5 @@ bool nft_fib_reduce(struct nft_regs_track *track, EXPORT_SYMBOL_GPL(nft_fib_reduce); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Query routing table from nftables"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
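Folding the flag check into the policy via NLA_POLICY_MASK means an unknown NFTA_FIB_FLAGS bit is now rejected during netlink attribute validation, before nft_fib_init() runs; only the flags == 0 case still needs an explicit test there. A userspace model of the resulting split (the flag values follow the nf_tables UAPI, but this is an illustrative sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define NFTA_FIB_F_SADDR	(1u << 0)
#define NFTA_FIB_F_DADDR	(1u << 1)
#define NFTA_FIB_F_MARK		(1u << 2)
#define NFTA_FIB_F_IIF		(1u << 3)
#define NFTA_FIB_F_OIF		(1u << 4)
#define NFTA_FIB_F_PRESENT	(1u << 5)

#define NFTA_FIB_F_ALL	(NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
			 NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | \
			 NFTA_FIB_F_OIF | NFTA_FIB_F_PRESENT)

static int validate_flags(uint32_t flags)
{
	if (flags & ~NFTA_FIB_F_ALL)
		return -1;	/* policy mask: rejected at parse time */
	if (flags == 0)
		return -1;	/* still checked in nft_fib_init() */
	return 0;
}

int main(void)
{
	printf("%d\n", validate_flags(NFTA_FIB_F_OIF | NFTA_FIB_F_PRESENT));
	printf("%d\n", validate_flags(1u << 31));
	return 0;
}

diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e860d8fe0e5e..b8f76c9057fd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -8,7 +8,8 @@ #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> -#include <net/ip.h> /* for ipv4 options. 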
*/ +#include <net/ip.h> +#include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> @@ -19,253 +20,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) -{ - if (dst_xfrm(dst)) - return FLOW_OFFLOAD_XMIT_XFRM; - - return FLOW_OFFLOAD_XMIT_NEIGH; -} - -static void nft_default_forward_path(struct nf_flow_route *route, - struct dst_entry *dst_cache, - enum ip_conntrack_dir dir) -{ - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; - route->tuple[dir].dst = dst_cache; - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); -} - -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - -static int nft_dev_fill_forward_path(const struct nf_flow_route *route, - const struct dst_entry *dst_cache, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, u8 *ha, - struct net_device_path_stack *stack) -{ - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; - struct net_device *dev = dst_cache->dev; - struct neighbour *n; - u8 nud_state; - - if (!nft_is_valid_ether_device(dev)) - goto out; - - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -1; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); - neigh_release(n); - - if (!(nud_state & NUD_VALID)) - return -1; - -out: - return dev_fill_forward_path(dev, ha, stack); -} - -struct nft_forward_info { - const struct net_device *indev; - const struct net_device *outdev; - const struct net_device *hw_outdev; - struct id { - __u16 id; - __be16 proto; - } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; - u8 ingress_vlans; - u8 h_source[ETH_ALEN]; - u8 h_dest[ETH_ALEN]; - enum flow_offload_xmit_type xmit_type; -}; - -static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info, - unsigned char *ha, struct nf_flowtable *flowtable) -{ - const struct net_device_path *path; - int i; - - memcpy(info->h_dest, ha, ETH_ALEN); - - for (i = 0; i < stack->num_paths; i++) { - path = &stack->path[i]; - switch (path->type) { - case DEV_PATH_ETHERNET: - case DEV_PATH_DSA: - case DEV_PATH_VLAN: - case DEV_PATH_PPPOE: - info->indev = path->dev; - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - if (path->type == DEV_PATH_ETHERNET) - break; - if (path->type == DEV_PATH_DSA) { - i = stack->num_paths; - break; - } - - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; - } - if (!info->outdev) - info->outdev = path->dev; - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - break; - case DEV_PATH_BRIDGE: - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - switch (path->bridge.vlan_mode) { - case DEV_PATH_BR_VLAN_UNTAG_HW: - info->ingress_vlans |= BIT(info->num_encaps - 1); - break; - case DEV_PATH_BR_VLAN_TAG: - info->encap[info->num_encaps].id = path->bridge.vlan_id; - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; - 
info->num_encaps++; - break; - case DEV_PATH_BR_VLAN_UNTAG: - info->num_encaps--; - break; - case DEV_PATH_BR_VLAN_KEEP: - break; - } - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; - break; - default: - info->indev = NULL; - break; - } - } - if (!info->outdev) - info->outdev = info->indev; - - info->hw_outdev = info->indev; - - if (nf_flowtable_hw_offload(flowtable) && - nft_is_valid_ether_device(info->indev)) - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; -} - -static bool nft_flowtable_find_dev(const struct net_device *dev, - struct nft_flowtable *ft) -{ - struct nft_hook *hook; - bool found = false; - - list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - found = true; - break; - } - - return found; -} - -static void nft_dev_forward_path(struct nf_flow_route *route, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - const struct dst_entry *dst = route->tuple[dir].dst; - struct net_device_path_stack stack; - struct nft_forward_info info = {}; - unsigned char ha[ETH_ALEN]; - int i; - - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha, &ft->data); - - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) - return; - - route->tuple[!dir].in.ifindex = info.indev->ifindex; - for (i = 0; i < info.num_encaps; i++) { - route->tuple[!dir].in.encap[i].id = info.encap[i].id; - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; - } - route->tuple[!dir].in.num_encaps = info.num_encaps; - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; - - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; - route->tuple[dir].xmit_type = info.xmit_type; - } -} - -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos); - fl.u.ip4.flowi4_mark = pkt->skb->mark; - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); - fl.u.ip6.flowi6_mark = pkt->skb->mark; - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; - break; - } - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) - return -ENOENT; - - nft_default_forward_path(route, this_dst, dir); - nft_default_forward_path(route, other_dst, !dir); - - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { - nft_dev_forward_path(route, ct, dir, ft); - nft_dev_forward_path(route, ct, !dir, ft); - } - - return 0; -} - static bool 
nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) @@ -283,6 +37,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family) return false; } +static void flow_offload_ct_tcp(struct nf_conn *ct) +{ + /* conntrack will not see all packets, disable tcp window validation. */ + spin_lock_bh(&ct->lock); + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + spin_unlock_bh(&ct->lock); +} + static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -349,24 +112,21 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; - if (flow_offload_route_init(flow, &route) < 0) - goto err_flow_add; - - if (tcph) { - ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - } + flow_offload_route_init(flow, &route); + if (tcph) + flow_offload_ct_tcp(ct); + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; - dst_release(route.tuple[!dir].dst); return; err_flow_add: flow_offload_free(flow); err_flow_alloc: + dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); @@ -375,11 +135,15 @@ out: } static int nft_flow_offload_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, hook_mask); } @@ -399,13 +163,15 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); + if (!nft_use_inc(&flowtable->use)) + return -EMFILE; + priv->flowtable = flowtable; - flowtable->use++; return nf_ct_netns_get(ctx->net, ctx->family); } @@ -424,7 +190,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use++; + nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 7b9d4d1bd17c..152a9fb4d23a 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -40,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, - [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, @@ -52,7 +52,7 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); } @@ -178,12 +178,12 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, 
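/*
 * Several hunks in this series replace bare "x->use++" bumps with
 * nft_use_inc()/nft_use_dec(), as in the flowtable init above. A
 * plausible shape for such helpers, assumed rather than the exact
 * kernel definitions: the increment refuses to wrap and the caller
 * maps failure to -EMFILE.
 */
#include <stdbool.h>
#include <stdint.h>

#define USE_MAX UINT32_MAX	/* assumed ceiling */

static bool use_inc(uint32_t *use)
{
	if (*use == USE_MAX)
		return false;	/* caller returns -EMFILE */
	(*use)++;
	return true;
}

static void use_dec(uint32_t *use)
{
	(*use)--;	/* underflow here would indicate a refcount bug */
}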
return -EOPNOTSUPP; } - err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); if (err < 0) return err; - return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, addr_len); } @@ -204,8 +204,7 @@ nla_put_failure: } static int nft_fwd_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) | (1 << NF_NETDEV_EGRESS)); @@ -270,4 +269,5 @@ module_exit(nft_fwd_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nftables netdev packet forwarding support"); MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ee8d487b69c0..5d034bbb6913 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; u32 h; - h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb), + priv->modulus); regs->data[priv->dreg] = h + priv->offset; } @@ -59,7 +60,7 @@ static void nft_symhash_eval(const struct nft_expr *expr, static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, - [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, @@ -91,7 +92,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len); if (err < 0) return err; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c9d2f7c29f53..02ee5fb69871 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nft_chain_is_bound(chain)) { - err = -EBUSY; + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) goto err1; - } - chain->bound = true; break; default: break; @@ -98,15 +96,86 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, + struct nft_chain *chain, + enum nft_trans_phase phase) +{ + struct nft_ctx chain_ctx; + struct nft_rule *rule; + + chain_ctx = 
*ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); +} + static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_chain *chain; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + nft_deactivate_next(ctx->net, chain); + break; + case NFT_TRANS_PREPARE: + nft_immediate_chain_deactivate(ctx, chain, phase); + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_immediate_chain_deactivate(ctx, chain, phase); + nft_chain_del(chain); + chain->bound = false; + nft_use_dec(&chain->table->use); + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -131,16 +200,28 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, case NFT_GOTO: chain = data->verdict.chain; - if (!nft_chain_is_bound(chain)) + if (!nft_chain_binding(chain)) + break; + + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + nft_use_dec(&chain->use); break; + } + /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; - list_for_each_entry_safe(rule, n, &chain->rules, list) - nf_tables_rule_release(&chain_ctx, rule); - - nf_tables_chain_destroy(&chain_ctx); + nft_use_dec(&chain->use); + list_for_each_entry_safe(rule, n, &chain->rules, list) { + nft_use_dec(&chain->use); + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } + nf_tables_chain_destroy(chain); break; default: break; @@ -163,8 +244,7 @@ nla_put_failure: } static int nft_immediate_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 28e2873ba24e..c4569d4b9228 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@ #include <linux/ip.h> #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Same layout as nft_expr but it embeds the private expression data area. 
*/ struct __nft_expr { @@ -210,35 +217,71 @@ static int nft_inner_parse(const struct nft_inner *priv, struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { - struct nft_inner_tun_ctx ctx = {}; u32 off = pkt->inneroff; if (priv->flags & NFT_INNER_HDRSIZE && - nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) return -1; if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { - if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) return -1; } else if (priv->flags & NFT_INNER_TH) { - ctx.inner_thoff = off; - ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + tun_ctx->inner_thoff = off; + tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; } - *tun_ctx = ctx; tun_ctx->type = priv->type; + tun_ctx->cookie = (unsigned long)pkt->skb; pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { + local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + return false; + } + *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); + + return true; +} + +static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) + *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); +} + static bool nft_inner_parse_needed(const struct nft_inner *priv, const struct nft_pktinfo *pkt, - const struct nft_inner_tun_ctx *tun_ctx) + struct nft_inner_tun_ctx *tun_ctx) { if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) return true; + if (!nft_inner_restore_tun_ctx(pkt, tun_ctx)) + return true; + if (priv->type != tun_ctx->type) return true; @@ -248,27 +291,29 @@ static bool nft_inner_parse_needed(const struct nft_inner *priv, static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; - if (nft_inner_parse_needed(priv, pkt, tun_ctx) && - nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0) + if (nft_inner_parse_needed(priv, pkt, &tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: - nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; case NFT_INNER_EXPR_META: - nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; default: WARN_ON_ONCE(1); goto err; } + nft_inner_save_tun_ctx(pkt, &tun_ctx); + return; err: regs->verdict.code = NFT_BREAK; @@ -298,6 +343,7 @@ static int nft_inner_init(const struct nft_ctx *ctx, int err; if (!tb[NFTA_INNER_FLAGS] || 
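/*
 * The nft_inner rework above keeps the parsed tunnel context in a
 * per-CPU scratch slot tagged with a cookie (the skb pointer): a later
 * inner-header expression evaluating the same packet reuses the parse,
 * while an entry left behind by a different packet is simply ignored
 * and reparsed. Minimal single-slot model with hypothetical names; the
 * kernel guards the slot with local_lock plus BH disable, omitted here.
 */
#include <stdbool.h>

struct tun_scratch {
	unsigned long cookie;	/* identifies the packet that filled it */
	unsigned int inner_thoff;
	unsigned int flags;
};

static struct tun_scratch scratch;	/* one instance per CPU in-kernel */

static bool scratch_restore(unsigned long cookie, struct tun_scratch *out)
{
	if (scratch.cookie != cookie)
		return false;	/* stale or empty: caller reparses */
	*out = scratch;
	return true;
}

static void scratch_save(const struct tun_scratch *in)
{
	scratch = *in;
}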
+ !tb[NFTA_INNER_NUM] || !tb[NFTA_INNER_HDRSIZE] || !tb[NFTA_INNER_TYPE] || !tb[NFTA_INNER_EXPR]) diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 7f2bda6641bd..de1b6066bfa8 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -102,14 +102,18 @@ static void nft_last_destroy(const struct nft_ctx *ctx, kfree(priv->last); } -static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); + struct nft_last_priv *priv_src = nft_expr_priv(src); - priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); + priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; + priv_dst->last->set = priv_src->last->set; + priv_dst->last->jiffies = priv_src->last->jiffies; + return 0; } diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 145dc62c6247..21d26b79b460 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -58,16 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { - u64 unit, tokens; + u64 unit, tokens, rate_with_burst; + bool invert = false; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + if (priv->rate == 0) + return -EINVAL; + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->nsecs = unit * NSEC_PER_SEC; - if (priv->rate == 0 || priv->nsecs < unit) + if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) @@ -76,18 +79,35 @@ static int nft_limit_init(struct nft_limit_priv *priv, if (pkts && priv->burst == 0) priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; - if (priv->rate + priv->burst < priv->rate) + if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) return -EOVERFLOW; if (pkts) { - tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst; + u64 tmp = div64_u64(priv->nsecs, priv->rate); + + if (check_mul_overflow(tmp, priv->burst, &tokens)) + return -EOVERFLOW; } else { + u64 tmp; + /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. 
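/*
 * Worked example of the nft_limit bucket sizing above, with the same
 * overflow discipline (__builtin_*_overflow standing in for
 * check_mul_overflow/check_add_overflow). For instance rate=10,
 * burst=5, unit=1s gives nsecs=1e9 and, in byte mode,
 * tokens = 1e9 * (10 + 5) / 10 = 1.5e9.
 */
#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

static bool limit_tokens(uint64_t rate, uint64_t burst, uint64_t unit,
			 bool pkts, uint64_t *tokens)
{
	uint64_t nsecs, rate_with_burst, tmp;

	if (rate == 0)
		return false;
	if (__builtin_mul_overflow(unit, NSEC_PER_SEC, &nsecs) ||
	    __builtin_add_overflow(rate, burst, &rate_with_burst))
		return false;

	if (pkts)	/* tokens = (nsecs / rate) * burst */
		return !__builtin_mul_overflow(nsecs / rate, burst, tokens);

	/* tokens_max = unit * (rate + burst) / rate, checked stepwise */
	if (__builtin_mul_overflow(nsecs, rate_with_burst, &tmp))
		return false;
	*tokens = tmp / rate;
	return true;
}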
*/ - tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst), - priv->rate); + if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) + return -EOVERFLOW; + + tokens = div64_u64(tmp, priv->rate); + } + + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + + if (flags & ~NFT_LIMIT_F_INV) + return -EOPNOTSUPP; + + if (flags & NFT_LIMIT_F_INV) + invert = true; } priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); @@ -96,13 +116,7 @@ static int nft_limit_init(struct nft_limit_priv *priv, priv->limit->tokens = tokens; priv->tokens_max = priv->limit->tokens; - - if (tb[NFTA_LIMIT_FLAGS]) { - u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); - - if (flags & NFT_LIMIT_F_INV) - priv->invert = true; - } + priv->invert = invert; priv->limit->last = ktime_get_ns(); spin_lock_init(&priv->limit->lock); @@ -136,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx, } static int nft_limit_clone(struct nft_limit_priv *priv_dst, - const struct nft_limit_priv *priv_src) + const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; @@ -144,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; @@ -209,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, &priv->limit); } -static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; - return nft_limit_clone(&priv_dst->limit, &priv_src->limit); + return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; @@ -267,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, priv); } -static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); - return nft_limit_clone(priv_dst, priv_src); + return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5defe6e4fd98..e35588137995 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -163,7 +163,7 @@ static int nft_log_init(const struct nft_ctx *ctx, nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { - priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); if (priv->prefix == NULL) return -ENOMEM; nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index cae5a6724163..fc2d7c5d83c8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -19,40 +19,78 @@ struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; + bool dreg_set; bool invert; struct nft_set_binding binding; }; -#ifdef CONFIG_RETPOLINE -bool nft_set_do_lookup(const 
struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) - return nft_hash_lookup_fast(net, set, key, ext); + return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) - return nft_hash_lookup(net, set, key, ext); + return nft_hash_lookup(net, set, key); if (set->ops == &nft_set_rhash_type.ops) - return nft_rhash_lookup(net, set, key, ext); + return nft_rhash_lookup(net, set, key); if (set->ops == &nft_set_bitmap_type.ops) - return nft_bitmap_lookup(net, set, key, ext); + return nft_bitmap_lookup(net, set, key); if (set->ops == &nft_set_pipapo_type.ops) - return nft_pipapo_lookup(net, set, key, ext); + return nft_pipapo_lookup(net, set, key); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) - return nft_pipapo_avx2_lookup(net, set, key, ext); + return nft_pipapo_avx2_lookup(net, set, key); #endif if (set->ops == &nft_set_rbtree_type.ops) - return nft_rbtree_lookup(net, set, key, ext); + return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); - return set->ops->lookup(net, set, key, ext); +#endif + return set->ops->lookup(net, set, key); +} + +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. 
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -60,12 +98,12 @@ void nft_lookup_eval(const struct nft_expr *expr, { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; - const struct nft_set_ext *ext = NULL; const struct net *net = nft_net(pkt); + const struct nft_set_ext *ext; bool found; - found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext) ^ - priv->invert; + ext = nft_set_do_lookup(net, set, ®s->data[priv->sreg]); + found = !!ext ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { @@ -75,7 +113,7 @@ void nft_lookup_eval(const struct nft_expr *expr, } if (ext) { - if (set->flags & NFT_SET_MAP) + if (priv->dreg_set) nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); @@ -89,7 +127,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, - [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -111,7 +150,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, set->klen); if (err < 0) return err; @@ -119,14 +158,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); - if (flags & ~NFT_LOOKUP_F_INV) - return -EINVAL; - - if (flags & NFT_LOOKUP_F_INV) { - if (set->flags & NFT_SET_MAP) - return -EINVAL; + if (flags & NFT_LOOKUP_F_INV) priv->invert = true; - } } if (tb[NFTA_LOOKUP_DREG] != NULL) { @@ -136,12 +169,22 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], - &priv->dreg, NULL, set->dtype, + &priv->dreg, NULL, + nft_set_datatype(set), set->dlen); if (err < 0) return err; - } else if (set->flags & NFT_SET_MAP) - return -EINVAL; + priv->dreg_set = true; + } else if (set->flags & NFT_SET_MAP) { + /* Map given, but user asks for lookup only (i.e. to + * ignore value assoicated with key). + * + * This makes no sense for anonymous maps since they are + * scoped to the rule, but for named sets this can be useful. 
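/*
 * The retry loop above is a seqcount-style pattern: nf_tables_commit()
 * publishes the new generation with smp_store_release(), and a reader
 * that saw no match re-checks the counter and retries, since the miss
 * may have raced with a matching element moving to the new generation.
 * A hit never needs the retry. C11-atomics sketch with assumed names:
 */
#include <stdatomic.h>
#include <stdint.h>

extern _Atomic unsigned int base_seq;			/* bumped by commit */
extern const void *lookup_once(const uint32_t *key);	/* one generation */

static const void *lookup_stable(const uint32_t *key)
{
	const void *ext;
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&base_seq, memory_order_acquire);
		ext = lookup_once(key);
		if (ext)
			break;	/* positive lookups are always valid */
	} while (seq != atomic_load_explicit(&base_seq, memory_order_acquire));

	return ext;
}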
+ */ + if (set->flags & NFT_SET_ANONYMOUS) + return -EINVAL; + } priv->binding.flags = set->flags & NFT_SET_MAP; @@ -167,7 +210,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx, { struct nft_lookup *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, @@ -188,7 +231,7 @@ static int nft_lookup_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; - if (priv->set->flags & NFT_SET_MAP) + if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) @@ -199,55 +242,24 @@ nla_put_failure: return -1; } -static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - struct nft_ctx *pctx = (struct nft_ctx *)ctx; - const struct nft_data *data; - int err; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - pctx->level++; - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - pctx->level--; - break; - default: - break; - } - - return 0; -} - static int nft_lookup_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_setelem_validate, + }; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nft_lookup_validate_setelem; - priv->set->ops->walk(ctx, priv->set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_validate(ctx, priv->set); + if (iter.err < 0) return iter.err; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index e55e455275c4..868bd4d73555 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -20,14 +20,14 @@ struct nft_masq { }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; static int nft_masq_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -43,24 +43,21 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_proto.all); struct nft_masq *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_MASQ_FLAGS]) { + if (tb[NFTA_MASQ_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN], 
&priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) @@ -96,23 +93,39 @@ nla_put_failure: return -1; } -static void nft_masq_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_masq_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_masq *priv = nft_expr_priv(expr); + const struct nft_masq *priv = nft_expr_priv(expr); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); + range.min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); + } + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, + nft_hook(pkt), + &range, + nft_out(pkt)); + break; +#ifdef CONFIG_NF_TABLES_IPV6 + case NFPROTO_IPV6: + regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, + nft_out(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + break; } - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), - &range, nft_out(pkt)); } static void @@ -125,7 +138,7 @@ static struct nft_expr_type nft_masq_ipv4_type; static const struct nft_expr_ops nft_masq_ipv4_ops = { .type = &nft_masq_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_ipv4_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, @@ -143,25 +156,6 @@ static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { }; #ifdef CONFIG_NF_TABLES_IPV6 -static void nft_masq_ipv6_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - range.flags = priv->flags; - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - } - regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, - nft_out(pkt)); -} - static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -172,7 +166,7 @@ static struct nft_expr_type nft_masq_ipv6_type; static const struct nft_expr_ops nft_masq_ipv6_ops = { .type = &nft_masq_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_ipv6_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_ipv6_destroy, .dump = nft_masq_dump, @@ -204,20 +198,6 @@ static inline void nft_masq_module_exit_ipv6(void) {} #endif #ifdef CONFIG_NF_TABLES_INET -static void nft_masq_inet_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - return nft_masq_ipv4_eval(expr, regs, pkt); - case NFPROTO_IPV6: - return nft_masq_ipv6_eval(expr, regs, pkt); - } - - WARN_ON_ONCE(1); -} - static void nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ 
-228,7 +208,7 @@ static struct nft_expr_type nft_masq_inet_type; static const struct nft_expr_ops nft_masq_inet_ops = { .type = &nft_masq_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_inet_eval, + .eval = nft_masq_eval, .init = nft_masq_init, .destroy = nft_masq_inet_destroy, .dump = nft_masq_dump, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e384e0de7a54..05cd1e6e6a2f 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); @@ -185,12 +185,12 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, case NFT_META_IIFKIND: if (!in || !in->rtnl_link_ops) return false; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_OIFKIND: if (!out || !out->rtnl_link_ops) return false; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; default: return false; @@ -206,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) { - strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ); } static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) @@ -458,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval); const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, - [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_META_SREG] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_meta_policy); @@ -581,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) } static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -600,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, } int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -657,7 +655,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; @@ -839,6 +837,9 @@ static int nft_meta_inner_init(const struct nft_ctx *ctx, struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; + if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) + return -EINVAL; + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: @@ -951,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_SECMARK_CTX] == NULL) return -EINVAL; - priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); if (!priv->ctx) return -ENOMEM; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 047999150390..6e21f72c5b57 100644 --- 
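/*
 * The strncpy() -> strscpy_pad() conversions above matter because the
 * destination is a fixed-width register blob compared byte-for-byte:
 * strncpy() does not guarantee NUL termination on truncation, while
 * strscpy_pad() truncates safely and zero-fills the tail. Userspace
 * sketch of the padded copy:
 */
#include <stddef.h>
#include <string.h>

static void copy_pad(char *dst, const char *src, size_t size)
{
	size_t n = strnlen(src, size - 1);	/* leave room for the NUL */

	memcpy(dst, src, n);
	memset(dst + n, 0, size - n);		/* NUL plus zero padding */
}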
a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -132,16 +132,21 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; @@ -208,13 +213,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) @@ -226,15 +231,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_MAP_IPS; } - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) @@ -246,11 +251,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_NAT_FLAGS]) { + if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EOPNOTSUPP; - } return nf_ct_netns_get(ctx->net, family); } diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 7d29db7c2ac0..bd058babfc82 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL); + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); if (!priv->counter) return -ENOMEM; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7b01aa2ef653..1a62e384766a 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -13,15 +13,44 @@ #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) -static void nft_objref_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); obj->ops->eval(obj, regs, pkt); } +static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type) +{ + 
unsigned int hooks; + + switch (type) { + case NFT_OBJECT_SYNPROXY: + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hooks); + default: + break; + } + + return 0; +} + +static int nft_objref_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + return nft_objref_validate_obj_type(ctx, obj->ops->type->type); +} + static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -41,8 +70,10 @@ static int nft_objref_init(const struct nft_ctx *ctx, if (IS_ERR(obj)) return -ENOENT; + if (!nft_use_inc(&obj->use)) + return -EMFILE; + nft_objref_priv(expr) = obj; - obj->use++; return 0; } @@ -72,7 +103,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx, if (phase == NFT_TRANS_COMMIT) return; - obj->use--; + nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, @@ -80,7 +111,7 @@ static void nft_objref_activate(const struct nft_ctx *ctx, { struct nft_object *obj = nft_objref_priv(expr); - obj->use++; + nft_use_inc_restore(&obj->use); } static const struct nft_expr_ops nft_objref_ops = { @@ -91,6 +122,7 @@ static const struct nft_expr_ops nft_objref_ops = { .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, + .validate = nft_objref_validate, .reduce = NFT_REDUCE_READONLY, }; @@ -100,19 +132,18 @@ struct nft_objref_map { struct nft_set_binding binding; }; -static void nft_objref_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; - bool found; - found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); - if (!found) { + ext = nft_set_do_lookup(net, set, ®s->data[priv->sreg]); + if (!ext) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; @@ -141,7 +172,7 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg, set->klen); if (err < 0) return err; @@ -185,7 +216,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx, { struct nft_objref_map *priv = nft_expr_priv(expr); - priv->set->use++; + nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, @@ -196,6 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } +static int nft_objref_map_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + return nft_objref_validate_obj_type(ctx, priv->set->objtype); +} + static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -205,6 +244,7 @@ static const struct nft_expr_ops nft_objref_map_ops = { .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = 
nft_objref_map_dump, + .validate = nft_objref_map_validate, .reduce = NFT_REDUCE_READONLY, }; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 70820c66b591..1c0b493ef0a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -23,7 +23,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; - char os_match[NFT_OSF_MAXGENRELEN + 1]; + char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; @@ -45,7 +45,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { - strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", @@ -53,7 +53,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, else strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); - strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } @@ -63,7 +63,6 @@ static int nft_osf_init(const struct nft_ctx *ctx, { struct nft_osf *priv = nft_expr_priv(expr); u32 flags; - int err; u8 ttl; if (!tb[NFTA_OSF_DREG]) @@ -83,13 +82,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, - NULL, NFT_DATA_VALUE, - NFT_OSF_MAXGENRELEN); - if (err < 0) - return err; - - return 0; + return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); } static int nft_osf_dump(struct sk_buff *skb, @@ -113,8 +108,7 @@ nla_put_failure: } static int nft_osf_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { unsigned int hooks; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 3a3c7746e88f..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,41 +40,32 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 
+= ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; @@ -154,6 +145,17 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) +{ + unsigned int boundary = offset + len; + + /* data past ether src/dst requested, copy needed */ + if (boundary > offsetof(struct ethhdr, h_proto)) + return true; + + return false; +} + void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -168,10 +170,11 @@ void nft_payload_eval(const struct nft_expr *expr, switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: - if (!skb_mac_header_was_set(skb)) + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0) goto err; - if (skb_vlan_tag_present(skb)) { + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -209,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -647,6 +650,10 @@ static int nft_payload_inner_init(const struct nft_ctx *ctx, struct nft_payload *priv = nft_expr_priv(expr); u32 base; + if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || + !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) + return -EINVAL; + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_TUN_HEADER: @@ -677,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = { static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } @@ -790,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -798,21 +805,79 @@ struct nft_payload_set { u8 csum_flags; }; +/* This is not struct vlan_hdr. 
*/ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = ®s->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); @@ -839,6 +904,9 @@ static void nft_payload_set_eval(const struct nft_expr *expr, ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER && priv->base != NFT_PAYLOAD_INNER_HEADER) || skb->ip_summed != CHECKSUM_PARTIAL)) { + if (offset + priv->len > skb->len) + goto err; + fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); @@ -872,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -916,7 +988,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, } priv->csum_type = csum_type; - return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, + return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg, priv->len); } @@ -1001,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); diff --git a/net/netfilter/nft_queue.c 
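/*
 * Sketch of the offset dispatch introduced in nft_payload_set_vlan()
 * above: when hardware offload stripped the VLAN tag, a payload write
 * targeting the tag words is redirected into skb metadata instead of
 * linear data; offsets past h_vlan_encapsulated_proto fall through to
 * the normal path (the caller then shifts the write by VLAN_HLEN).
 * Standalone model with assumed field layout; the kernel also accepts
 * a 4-byte write covering proto plus TCI, omitted here.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct vlan_eth_model {
	uint8_t  h_dest[6];
	uint8_t  h_source[6];
	uint16_t h_vlan_proto;			/* offset 12 */
	uint16_t h_vlan_TCI;			/* offset 14 */
	uint16_t h_vlan_encapsulated_proto;	/* offset 16 */
};

struct skb_vlan_meta { uint16_t proto; uint16_t tci; };

static bool set_vlan_field(struct skb_vlan_meta *m, size_t off, size_t len,
			   const uint8_t *src, bool *linear)
{
	*linear = false;
	if (off >= offsetof(struct vlan_eth_model, h_vlan_encapsulated_proto)) {
		*linear = true;	/* not tag data: patch the linear skb data */
		return true;
	}
	if (off == offsetof(struct vlan_eth_model, h_vlan_proto) && len == 2) {
		m->proto = (uint16_t)(src[0] << 8 | src[1]);	/* wire->host */
		return true;
	}
	if (off == offsetof(struct vlan_eth_model, h_vlan_TCI) && len == 2) {
		m->tci = (uint16_t)(src[0] << 8 | src[1]);	/* wire->host */
		return true;
	}
	return false;	/* odd sizes/offsets within the tag: reject */
}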
b/net/netfilter/nft_queue.c index b2b8127c8d43..344fe311878f 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -69,8 +69,7 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr, } static int nft_queue_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -136,7 +135,7 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], + err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM], &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 123578e28917..df0798da2329 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -19,10 +19,16 @@ struct nft_quota { }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); @@ -233,15 +239,19 @@ static void nft_quota_destroy(const struct nft_ctx *ctx, return nft_quota_do_destroy(ctx, priv); } -static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_quota *priv_dst = nft_expr_priv(dst); + struct nft_quota *priv_src = nft_expr_priv(src); + + priv_dst->quota = priv_src->quota; + priv_dst->flags = priv_src->flags; - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); if (!priv_dst->consumed) return -ENOMEM; - atomic64_set(priv_dst->consumed, 0); + *priv_dst->consumed = *priv_src->consumed; return 0; } diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 0566d6aaf1e5..ea382f7bbd78 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr, static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { [NFTA_RANGE_SREG] = { .type = NLA_U32 }, - [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, 
[NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, }; @@ -83,7 +83,7 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr goto err2; } - err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg, desc_from.len); if (err < 0) goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5f7739987559..95eedad85c83 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -22,12 +22,12 @@ struct nft_redir { static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -48,15 +48,15 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) @@ -64,13 +64,12 @@ static int nft_redir_init(const struct nft_ctx *ctx, } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_REDIR_FLAGS]) { + if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } return nf_ct_netns_get(ctx->net, ctx->family); } @@ -99,25 +98,37 @@ nla_put_failure: return -1; } -static void nft_redir_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_redir_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_ipv4_multi_range_compat mr; + const struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_range2 range; - memset(&mr, 0, sizeof(mr)); + memset(&range, 0, sizeof(range)); + range.flags = priv->flags; if (priv->sreg_proto_min) { - mr.range[0].min.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - mr.range[0].max.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); } - mr.range[0].flags |= priv->flags; - - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range, + nft_hook(pkt)); + break; +#ifdef CONFIG_NF_TABLES_IPV6 + case NFPROTO_IPV6: + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, + nft_hook(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + break; + } } static void @@ -130,7 +141,7 @@ static struct 
nft_expr_type nft_redir_ipv4_type; static const struct nft_expr_ops nft_redir_ipv4_ops = { .type = &nft_redir_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv4_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, @@ -148,28 +159,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { }; #ifdef CONFIG_NF_TABLES_IPV6 -static void nft_redir_ipv6_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - - range.flags |= priv->flags; - - regs->verdict.code = - nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); -} - static void nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -180,7 +169,7 @@ static struct nft_expr_type nft_redir_ipv6_type; static const struct nft_expr_ops nft_redir_ipv6_ops = { .type = &nft_redir_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv6_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, @@ -199,20 +188,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { #endif #ifdef CONFIG_NF_TABLES_INET -static void nft_redir_inet_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - return nft_redir_ipv4_eval(expr, regs, pkt); - case NFPROTO_IPV6: - return nft_redir_ipv6_eval(expr, regs, pkt); - } - - WARN_ON_ONCE(1); -} - static void nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -223,7 +198,7 @@ static struct nft_expr_type nft_redir_inet_type; static const struct nft_expr_ops nft_redir_inet_ops = { .type = &nft_redir_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_inet_eval, + .eval = nft_redir_eval, .init = nft_redir_init, .destroy = nft_redir_inet_destroy, .dump = nft_redir_dump, @@ -236,7 +211,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, - .maxattr = NFTA_MASQ_MAX, + .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f2addc844dd2..196a92c7ea09 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -18,14 +18,13 @@ #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 973fa31a9dd6..49020e67304a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -61,8 +61,7 @@ static void nft_reject_inet_eval(const struct 
nft_expr *expr, } static int nft_reject_inet_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c index 7865cd8b11bb..2558ce1505d9 100644 --- a/net/netfilter/nft_reject_netdev.c +++ b/net/netfilter/nft_reject_netdev.c @@ -145,8 +145,7 @@ out: } static int nft_reject_netdev_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); } diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 5990fdd7b3cc..dc50b9a5bd68 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; - memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; @@ -104,7 +104,7 @@ err: static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = { .type = NLA_U32 }, - [NFTA_RT_KEY] = { .type = NLA_U32 }, + [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_rt_get_init(const struct nft_ctx *ctx, @@ -160,12 +160,16 @@ nla_put_failure: return -1; } -static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->key) { case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 96081ac8d2b4..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -13,6 +13,7 @@ #include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { + struct nft_elem_priv priv; struct list_head head; struct nft_set_ext ext; }; @@ -74,26 +75,33 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) } INDIRECT_CALLABLE_SCOPE -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { const struct nft_bitmap *priv = nft_set_priv(set); + static const struct nft_set_ext found; u8 genmask = nft_genmask_cur(net); u32 idx, off; nft_bitmap_location(set, key, &idx, &off); - return nft_bitmap_active(priv->bitmap, idx, off, genmask); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return &found; + + return NULL; } static struct nft_bitmap_elem * -nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, +nft_bitmap_elem_find(const struct net *net, + const struct nft_set *set, struct nft_bitmap_elem *this, u8 genmask) { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + 
list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(net)->commit_mutex)) { if (memcmp(nft_set_ext_key(&be->ext), nft_set_ext_key(&this->ext), set->klen) || !nft_set_elem_active(&be->ext, genmask)) @@ -104,8 +112,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } -static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -116,23 +125,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, !nft_set_elem_active(&be->ext, genmask)) continue; - return be; + return &be->priv; } return ERR_PTR(-ENOENT); } static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; - be = nft_bitmap_elem_find(set, new, genmask); + be = nft_bitmap_elem_find(net, set, new, genmask); if (be) { - *ext = &be->ext; + *elem_priv = &be->priv; return -EEXIST; } @@ -144,12 +153,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, return 0; } -static void nft_bitmap_remove(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_bitmap_remove(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -161,47 +169,46 @@ static void nft_bitmap_remove(const struct net *net, static void nft_bitmap_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } -static bool nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *_be) +static void nft_bitmap_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_bitmap_elem *be = _be; u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 10 state, similar to deactivation. 
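 *
 * [Editor's note: an illustrative sketch of the two-bit element state
 * used by this backend, inferred from the "Enter 11/10 state"
 * transitions in this file; treat it as orientation, not as the
 * authoritative state diagram:
 *
 *	00	inactive in both generations (free slot)
 *	01/10	active in exactly one generation
 *	11	active in both generations
 *
 *	priv->bitmap[idx] |=  (genmask << off);	activate for next gen
 *	priv->bitmap[idx] &= ~(genmask << off);	deactivate for next gen
 *
 * where genmask = nft_genmask_next(net) selects the bit of the
 * generation being prepared by the current transaction.]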
*/ priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - - return true; } -static void *nft_bitmap_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; nft_bitmap_location(set, elem->key.val.data, &idx, &off); - be = nft_bitmap_elem_find(set, this, genmask); + be = nft_bitmap_elem_find(net, set, this, genmask); if (!be) return NULL; @@ -209,7 +216,7 @@ static void *nft_bitmap_deactivate(const struct net *net, priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - return be; + return &be->priv; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -218,17 +225,13 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - struct nft_set_elem elem; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; - elem.priv = be; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &be->priv); if (iter->err < 0) return; @@ -265,19 +268,22 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, &be->priv); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 76de6c8d9865..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -24,10 +24,14 @@ struct nft_rhash { struct rhashtable ht; struct delayed_work gc_work; + u32 wq_gc_seq; }; struct nft_rhash_elem { + struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; + u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -35,6 +39,7 @@ struct nft_rhash_cmp_arg { const struct nft_set *set; const u32 *key; u8 genmask; + u64 tstamp; }; static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed) @@ -59,7 +64,9 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; - if (nft_set_elem_expired(&he->ext)) + if (nft_set_elem_is_dead(&he->ext)) + return 1; + if (__nft_set_elem_expired(&he->ext, x->tstamp)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) return 1; @@ -75,8 +82,9 @@ static const struct rhashtable_params nft_rhash_params = { }; INDIRECT_CALLABLE_SCOPE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct 
nft_set_ext **ext) +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -84,17 +92,19 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - *ext = &he->ext; + return &he->ext; - return !!he; + return NULL; } -static void *nft_rhash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -102,39 +112,40 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = elem->key.val.data, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - return he; + return &he->priv; return ERR_PTR(-ENOENT); } -static bool nft_rhash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) +static const struct nft_set_ext * +nft_rhash_update(struct nft_set *set, const u32 *key, + const struct nft_expr *expr, struct nft_regs *regs) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; + struct nft_elem_priv *elem_priv; struct nft_rhash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; - he = new(set, expr, regs); - if (he == NULL) + elem_priv = nft_dynset_new(set, expr, regs); + if (!elem_priv) goto err1; + he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -142,71 +153,67 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); he = prev; } out: - *ext = &he->ext; - return true; + return &he->ext; err2: - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); err1: - return false; + return NULL; } static int nft_rhash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) return PTR_ERR(prev); if (prev) { - *ext = &prev->ext; + *elem_priv = &prev->priv; return -EEXIST; } return 0; } static void nft_rhash_activate(const struct net *net, const 
struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = elem->priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); + nft_clear(net, &he->ext); } -static bool nft_rhash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rhash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); } -static void *nft_rhash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rhash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -214,25 +221,25 @@ static void *nft_rhash_deactivate(const struct net *net, .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); - return he; + return &he->priv; } static void nft_rhash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } @@ -252,16 +259,17 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; - struct nft_set_elem elem; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -278,14 +286,8 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; - elem.priv = he; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) break; @@ -296,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset 
validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. + */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { @@ -305,7 +398,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, nft_setelem_expr_foreach(expr, elem_expr, size) { if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) return true; } @@ -314,25 +408,60 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; + + /* Elements never collected use a zero gc worker sequence number. */ + if (unlikely(++priv->wq_gc_seq == 0)) + priv->wq_gc_seq++; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + /* Ruleset has been updated, try later. 
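+ *
+ * [Editor's note: the check below is one half of a sequence-number
+ * handshake with the transaction path; sketched roughly, not verbatim:
+ *
+ *	gc_seq = READ_ONCE(nft_net->gc_seq);	sampled before the walk
+ *	... walk table, queue expired elements ...
+ *	if (READ_ONCE(nft_net->gc_seq) != gc_seq)
+ *		abort and retry later;		a commit ran meanwhile
+ *
+ * so the async worker never queues elements for release while a
+ * concurrent transaction might still reference them.]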
*/ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } + /* rhashtable walk is unstable, already seen in this gc run? + * Then, skip this element. In case of (unlikely) sequence + * wraparound and stale element wq_gc_seq, next gc run will + * just find this expired element. + */ + if (he->wq_gc_seq == priv->wq_gc_seq) + continue; + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && nft_rhash_expr_needs_gc_run(set, &he->ext)) goto needs_gc_run; @@ -340,26 +469,28 @@ static void nft_rhash_gc(struct work_struct *work) if (!nft_set_elem_expired(&he->ext)) continue; needs_gc_run: - if (nft_set_elem_mark_busy(&he->ext)) - continue; - - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + /* annotate gc sequence for this attempt. */ + he->wq_gc_seq = priv->wq_gc_seq; + nft_trans_gc_elem_add(gc, he); } + + gc = nft_trans_gc_catchall_async(gc, gc_seq); + +try_later: + /* catchall list iteration requires rcu read side lock. */ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - he = nft_set_catchall_gc(set); - if (he) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, he); - } - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -386,6 +517,8 @@ static int nft_rhash_init(const struct nft_set *set, struct rhashtable_params params = nft_rhash_params; int err; + BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0); + params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT; params.key_len = set->klen; @@ -394,25 +527,37 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + struct nft_rhash_elem *he = ptr; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -445,13 +590,15 @@ struct nft_hash { }; struct nft_hash_elem { + struct nft_elem_priv priv; struct hlist_node node; struct nft_set_ext ext; }; INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = 
nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -462,16 +609,15 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } -static void *nft_hash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -483,15 +629,15 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && nft_set_elem_active(&he->ext, genmask)) - return he; + return &he->priv; } return ERR_PTR(-ENOENT); } INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -504,12 +650,10 @@ bool nft_hash_lookup_fast(const struct net *net, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { k2 = *(u32 *)nft_set_ext_key(&he->ext)->data; if (k1 == k2 && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, @@ -531,9 +675,9 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { - struct nft_hash_elem *this = elem->priv, *he; + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); u32 hash; @@ -543,7 +687,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, if (!memcmp(nft_set_ext_key(&this->ext), nft_set_ext_key(&he->ext), set->klen) && nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; + *elem_priv = &he->priv; return -EEXIST; } } @@ -552,28 +696,28 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, } static void nft_hash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } -static bool nft_hash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_hash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); - return true; } -static void 
*nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_hash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *this = elem->priv, *he; u8 genmask = nft_genmask_next(net); u32 hash; @@ -583,7 +727,7 @@ static void *nft_hash_deactivate(const struct net *net, set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); - return he; + return &he->priv; } } return NULL; @@ -591,9 +735,9 @@ static void *nft_hash_deactivate(const struct net *net, static void nft_hash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); hlist_del_rcu(&he->node); } @@ -603,19 +747,15 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; - struct nft_set_elem elem; int i; for (i = 0; i < priv->buckets; i++) { - hlist_for_each_entry_rcu(he, &priv->table[i], node) { + hlist_for_each_entry_rcu(he, &priv->table[i], node, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; - - elem.priv = he; - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) return; cont: @@ -643,7 +783,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -653,7 +794,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, &he->priv); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 06d46d182634..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -342,9 +342,6 @@ #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); - /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits @@ -362,11 +359,13 @@ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. 
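 *
 * [Editor's note: for orientation, the scan below amounts to the
 * following simplified sketch (bounds and error handling omitted):
 *
 *	for (k = 0; k < len; k++) {
 *		while (map[k]) {
 *			i = k * BITS_PER_LONG + __builtin_ctzl(map[k]);
 *			if (match_only)
 *				return i;
 *			bitmap_set(dst, mt[i].to, mt[i].n);
 *			map[k] &= map[k] - 1;	clear lowest set bit
 *		}
 *	}
 *
 * i.e. every rule that matched this field propagates its mapping table
 * range into the working bitmap of the next field.]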
*/ -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only) +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, + const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; - int k, ret = -1; + unsigned int k; + int ret = -1; for (k = 0; k < len; k++) { bitset = map[k]; @@ -398,41 +397,47 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, } /** - * nft_pipapo_lookup() - Lookup function - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference + * pipapo_get_slow() - Get matching element reference given key data + * @m: storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: timestamp to check for expired elements * * For more details, see DOC: Theory of Operation. * - * Return: true on match, false otherwise. + * This is the main lookup function. It matches key data against either + * the working match set or the uncommitted copy, depending on what the + * caller passed to us. + * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup + * (datapath lookup) pass the active copy. + * The insertion path will pass the uncommitted working copy. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res_map, *fill_map; - u8 genmask = nft_genmask_cur(net); - const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; + unsigned long *res_map, *fill_map, *map; + struct nft_pipapo_scratch *scratch; + const struct nft_pipapo_field *f; bool map_index; int i; local_bh_disable(); - map_index = raw_cpu_read(nft_pipapo_scratch_index); - - m = rcu_dereference(priv->match); - - if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) goto out; + __local_lock_nested_bh(&scratch->bh_lock); + + map_index = scratch->map_index; - res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); - fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res_map = map + (map_index ? m->bsize_max : 0); + fill_map = map + (map_index ? 
0 : m->bsize_max); - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -442,12 +447,12 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, * packet bytes value, then AND bucket value */ if (likely(f->bb == 8)) - pipapo_and_field_buckets_8bit(f, res_map, rp); + pipapo_and_field_buckets_8bit(f, res_map, data); else - pipapo_and_field_buckets_4bit(f, res_map, rp); + pipapo_and_field_buckets_4bit(f, res_map, data); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; - rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); + data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -460,16 +465,19 @@ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); - return false; + return NULL; } if (last) { - *ext = &f->mt[b].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) + struct nft_pipapo_elem *e; + + e = f->mt[b].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) goto next_match; /* Last field: we're just returning the key without @@ -477,10 +485,10 @@ next_match: * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); - - return true; + return e; } /* Swap bitmap indices: res_map is the initial bitmap for the @@ -490,119 +498,202 @@ next_match: map_index = !map_index; swap(res_map, fill_map); - rp += NFT_PIPAPO_GROUPS_PADDING(f); + data += NFT_PIPAPO_GROUPS_PADDING(f); } + __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); - return false; + return NULL; } /** * pipapo_get() - Get matching element reference given key data - * @net: Network namespace - * @set: nftables API set representation + * @m: Storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * - * This is essentially the same as the lookup function, except that it matches - * key data against the uncommitted copy and doesn't use preallocated maps for - * bitmap results. + * This is a dispatcher function, either calling out the generic C + * implementation or, if available, the AVX2 one. + * This helper is only called from the control plane, with either RCU + * read lock or transaction mutex held. * - * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
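+ *
+ * [Editor's note: the dispatch criterion is visible in the body below;
+ * in sketch form, the AVX2 path is taken only when both the CPU
+ * feature and the FPU context allow it:
+ *
+ *	if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable())
+ *		e = pipapo_get_avx2(m, data, genmask, tstamp);
+ *	else
+ *		e = pipapo_get_slow(m, data, genmask, tstamp);
+ *
+ * the irq_fpu_usable() test matters because AVX2 needs the vector
+ * registers, which are not usable from every kernel context.]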
*/ -static struct nft_pipapo_elem *pipapo_get(const struct net *net, - const struct nft_set *set, - const u8 *data, u8 genmask) +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; - unsigned long *res_map, *fill_map = NULL; - struct nft_pipapo_field *f; - int i; + struct nft_pipapo_elem *e; - res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); - if (!res_map) { - ret = ERR_PTR(-ENOMEM); - goto out; - } + local_bh_disable(); - fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); - if (!fill_map) { - ret = ERR_PTR(-ENOMEM); - goto out; +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { + e = pipapo_get_avx2(m, data, genmask, tstamp); + local_bh_enable(); + return e; } +#endif + e = pipapo_get_slow(m, data, genmask, tstamp); + local_bh_enable(); + return e; +} - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); +/** + * nft_pipapo_lookup() - Dataplane frontend for main lookup function + * @net: Network namespace + * @set: nftables API set representation + * @key: pointer to nft registers containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy. + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but the + * newly matching entries are unreachable. + * + * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. + * + * Return: nftables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const struct nft_pipapo_elem *e; - nft_pipapo_for_each_field(f, i, m) { - bool last = i == m->field_count - 1; - int b; + m = rcu_dereference(priv->match); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); - /* For each bit group: select lookup table bucket depending on - * packet bytes value, then AND bucket value - */ - if (f->bb == 8) - pipapo_and_field_buckets_8bit(f, res_map, data); - else if (f->bb == 4) - pipapo_and_field_buckets_4bit(f, res_map, data); - else - BUG(); + return e ? &e->ext : NULL; +} - data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + * + * This function is called from the control plane path under + * RCU read lock. + * + * Return: set element private pointer or ERR_PTR(-ENOENT).
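+ *
+ * [Editor's note: contrasting the two entry points in this file, both
+ * of which funnel into the same matcher:
+ *
+ *	nft_pipapo_lookup(): pipapo_get_slow(m, key, 0, get_jiffies_64())
+ *		datapath, genmask 0, active copy
+ *	nft_pipapo_get():    pipapo_get(m, key, nft_genmask_cur(net), ...)
+ *		control plane, current generation mask
+ *
+ * see the nft_pipapo_lookup() comment above for why the datapath
+ * deliberately passes genmask 0.]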
+ */ +static struct nft_elem_priv * +nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = rcu_dereference(priv->match); + struct nft_pipapo_elem *e; - /* Now populate the bitmap for the next field, unless this is - * the last field, in which case return the matched 'ext' - * pointer if any. - * - * Now res_map contains the matching bitmap, and fill_map is the - * bitmap for the next field. - */ -next_match: - b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, - last); - if (b < 0) - goto out; + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_cur(net), get_jiffies_64()); + if (!e) + return ERR_PTR(-ENOENT); - if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && - !nft_set_elem_active(&f->mt[b].e->ext, genmask))) - goto next_match; + return &e->priv; +} - ret = f->mt[b].e; - goto out; - } +/** + * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize + * @f: Field containing mapping table + * @old_rules: Amount of existing mapped rules + * @rules: Amount of new rules to map + * + * Return: 0 on success, negative error code on failure. + */ +static int pipapo_realloc_mt(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) +{ + union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; + const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); + unsigned int rules_alloc = rules; - data += NFT_PIPAPO_GROUPS_PADDING(f); + might_sleep(); - /* Swap bitmap indices: fill_map will be the initial bitmap for - * the next field (i.e. the new res_map), and res_map is - * guaranteed to be all-zeroes at this point, ready to be filled - * according to the next mapping table. - */ - swap(res_map, fill_map); + if (unlikely(rules == 0)) + goto out_free; + + /* growing and enough space left, no action needed */ + if (rules > old_rules && f->rules_alloc > rules) + return 0; + + /* downsize and extra slack has not grown too large */ + if (rules < old_rules) { + unsigned int remove = f->rules_alloc - rules; + + if (remove < (2u * extra)) + return 0; } -out: - kfree(fill_map); - kfree(res_map); - return ret; + /* If set needs more than one page of memory for rules then + * allocate another extra page to avoid frequent reallocation. 
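+ *
+ * [Editor's note: with extra = PAGE_SIZE / sizeof(*new_mt), the slack
+ * policy around this point is, in sketch form:
+ *
+ *	if (rules <= extra)
+ *		rules_alloc = rules;		small set, no slack
+ *	else
+ *		rules_alloc = rules + extra;	one page of slack,
+ *						overflow-checked below
+ *
+ * and a shrinking table keeps its slack until at least two pages worth
+ * of entries would otherwise sit unused (see the downsize branch
+ * above).]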
+ */ + if (rules > extra && + check_add_overflow(rules, extra, &rules_alloc)) + return -EOVERFLOW; + + if (rules_alloc > (INT_MAX / sizeof(*new_mt))) + return -ENOMEM; + + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); + if (!new_mt) + return -ENOMEM; + + if (old_mt) + memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); + + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } +out_free: + f->rules_alloc = rules_alloc; + f->mt = new_mt; + + kvfree(old_mt); + + return 0; } + /** - * nft_pipapo_get() - Get matching element reference given key data - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * @flags: Unused + * lt_calculate_size() - Get storage size for lookup table with overflow check + * @groups: Amount of bit groups + * @bb: Number of bits grouped together in lookup table buckets + * @bsize: Size of each bucket in lookup table, in longs + * + * Return: allocation size including alignment overhead, negative on overflow */ -static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, + unsigned int bsize) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); + + if (check_mul_overflow(ret, bsize, &ret)) + return -1; + if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) + return -1; + if (ret > INT_MAX) + return -1; + + return ret; } /** @@ -617,12 +708,16 @@ static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, * * Return: 0 on success, -ENOMEM on allocation failure. 
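 *
 * [Editor's note: the lookup table size requested below comes from
 * lt_calculate_size() above, which checks for overflow at every step:
 *
 *	size = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long) * bsize
 *	     + NFT_PIPAPO_ALIGN_HEADROOM;	and size <= INT_MAX
 *
 * so the kvzalloc() calls can never be handed a wrapped-around
 * allocation size.]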
*/ -static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +static int pipapo_resize(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) { long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; - union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; - size_t new_bucket_size, copy; - int group, bucket; + unsigned int new_bucket_size, copy; + int group, bucket, err; + ssize_t lt_size; + + if (rules >= NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); #ifdef NFT_PIPAPO_ALIGN @@ -638,10 +733,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * - new_bucket_size * sizeof(*new_lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); + if (lt_size < 0) + return -ENOMEM; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; @@ -662,27 +758,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) } mt: - new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); - if (!new_mt) { + err = pipapo_realloc_mt(f, old_rules, rules); + if (err) { kvfree(new_lt); - return -ENOMEM; - } - - memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); - if (rules > old_rules) { - memset(new_mt + old_rules, 0, - (rules - old_rules) * sizeof(*new_mt)); + return err; } if (new_lt) { f->bsize = new_bucket_size; - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; kvfree(old_lt); } - f->mt = new_mt; - kvfree(old_mt); - return 0; } @@ -833,9 +920,9 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize, */ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { + unsigned int groups, bb; unsigned long *new_lt; - int groups, bb; - size_t lt_size; + ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); @@ -845,15 +932,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. 
@@ -864,7 +953,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -884,7 +973,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) f->groups = groups; f->bb = bb; kvfree(f->lt); - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; } /** @@ -901,12 +990,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret, bit_offset = 0; + unsigned int rule = f->rules, group, ret, bit_offset = 0; - ret = pipapo_resize(f, f->rules - 1, f->rules); + ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; + f->rules++; + for (group = 0; group < f->groups; group++) { int i, v; u8 mask; @@ -1051,7 +1142,9 @@ static int pipapo_expand(struct nft_pipapo_field *f, step++; if (step >= len) { if (!masks) { - pipapo_insert(f, base, 0); + err = pipapo_insert(f, base, 0); + if (err < 0) + return err; masks = 1; } goto out; @@ -1097,6 +1190,20 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** + * pipapo_free_scratch() - Free per-CPU map at original address + * @m: Matching data + * @cpu: CPU number + */ +static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) +{ + struct nft_pipapo_scratch *s; + + s = *per_cpu_ptr(m->scratch, cpu); + + kvfree(s); +} + +/** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions * @bsize_max: Maximum bucket size, scratch maps cover two buckets @@ -1109,14 +1216,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, int i; for_each_possible_cpu(i) { - unsigned long *scratch; -#ifdef NFT_PIPAPO_ALIGN - unsigned long *scratch_aligned; -#endif + struct nft_pipapo_scratch *scratch; - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL, cpu_to_node(i)); + scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have @@ -1128,50 +1232,82 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return -ENOMEM; } - kfree(*per_cpu_ptr(clone->scratch, i)); - + pipapo_free_scratch(clone, i); + local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; - -#ifdef NFT_PIPAPO_ALIGN - scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); - *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; -#endif } return 0; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); + +/** + * pipapo_maybe_clone() - Build clone for pending data changes, if not existing + * @set: nftables API set representation + * + * Return: newly created or existing clone, if any. 
NULL on allocation failure + */ +static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + + if (priv->clone) + return priv->clone; + + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = pipapo_clone(m); + + return priv->clone; +} + /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data - * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * @elem_priv: Filled with pointer to &struct nft_elem_priv in inserted element * * Return: 0 on success, error pointer on failure. */ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext2) + struct nft_elem_priv **elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo_elem *e = elem->priv, *dup; - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); + struct nft_pipapo_elem *e, *dup; + u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; + if (!m) + return -ENOMEM; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; - dup = pipapo_get(net, set, start, genmask); - if (!IS_ERR(dup)) { + dup = pipapo_get(m, start, genmask, tstamp); + if (dup) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1183,30 +1319,31 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { - *ext2 = &dup->ext; + *elem_priv = &dup->priv; return -EEXIST; } return -ENOTEMPTY; } - if (PTR_ERR(dup) == -ENOENT) { - /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net)); - } - - if (PTR_ERR(dup) != -ENOENT) { - if (IS_ERR(dup)) - return PTR_ERR(dup); - *ext2 = &dup->ext; + /* Look for partially overlapping entries */ + dup = pipapo_get(m, end, nft_genmask_next(net), tstamp); + if (dup) { + *elem_priv = &dup->priv; return -ENOTEMPTY; } /* Validate */ start_p = start; end_p = end; + + /* some helpers return -1, or >= 0 for a valid rule position, + * so we cannot support more than INT_MAX rules at this time.
+ */ + BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); + nft_pipapo_for_each_field(f, i, m) { - if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + if (f->rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; if (memcmp(start_p, end_p, @@ -1218,8 +1355,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Insert */ - priv->dirty = true; - bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { @@ -1234,6 +1369,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else ret = pipapo_expand(f, start, end, f->groups * f->bb); + if (ret < 0) + return ret; + if (f->bsize > bsize_max) bsize_max = f->bsize; @@ -1255,7 +1393,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, put_cpu_ptr(m->scratch); } - *ext2 = &e->ext; + e = nft_elem_priv_cast(elem->priv); + *elem_priv = &e->priv; pipapo_map(m, rulemap, e); @@ -1266,7 +1405,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * - * Return: copy of matching data passed as 'old', error pointer on failure + * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { @@ -1274,10 +1413,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, - GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) - return ERR_PTR(-ENOMEM); + return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; @@ -1286,11 +1424,6 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; -#ifdef NFT_PIPAPO_ALIGN - new->scratch_aligned = alloc_percpu(*new->scratch_aligned); - if (!new->scratch_aligned) - goto out_scratch; -#endif for_each_possible_cpu(i) *per_cpu_ptr(new->scratch, i) = NULL; @@ -1304,28 +1437,41 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; + ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * - src->bsize * sizeof(*dst->lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); + if (lt_size < 0) + goto out_lt; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; - NFT_PIPAPO_LT_ASSIGN(dst, new_lt); + dst->lt = new_lt; memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * src->groups * NFT_PIPAPO_BUCKETS(src->bb)); - dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); - if (!dst->mt) - goto out_mt; + if (src->rules > 0) { + if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) + goto out_mt; + + dst->mt = kvmalloc_array(src->rules_alloc, + sizeof(*src->mt), + GFP_KERNEL_ACCOUNT); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + } else { + dst->mt = NULL; + dst->rules_alloc = 0; + } - memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); src++; dst++; } @@ -1342,15 +1488,12 @@ out_lt: } out_scratch_realloc: for_each_possible_cpu(i) - kfree(*per_cpu_ptr(new->scratch, i)); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(new->scratch_aligned); -#endif + 
pipapo_free_scratch(new, i); out_scratch: free_percpu(new->scratch); kfree(new); - return ERR_PTR(-ENOMEM); + return NULL; } /** @@ -1382,10 +1525,10 @@ out_scratch: * * Return: Number of rules that originated from the same entry as @first. */ -static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) { struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ - int r; + unsigned int r; for (r = first; r < f->rules; r++) { if (r != first && e != f->mt[r].e) @@ -1438,8 +1581,9 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) * 0 1 2 * element pointers: 0x42 0x42 0x44 */ -static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, - int start, int n, int to_offset, bool is_last) +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, + unsigned int start, unsigned int n, + unsigned int to_offset, bool is_last) { int i; @@ -1529,21 +1673,35 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) + +{ + nft_setelem_data_deactivate(net, set, &e->priv); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); - int rules_f0, first_rule = 0; + struct net *net = read_pnet(&set->net); + unsigned int rules_f0, first_rule = 0; + u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_elem *e; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; - struct nft_pipapo_field *f; - int i, start, rules_fx; + const struct nft_pipapo_field *f; + unsigned int i, start, rules_fx; start = first_rule; rules_fx = rules_f0; @@ -1562,13 +1720,18 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { - priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + /* synchronous gc never fails, there is no need to set the + * NFT_SET_ELEM_DEAD_BIT. + */ + if (__nft_set_elem_expired(&e->ext, tstamp)) { + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked.
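With the old gc batch API gone, pipapo_gc() above queues each expired element on a synchronous nft_trans_gc transaction and only hands the batch over once the walk is finished, rather than destroying entries inline while still iterating the structure. A toy userspace sketch of that collect-then-release shape, with all types and helpers invented for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct elem {
	bool expired;
	int id;
};

struct gc_batch {	/* stand-in for struct nft_trans_gc */
	struct elem *victims[8];
	unsigned int count;
};

/* queue only: nothing is freed while the walk still dereferences entries */
static void gc_queue(struct gc_batch *gc, struct elem *e)
{
	gc->victims[gc->count++] = e;
}

/* release after the walk; the kernel defers this further through the
 * transaction log and RCU (nft_trans_gc_queue_sync_done()).
 */
static void gc_done(struct gc_batch *gc)
{
	unsigned int i;

	for (i = 0; i < gc->count; i++) {
		printf("freeing elem %d\n", gc->victims[i]->id);
		free(gc->victims[i]);
	}
	gc->count = 0;
}

int main(void)
{
	struct gc_batch gc = { .count = 0 };
	struct elem *set[3];
	int i;

	for (i = 0; i < 3; i++) {
		set[i] = malloc(sizeof(*set[i]));
		set[i]->id = i;
		set[i]->expired = (i == 1);
	}

	for (i = 0; i < 3; i++) {	/* the walk: deactivate and queue */
		if (set[i]->expired)
			gc_queue(&gc, set[i]);
	}
	gc_done(&gc);			/* release once the walk is over */

	for (i = 0; i < 3; i++) {
		if (!set[i]->expired)
			free(set[i]);
	}
	return 0;
}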
@@ -1578,11 +1741,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - e = nft_set_catchall_gc(set); - if (e) - nft_set_elem_destroy(set, e, true); - - priv->last_gc = jiffies; + gc = nft_trans_gc_catchall_sync(gc); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1600,32 +1763,33 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) - kfree(*per_cpu_ptr(m->scratch, i)); + pipapo_free_scratch(m, i); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); } /** - * pipapo_commit() - Replace lookup data with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1635,108 +1799,91 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *old; + struct nft_pipapo_match *old; - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); - - if (!priv->dirty) + if (!priv->clone) return; - new_clone = pipapo_clone(priv->clone); - if (IS_ERR(new_clone)) - return; + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); - priv->dirty = false; + old = rcu_replace_pointer(priv->match, priv->clone, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = NULL; - old = rcu_access_pointer(priv->match); - rcu_assign_pointer(priv->match, priv->clone); if (old) call_rcu(&old->rcu, pipapo_reclaim_match); +} + +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); - priv->clone = new_clone; + if (!priv->clone) + return; + pipapo_free_match(priv->clone); + priv->clone = NULL; } /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. 
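The new nft_pipapo_commit()/nft_pipapo_abort() pair above completes the copy-on-write scheme: mutations land in priv->clone, commit publishes the clone and defers freeing the old copy, and abort simply throws the clone away. A toy single-threaded sketch of that lifecycle with invented types; the kernel publishes through rcu_replace_pointer() under the commit mutex and frees through call_rcu(), as shown in the hunks above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct match {
	int nelems;
};

struct set_priv {
	struct match *active;	/* what the datapath reads (RCU there) */
	struct match *clone;	/* pending insertions and deletions */
};

static struct match *maybe_clone(struct set_priv *p)
{
	if (p->clone)		/* reuse across one transaction */
		return p->clone;

	p->clone = malloc(sizeof(*p->clone));
	if (p->clone)
		memcpy(p->clone, p->active, sizeof(*p->clone));
	return p->clone;
}

static void commit(struct set_priv *p)
{
	struct match *old;

	if (!p->clone)		/* nothing changed in this transaction */
		return;

	old = p->active;
	p->active = p->clone;	/* kernel: rcu_replace_pointer() */
	p->clone = NULL;
	free(old);		/* kernel: call_rcu() deferred free */
}

static void abort_changes(struct set_priv *p)
{
	free(p->clone);		/* discard pending work, keep active copy */
	p->clone = NULL;
}

int main(void)
{
	struct set_priv p = { .active = calloc(1, sizeof(struct match)) };
	struct match *w;

	w = maybe_clone(&p);
	if (w)
		w->nelems++;
	commit(&p);
	printf("%d\n", p.active->nelems);	/* 1 */

	w = maybe_clone(&p);
	if (w)
		w->nelems++;
	abort_changes(&p);
	printf("%d\n", p.active->nelems);	/* still 1 */

	free(p.active);
	return 0;
}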
*/ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); + nft_clear(net, &e->ext); } /** - * pipapo_deactivate() - Check that element is in set, mark as inactive + * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation - * @data: Input key data - * @ext: nftables API extension pointer, used to check for end element - * - * This is a convenience function that can be called from both - * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same - * operation. + * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ -static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, - const u8 *data, const struct nft_set_ext *ext) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net)); - if (IS_ERR(e)) + /* removal must occur on priv->clone, if we are low on memory + * we have no choice and must fail the removal request. + */ + if (!m) return NULL; - nft_set_elem_change_active(net, set, &e->ext); - - return e; -} + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net)); + if (!e) + return NULL; -/** - * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * - * Return: deactivated element if found, NULL otherwise. - */ -static void *nft_pipapo_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + nft_set_elem_change_active(net, set, &e->ext); - return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); + return &e->priv; } /** - * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * This is functionally the same as nft_pipapo_deactivate(), with a slightly * different interface, and it's also called once for each element in a set @@ -1750,13 +1897,12 @@ static void *nft_pipapo_deactivate(const struct net *net, * * Return: true if element was found and deactivated. 
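Activation and deactivation above are now pure generation-mask operations: nft_clear() makes the element visible, nft_set_elem_change_active() flips its next-generation bit on the clone, and the datapath sees nothing until commit advances the generation cursor. A simplified model of the two-generation masks, assuming the usual convention that a set bit means "inactive in that generation" (the real helpers are nft_genmask_cur()/nft_genmask_next()):

#include <stdbool.h>
#include <stdio.h>

static unsigned int gencursor;	/* 0 or 1 */

#define genmask_cur()	(1u << gencursor)
#define genmask_next()	(1u << (gencursor ^ 1u))

struct elem {
	unsigned int genmask;	/* set bit = inactive in that generation */
};

static bool elem_active(const struct elem *e, unsigned int genmask)
{
	return !(e->genmask & genmask);
}

int main(void)
{
	struct elem e = { .genmask = 0 };	/* active everywhere */

	e.genmask ^= genmask_next();		/* deactivate in next gen */
	printf("cur %d next %d\n",
	       elem_active(&e, genmask_cur()),	 /* 1: lookups unaffected */
	       elem_active(&e, genmask_next())); /* 0: gone after commit */

	gencursor ^= 1u;			/* commit flips the cursor */
	printf("cur %d\n", elem_active(&e, genmask_cur()));	/* 0 */
	return 0;
}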
*/ -static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, - void *elem) +static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e = elem; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), - &e->ext); + nft_set_elem_change_active(net, set, &e->ext); } /** @@ -1879,7 +2025,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * nft_pipapo_remove() - Remove element given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * Similarly to nft_pipapo_activate(), this is used as commit operation by the * API, but it's called once per element in the pending transaction, so we can't @@ -1887,20 +2033,17 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * the matched element here, if any, and commit the updated matching data. */ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; - struct nft_pipapo_elem *e = elem->priv; - int rules_f0, first_rule = 0; + unsigned int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; const u8 *data; + e = nft_elem_priv_cast(elem_priv); data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; @@ -1908,12 +2051,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, int i, start, rules_fx; match_start = data; - match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + else + match_end = data; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -1926,49 +2075,42 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - pipapo_commit(set); - return; + if (last && f->mt[rulemap[i].to].e == e) { + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** - * nft_pipapo_walk() - Walk over elements + * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation + * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first - * field. + * field. @m is protected either by RCU read lock or by transaction mutex. 
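The walk described above relies on all rules that map to one element sitting next to each other in the last field's mapping table, so each element is visited exactly once, on the final rule of its run. A standalone toy with invented types that mirrors the skip test in nft_pipapo_do_walk():

#include <stdio.h>

struct elem { int id; };

static void walk(struct elem **mt, unsigned int rules,
		 void (*fn)(struct elem *))
{
	unsigned int r;

	for (r = 0; r < rules; r++) {
		/* same element continues at r + 1: skip this rule */
		if (r < rules - 1 && mt[r + 1] == mt[r])
			continue;
		fn(mt[r]);
	}
}

static void print_elem(struct elem *e)
{
	printf("elem %d\n", e->id);
}

int main(void)
{
	struct elem a = { 1 }, b = { 2 };
	struct elem *mt[] = { &a, &a, &a, &b };	/* a owns rules 0..2 */

	walk(mt, 4, print_elem);	/* prints elem 1, then elem 2 */
	return 0;
}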
*/ -static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_pipapo_match *m, + struct nft_set_iter *iter) { - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; - int i, r; - - rcu_read_lock(); - m = rcu_dereference(priv->match); - - if (unlikely(!m)) - goto out; + const struct nft_pipapo_field *f; + unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; - struct nft_set_elem elem; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; @@ -1977,21 +2119,52 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, goto cont; e = f->mt[r].e; - if (nft_set_elem_expired(&e->ext)) - goto cont; - elem.priv = e; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) - goto out; + return; cont: iter->count++; } +} -out: - rcu_read_unlock(); +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * Test if destructive action is needed or not, clone active backend if needed + * and call the real function to work on the data. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + + switch (iter->type) { + case NFT_ITER_UPDATE: + m = pipapo_maybe_clone(set); + if (!m) { + iter->err = -ENOMEM; + return; + } + + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_READ: + rcu_read_lock(); + m = rcu_dereference(priv->match); + nft_pipapo_do_walk(ctx, set, m, iter); + rcu_read_unlock(); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } /** @@ -2054,20 +2227,24 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo_field *f; int err, i, field_count; + BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); + field_count = desc->field_count ? : 1; + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); + if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, - GFP_KERNEL); + m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; m->field_count = field_count; m->bsize_max = 0; - m->scratch = alloc_percpu(unsigned long *); + m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; @@ -2075,20 +2252,14 @@ static int nft_pipapo_init(const struct nft_set *set, for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; -#ifdef NFT_PIPAPO_ALIGN - m->scratch_aligned = alloc_percpu(unsigned long *); - if (!m->scratch_aligned) { - err = -ENOMEM; - goto out_free; - } - for_each_possible_cpu(i) - *per_cpu_ptr(m->scratch_aligned, i) = NULL; -#endif - rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { - int len = desc->field_len[i] ? : set->klen; + unsigned int len = desc->field_len[i] ? 
: set->klen; + + /* f->groups is u8 */ + BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * + BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); f->bb = NFT_PIPAPO_GROUP_BITS_INIT; f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); @@ -2097,28 +2268,15 @@ static int nft_pipapo_init(const struct nft_set *set, f->bsize = 0; f->rules = 0; - NFT_PIPAPO_LT_ASSIGN(f, NULL); + f->rules_alloc = 0; + f->lt = NULL; f->mt = NULL; } - /* Create an initial clone of matching data for next insertion */ - priv->clone = pipapo_clone(m); - if (IS_ERR(priv->clone)) { - err = PTR_ERR(priv->clone); - goto out_free; - } - - priv->dirty = false; - rcu_assign_pointer(priv->match, m); return 0; -out_free: -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif - free_percpu(m->scratch); out_scratch: kfree(m); @@ -2127,14 +2285,16 @@ out_scratch: /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ -static void nft_set_pipapo_match_destroy(const struct nft_set *set, +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; - int i, r; + unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; @@ -2147,54 +2307,32 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set, e = f->mt[r].e; - nft_set_elem_destroy(set, e, true); + nf_tables_set_elem_destroy(ctx, set, &e->priv); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context * @set: nftables API set representation */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; - int cpu; m = rcu_dereference_protected(priv->match, true); - if (m) { - rcu_barrier(); - - nft_set_pipapo_match_destroy(set, m); - -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(m->scratch, cpu)); - free_percpu(m->scratch); - pipapo_free_fields(m); - kfree(m); - priv->match = NULL; - } if (priv->clone) { - m = priv->clone; - - if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); - -#ifdef NFT_PIPAPO_ALIGN - free_percpu(priv->clone->scratch_aligned); -#endif - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); - free_percpu(priv->clone->scratch); - - pipapo_free_fields(priv->clone); - kfree(priv->clone); + nft_set_pipapo_match_destroy(ctx, set, priv->clone); + pipapo_free_match(priv->clone); priv->clone = NULL; + } else { + nft_set_pipapo_match_destroy(ctx, set, m); } + + pipapo_free_match(m); } /** @@ -2230,6 +2368,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2252,6 +2392,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 25a75591583e..eaab422aa56a 100644 --- a/net/netfilter/nft_set_pipapo.h +++ 
b/net/netfilter/nft_set_pipapo.h @@ -70,15 +70,9 @@ #define NFT_PIPAPO_ALIGN_HEADROOM \ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) #define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) -#define NFT_PIPAPO_LT_ASSIGN(field, x) \ - do { \ - (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \ - (field)->lt = (x); \ - } while (0) #else #define NFT_PIPAPO_ALIGN_HEADROOM 0 #define NFT_PIPAPO_LT_ALIGN(lt) (lt) -#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x)) #endif /* NFT_PIPAPO_ALIGN */ #define nft_pipapo_for_each_field(field, index, match) \ @@ -110,44 +104,50 @@ union nft_pipapo_map_bucket { /** * struct nft_pipapo_field - Lookup, mapping tables and related data for a field - * @groups: Amount of bit groups * @rules: Number of inserted rules * @bsize: Size of each bucket in lookup table, in longs + * @rules_alloc: Number of allocated rules, always >= rules + * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @lt: Lookup table: 'groups' rows of buckets - * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes * @mt: Mapping table: one bucket per rule */ struct nft_pipapo_field { - int groups; - unsigned long rules; - size_t bsize; - int bb; -#ifdef NFT_PIPAPO_ALIGN - unsigned long *lt_aligned; -#endif + unsigned int rules; + unsigned int bsize; + unsigned int rules_alloc; + u8 groups; + u8 bb; unsigned long *lt; union nft_pipapo_map_bucket *mt; }; /** + * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @bh_lock: PREEMPT_RT local spinlock + * @map_index: Current working bitmap index, toggled between field matches + * @__map: store partial matching results during lookup + */ +struct nft_pipapo_scratch { + local_lock_t bh_lock; + u8 map_index; + unsigned long __map[]; +}; + +/** * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set - * @scratch: Preallocated per-CPU maps for partial matching results - * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes + * @field_count: Amount of fields in set * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits + * @scratch: Preallocated per-CPU maps for partial matching results + * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { - int field_count; -#ifdef NFT_PIPAPO_ALIGN - unsigned long * __percpu *scratch_aligned; -#endif - unsigned long * __percpu *scratch; - size_t bsize_max; + u8 field_count; + unsigned int bsize_max; + struct nft_pipapo_scratch * __percpu *scratch; struct rcu_head rcu; - struct nft_pipapo_field f[]; + struct nft_pipapo_field f[] __counted_by(field_count); }; /** @@ -155,14 +155,12 @@ struct nft_pipapo_match { * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; - bool dirty; unsigned long last_gc; }; @@ -170,14 +168,17 @@ struct nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element + * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { - struct nft_set_ext ext; + struct nft_elem_priv priv; + struct nft_set_ext ext; }; -int 
pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only); +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, + const union nft_pipapo_map_bucket *mt, bool match_only); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets @@ -185,7 +186,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { @@ -213,7 +214,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { @@ -277,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) return size; } +/** + * pipapo_resmap_init() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Initialize all bits covered by the first field to one, so that after + * the first step, only the matching bits of the first bit group remain. + * + * If other fields have a large bitmap, set remainder of res_map to 0. + */ +static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + for (i = 0; i < f->bsize; i++) + res_map[i] = ULONG_MAX; + + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} #endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 52e0d026d30a..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load @@ -71,9 +71,6 @@ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); - /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * @@ -215,8 +212,9 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map, * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; @@ -277,8 +275,9 @@ nothing: * word index to be checked next (i.e. first filled word). 
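The bucket-intersection helpers above are the heart of the algorithm: each 4-bit (or 8-bit) chunk of the key selects one bucket bitmap from its group's row of the lookup table, and ANDing the selected buckets leaves set only the rules compatible with every chunk. A toy one-field, two-group version with invented rule values:

#include <stdio.h>

#define GROUPS	2
#define BUCKETS	16

int main(void)
{
	/* rule 0 matches key 0xab, rule 1 matches key 0xcb */
	unsigned long lt[GROUPS][BUCKETS] = { { 0 } };
	unsigned char key = 0xab;
	unsigned long res;

	lt[0][0xa] |= 1ul << 0;			/* high nibble 0xa: rule 0 */
	lt[0][0xc] |= 1ul << 1;			/* high nibble 0xc: rule 1 */
	lt[1][0xb] |= (1ul << 0) | (1ul << 1);	/* low nibble 0xb: both */

	res = lt[0][key >> 4];		/* first group initialises the map */
	res &= lt[1][key & 0xf];	/* every further group ANDs into it */

	printf("matching rules bitmap: 0x%lx\n", res);	/* 0x1: rule 0 */
	return 0;
}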
*/ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; @@ -353,8 +352,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -448,8 +448,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -537,8 +538,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -672,8 +674,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -729,8 +732,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -793,8 +797,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -868,8 +873,9 @@ nothing: * word index to be checked next (i.e. first filled word). 
*/ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -953,8 +959,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -987,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); @@ -1029,6 +1037,7 @@ nothing: /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables @@ -1044,15 +1053,17 @@ nothing: * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ -static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) +static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, + unsigned long *map, unsigned long *fill, + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { unsigned long bsize = f->bsize; int i, ret = -1, b; if (first) - memset(map, 0xff, bsize * sizeof(*map)); + pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) @@ -1088,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + if (!boot_cpu_has(X86_FEATURE_AVX2)) return false; est->size = pipapo_estimate_size(desc); @@ -1103,65 +1114,78 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. 
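Because the per-CPU scratch maps are sized for the widest field (bsize_max), only the words belonging to the first field start as all-ones, and any excess words must be zeroed so stale bits from a previous lookup cannot leak into the match; the AVX2 variant skips the all-ones part since its first iteration overwrites those words anyway. A minimal sketch of the generic initialisation:

#include <limits.h>
#include <stdio.h>

static void resmap_init(unsigned long *res, unsigned int first_bsize,
			unsigned int bsize_max)
{
	unsigned int i;

	for (i = 0; i < first_bsize; i++)	/* first field: all-ones */
		res[i] = ULONG_MAX;
	for (i = first_bsize; i < bsize_max; i++)	/* remainder: zero */
		res[i] = 0ul;
}

int main(void)
{
	unsigned long res[4];

	resmap_init(res, 2, 4);
	printf("%lx %lx %lx %lx\n", res[0], res[1], res[2], res[3]);
	return 0;
}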
+ */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** + * pipapo_get_avx2() - Lookup function for AVX2 implementation + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * - * Return: true on match, false otherwise. + * The caller must check that the FPU is usable. + * This function must be called with BH disabled. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res, *fill, *scratch; - u8 genmask = nft_genmask_cur(net); - const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; + struct nft_pipapo_scratch *scratch; + const struct nft_pipapo_field *f; + unsigned long *res, *fill, *map; bool map_index; - int i, ret = 0; + int i; - if (unlikely(!irq_fpu_usable())) - return nft_pipapo_lookup(net, set, key, ext); + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + return NULL; - m = rcu_dereference(priv->match); + __local_lock_nested_bh(&scratch->bh_lock); + map_index = scratch->map_index; + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res = map + (map_index ? m->bsize_max : 0); + fill = map + (map_index ? 0 : m->bsize_max); + + pipapo_resmap_init_avx2(m, res); - /* This also protects access to all data related to scratch maps. - * - * Note that we don't need a valid MXCSR state for any of the + /* Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch_aligned); - if (unlikely(!scratch)) { - kernel_fpu_end(); - return false; - } - map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); - - res = scratch + (map_index ? m->bsize_max : 0); - fill = scratch + (map_index ? 
0 : m->bsize_max); - - /* Starting map doesn't need to be set for this implementation */ - nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; + int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ - ret, rp, \ + ret, data, \ first, last)) if (likely(f->bb == 8)) { @@ -1176,8 +1200,8 @@ next_match: } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, - ret, rp, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, + ret, data, first, last); } } else { @@ -1192,8 +1216,8 @@ next_match: } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, - ret, rp, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, + ret, data, first, last); } } @@ -1201,28 +1225,78 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP - if (ret < 0) - goto out; +next_match: + if (ret < 0) { + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; + } if (last) { - *ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) { - ret = 0; + struct nft_pipapo_elem *e; + + e = f->mt[ret].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; } - goto out; + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return e; } + map_index = !map_index; swap(res, fill); - rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } -out: - if (i % 2) - raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index); kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; +} + +/** + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + * @key: nftables API element representation containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy using + * the AVX2 routines if the FPU is usable or fall back to the generic + * implementation of the algorithm otherwise. + * + * Return: nftables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const u8 *rp = (const u8 *)key; + const struct nft_pipapo_elem *e; + + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + const struct nft_set_ext *ext; + + ext = nft_pipapo_lookup(net, set, key); + + local_bh_enable(); + return ext; + } + + m = rcu_dereference(priv->match); + + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); + local_bh_enable(); - return ret >= 0; + return e ? 
&e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index dbb6aaca8a7a..c2999b63da3f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,12 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) +struct nft_pipapo_match; bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ #endif /* _NFT_SET_PIPAPO_AVX2_H */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 19ea4d3c3553..ca594161b840 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -19,10 +19,11 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; seqcount_rwlock_t count; - struct delayed_work gc_work; + unsigned long last_gc; }; struct nft_rbtree_elem { + struct nft_elem_priv priv; struct rb_node node; struct nft_set_ext ext; }; @@ -46,9 +47,14 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } -static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext, - unsigned int seq) +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext); +} + +static const struct nft_set_ext * +__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; @@ -59,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) - return false; + return NULL; rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -71,7 +77,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -80,51 +88,47 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) - return false; + if (nft_rbtree_elem_expired(rbe)) + return NULL; if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) - return false; + return NULL; parent = rcu_dereference_raw(parent->rb_left); interval = NULL; continue; } - *ext = &rbe->ext; - return true; + return &rbe->ext; } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && - nft_rbtree_interval_start(interval)) { - *ext = &interval->ext; - return true; - } + nft_rbtree_interval_start(interval)) + return &interval->ext; - return false; + return NULL; } INDIRECT_CALLABLE_SCOPE -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = 
read_seqcount_begin(&priv->count); - bool ret; + const struct nft_set_ext *ext; - ret = __nft_rbtree_lookup(net, set, key, ext, seq); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return ret; + ext = __nft_rbtree_lookup(net, set, key, seq); + if (ext || !read_seqcount_retry(&priv->count, seq)) + return ext; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_lookup(net, set, key, ext, seq); + ext = __nft_rbtree_lookup(net, set, key, seq); read_unlock_bh(&priv->lock); - return ret; + return ext; } static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, @@ -191,8 +195,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, return false; } -static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); @@ -203,48 +208,82 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); if (ret || !read_seqcount_retry(&priv->count, seq)) - return rbe; + return &rbe->priv; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (!ret) - rbe = ERR_PTR(-ENOENT); read_unlock_bh(&priv->lock); - return rbe; + if (!ret) + return ERR_PTR(-ENOENT); + + return &rbe->priv; } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) +static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + lockdep_assert_held_write(&priv->lock); + nft_setelem_data_deactivate(net, set, &rbe->priv); + rb_erase(&rbe->node, &priv->root); +} + +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) - return -ENOMEM; + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) + return ERR_PTR(-ENOMEM); - /* search for expired end interval coming before this element. */ - do { + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. 
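As the comment above notes, end-of-interval nodes carry no timeout extension of their own, so when GC expires an interval it must also walk back to the paired end node and drop both, or the tree would be left with a dangling end. A toy flattened-tree illustration, with invented types standing in for the rbtree nodes:

#include <stdbool.h>
#include <stdio.h>

struct node {
	bool is_end;
	bool expired;	/* meaningful for start nodes only */
	int  id;
};

int main(void)
{
	/* reversed-order tree flattened: end, start, end, start */
	struct node tree[] = {
		{ true,  false, 10 }, { false, true,  11 },
		{ true,  false, 20 }, { false, false, 21 },
	};
	int i;

	for (i = 0; i < 4; i++) {
		if (tree[i].is_end || !tree[i].expired)
			continue;
		/* the real code finds the end via rb_prev() */
		printf("gc: drop start %d and its end %d\n",
		       tree[i].id, tree[i - 1].id);
	}
	return 0;
}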
+ */ + while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - if (nft_rbtree_interval_end(rbe_prev)) + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); - } while (prev != NULL); + } - rb_erase(&rbe_prev->node, &priv->root); - rb_erase(&rbe->node, &priv->root); - atomic_sub(2, &set->nelems); + rbe_prev = NULL; + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence the warning + * splat in such a case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); - return 0; + nft_trans_gc_elem_add(gc, rbe_prev); + } + + nft_rbtree_gc_elem_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); + + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); + + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -265,13 +304,15 @@ static bool nft_rbtree_update_first(const struct nft_set *set, static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; - struct rb_node *node, *parent, **p, *first = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + u64 tstamp = nft_net_tstamp(net); + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. This is the @@ -307,17 +348,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * Values stored in the tree are in reversed order, starting from * highest to lowest value. */ - for (node = first; node != NULL; node = rb_next(node)) { + for (node = first; node != NULL; node = next) { + next = rb_next(node); + rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) continue; - /* perform garbage collection to avoid bogus overlap reports. */ - if (nft_set_elem_expired(&rbe->ext)) { - err = nft_rbtree_gc_elem(set, priv, rbe); - if (err < 0) - return err; + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. 
+ */ + if (__nft_set_elem_expired(&rbe->ext, tstamp) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -371,7 +422,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { - *ext = &rbe_ge->ext; + *elem_priv = &rbe_ge->priv; return -EEXIST; } @@ -380,7 +431,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { - *ext = &rbe_le->ext; + *elem_priv = &rbe_le->priv; return -EEXIST; } @@ -432,66 +483,74 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; int err; + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, elem_priv); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); + + return err; +} + +static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) +{ write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); + rb_erase(&rbe->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - - return err; } static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + nft_rbtree_erase(priv, rbe); } static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = elem->priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); + nft_clear(net, &rbe->ext); } -static bool nft_rbtree_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rbtree_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct net *net, - const struct nft_set *set, - const struct 
nft_set_elem *elem) +static struct nft_elem_priv * +nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; while (parent != NULL) { @@ -512,72 +571,92 @@ static void *nft_rbtree_deactivate(const struct net *net, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; + } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { + break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; } - nft_rbtree_flush(net, set, rbe); - return rbe; + nft_rbtree_flush(net, set, &rbe->priv); + return &rbe->priv; } } return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; - struct nft_set_elem elem; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; - elem.priv = rbe; - - iter->err = iter->fn(ctx, set, iter, &elem); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + iter->err = iter->fn(ctx, set, iter, &rbe->priv); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); } -static void nft_rbtree_gc(struct work_struct *work) +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; - struct nft_rbtree *priv; - struct rb_node *node; - struct nft_set *set; - struct net *net; - u8 genmask; + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + nft_setelem_data_deactivate(net, set, &rbe->priv); + nft_rbtree_erase(priv, rbe); +} + +static void nft_rbtree_gc(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); + struct rb_node *node, *next; + struct nft_trans_gc *gc; - priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { - rbe = rb_entry(node, struct nft_rbtree_elem, node); + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + 
return; - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + for (node = rb_first(&priv->root); node ; node = next) { + next = rb_next(node); + + rbe = rb_entry(node, struct nft_rbtree_elem, node); /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -587,51 +666,37 @@ static void nft_rbtree_gc(struct work_struct *work) rbe_end = rbe; continue; } - if (!nft_set_elem_expired(&rbe->ext)) + if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; - continue; - } - - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; - - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + goto try_later; + /* end element needs to be removed first, it has + * no timeout extension. + */ if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); + nft_rbtree_gc_remove(net, set, priv, rbe_end); + nft_trans_gc_elem_add(gc, rbe_end); rbe_end = NULL; } - node = rb_next(node); - if (!node) - break; - } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); - rbe = nft_set_catchall_gc(set); - if (rbe) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + goto try_later; + + nft_rbtree_gc_remove(net, set, priv, rbe); + nft_trans_gc_elem_add(gc, rbe); } - nft_set_gc_batch_complete(gcb); - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); +try_later: + + if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -646,30 +711,26 @@ static int nft_rbtree_init(const struct nft_set *set, { struct nft_rbtree *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); + rwlock_init(&priv->lock); seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); - if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); - return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } } @@ -691,6 +752,61 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_rbtree_commit(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + nft_rbtree_gc(set); +} + +static void nft_rbtree_gc_init(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +/* rbtree stores ranges as singleton 
elements, each range is composed of two + * elements ... + */ +static u32 nft_rbtree_ksize(u32 size) +{ + return size * 2; +} + +/* ... hide this detail to userspace. */ +static u32 nft_rbtree_usize(u32 size) +{ + if (!size) + return 0; + + return size / 2; +} + +static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + const void *key; + + node = rb_last(&priv->root); + if (!node) + return 0; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_rbtree_interval_end(rbe)) + return 0; + + key = nft_set_ext_key(&rbe->ext); + if (memchr(key, 1, set->klen)) + return 0; + + /* this is the all-zero no-match element. */ + return 1; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -704,8 +820,13 @@ const struct nft_set_type nft_set_rbtree_type = { .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, + .commit = nft_rbtree_commit, + .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, + .ksize = nft_rbtree_ksize, + .usize = nft_rbtree_usize, + .adjust_maxsize = nft_rbtree_adjust_maxsize, }, }; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 85f8df87efda..36affbb697c2 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,7 +9,8 @@ struct nft_socket { enum nft_socket_keys key:8; - u8 level; + u8 level; /* cgroupv2 level to extract */ + u8 level_user; /* cgroupv2 level provided by userspace */ u8 len; union { u8 dreg; @@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo memcpy(dest, &cgid, sizeof(u64)); return true; } + +/* process context only, uses current->nsproxy. 
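 *
 * [Illustrative aside, not part of the patch] The level supplied by
 * userspace is relative to the cgroup root of the caller's namespace,
 * while the socket's cgroup data is indexed by absolute level. The
 * function below resolves "/" in the current namespace and returns its
 * absolute level so that init() can translate, roughly:
 *
 *	priv->level = level_from_userspace + level_of("/");
 *
 * with both the addend and the sum bounded to 255 because the result
 * is stored in a u8.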
*/ +static noinline int nft_socket_cgroup_subtree_level(void) +{ + struct cgroup *cgrp = cgroup_get_from_path("/"); + int level; + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + level = cgrp->level; + + cgroup_put(cgrp); + + if (level > 255) + return -ERANGE; + + if (WARN_ON_ONCE(level < 0)) + return -EINVAL; + + return level; +} #endif static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) @@ -107,16 +130,16 @@ static void nft_socket_eval(const struct nft_expr *expr, break; case NFT_SOCKET_MARK: if (sk_fullsock(sk)) { - *dest = sk->sk_mark; + *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; case NFT_SOCKET_WILDCARD: if (!sk_fullsock(sk)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } nft_socket_wildcard(pkt, regs, sk, dest); break; @@ -124,7 +147,7 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFT_SOCKET_CGROUPV2: if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; #endif @@ -133,14 +156,15 @@ static void nft_socket_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +out_put_sk: if (sk != skb->sk) sock_gen_put(sk); } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { - [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, - [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_socket_init(const struct nft_ctx *ctx, @@ -173,9 +197,10 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_SOCK_CGROUP_DATA case NFT_SOCKET_CGROUPV2: { unsigned int level; + int err; if (!tb[NFTA_SOCKET_LEVEL]) return -EINVAL; @@ -184,6 +209,17 @@ static int nft_socket_init(const struct nft_ctx *ctx, if (level > 255) return -EOPNOTSUPP; + err = nft_socket_cgroup_subtree_level(); + if (err < 0) + return err; + + priv->level_user = level; + + level += err; + /* Implies a giant cgroup tree */ + if (level > 255) + return -EOPNOTSUPP; + priv->level = level; len = sizeof(u64); break; @@ -208,7 +244,7 @@ static int nft_socket_dump(struct sk_buff *skb, if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; if (priv->key == NFT_SOCKET_CGROUPV2 && - nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user))) return -1; return 0; } @@ -239,9 +275,13 @@ static bool nft_socket_reduce(struct nft_regs_track *track, } static int nft_socket_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 13da882669a4..5d3e51825985 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; @@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) break; #endif case NFPROTO_INET: - 
case NFPROTO_BRIDGE: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; @@ -250,9 +248,13 @@ static void nft_synproxy_eval(const struct nft_expr *expr, } static int nft_synproxy_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index ea83f661417e..50481280abd2 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -183,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { - [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 }, + [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; @@ -254,14 +254,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR], &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT], &priv->sreg_port, sizeof(u16)); if (err < 0) return err; @@ -313,9 +313,13 @@ static int nft_tproxy_dump(struct sk_buff *skb, } static int nft_tproxy_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index b059aa541798..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -66,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { - [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, - [NFTA_TUNNEL_MODE] = { .type = NLA_U32 }, + [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, @@ -174,8 +174,8 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; + IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; - __be16 flags; }; struct nft_tunnel_obj { @@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); - opts->flags = TUNNEL_VXLAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } @@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); - opts->flags = TUNNEL_ERSPAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } @@ -333,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, 
static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); - opts->flags = TUNNEL_GENEVE_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } @@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct nft_tunnel_opts *opts) { struct nlattr *nla; - __be16 type = 0; int err, rem; + u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); @@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) @@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; @@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); - info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); @@ -483,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) - info.key.tun_flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) - info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) - info.key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); - if (tb[NFTA_TUNNEL_KEY_TTL]) - info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]); - else - info.key.ttl = U8_MAX; + info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX); if (tb[NFTA_TUNNEL_KEY_OPTS]) { err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], @@ -503,13 +506,14 @@ static 
int nft_tunnel_obj_init(const struct nft_ctx *ctx, return err; } - md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, + GFP_KERNEL_ACCOUNT); if (!md) return -ENOMEM; memcpy(&md->u.tun_info, &info, sizeof(info)); #ifdef CONFIG_DST_CACHE - err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL); + err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); if (err < 0) { metadata_dst_free(md); return err; @@ -583,7 +587,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!nest) return -1; - if (opts->flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; @@ -591,7 +595,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto failure; @@ -613,15 +617,15 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_GENEVE_OPT) { + } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, @@ -630,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; @@ -658,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb, { u32 flags = 0; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; - if (!(info->key.tun_flags & TUNNEL_CSUM)) + if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; - if (info->key.tun_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) @@ -713,6 +717,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = { static struct nft_object_type nft_tunnel_obj_type __read_mostly = { .type = NFT_OBJECT_TUNNEL, + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_obj_ops, .maxattr = NFTA_TUNNEL_KEY_MAX, .policy = nft_tunnel_key_policy, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index c88fd078a9ae..3210cfc966ab 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -16,9 +16,9 @@ #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { - [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, - [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; @@ -112,7 +112,8 @@ static bool 
xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, @@ -229,12 +230,16 @@ static int nft_xfrm_get_dump(struct sk_buff *skb, return 0; } -static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 2182d361e273..008419db815a 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -179,39 +179,54 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, } EXPORT_SYMBOL_GPL(nf_route); -static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +/* Only get and check the lengths, not do any hop-by-hop stuff. */ +int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { -#ifdef CONFIG_INET - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, entry->state.sk, - skb, RTN_UNSPEC); + int len, off = sizeof(struct ipv6hdr); + unsigned char *nh; + + if (!pskb_may_pull(skb, off + 8)) + return -ENOMEM; + nh = (unsigned char *)(ipv6_hdr(skb) + 1); + len = (nh[1] + 1) << 3; + + if (!pskb_may_pull(skb, off + len)) + return -ENOMEM; + nh = skb_network_header(skb); + + off += 2; + len -= 2; + while (len > 0) { + int optlen; + + if (nh[off] == IPV6_TLV_PAD1) { + off++; + len--; + continue; + } + if (len < 2) + return -EBADMSG; + optlen = nh[off + 1] + 2; + if (optlen > len) + return -EBADMSG; + + if (nh[off] == IPV6_TLV_JUMBO) { + u32 pkt_len; + + if (nh[off + 1] != 4 || (off & 3) != 2) + return -EBADMSG; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + return -EBADMSG; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + return -EBADMSG; + *plen = pkt_len; + } + off += optlen; + len -= optlen; } -#endif - return 0; -} - -int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) -{ - const struct nf_ipv6_ops *v6ops; - int ret = 0; - switch (entry->state.pf) { - case AF_INET: - ret = nf_ip_reroute(skb, entry); - break; - case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - ret = v6ops->reroute(skb, entry); - break; - } - return ret; + return len ? 
-EBADMSG : 0; } +EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 470282cf3fae..90b7630421c4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -768,7 +768,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, m->u.user.match_size = msize; strscpy(name, match->name, sizeof(name)); module_put(match->me); - strncpy(m->u.user.name, name, sizeof(m->u.user.name)); + strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; @@ -1142,13 +1142,14 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, if (target->compat_from_user) target->compat_from_user(t->data, ct->data); else - memcpy(t->data, ct->data, tsize - sizeof(*ct)); + unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct), + /* UAPI 0-sized destination */); tsize += off; t->u.user.target_size = tsize; strscpy(name, target->name, sizeof(name)); module_put(target->me); - strncpy(t->u.user.name, name, sizeof(t->u.user.name)); + strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; @@ -1268,7 +1269,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) - if (strcmp(t->name, name) == 0) + if (strcmp(t->name, name) == 0 && owner == t->me) return t; module_put(owner); @@ -1316,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif -DEFINE_PER_CPU(seqcount_t, xt_recseq); -EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); - struct static_key xt_tee_enabled __read_mostly; EXPORT_SYMBOL_GPL(xt_tee_enabled); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; @@ -1513,6 +1515,7 @@ void *xt_unregister_table(struct xt_table *table) return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); +#endif #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) @@ -1896,6 +1899,7 @@ void xt_proto_fini(struct net *net, u_int8_t af) } EXPORT_SYMBOL_GPL(xt_proto_fini); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY /** * xt_percpu_counter_alloc - allocate x_tables rule counter * @@ -1950,6 +1954,7 @@ void xt_percpu_counter_free(struct xt_counters *counters) free_percpu((void __percpu *)pcnt); } EXPORT_SYMBOL_GPL(xt_percpu_counter_free); +#endif static int __net_init xt_net_init(struct net *net) { @@ -1982,8 +1987,10 @@ static int __init xt_init(void) unsigned int i; int rv; - for_each_possible_cpu(i) { - seqcount_init(&per_cpu(xt_recseq, i)); + if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) { + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } } xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); @@ -2014,4 +2021,3 @@ static void __exit xt_fini(void) module_init(xt_init); module_exit(xt_fini); - diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index c8a639f56168..9d99f5a3d176 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -63,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) return 0; } -static struct xt_target checksum_tg_reg __read_mostly = { - .name = "CHECKSUM", - .family = NFPROTO_UNSPEC, - .target = checksum_tg, - .targetsize = sizeof(struct xt_CHECKSUM_info), - .table = "mangle", - .checkentry = checksum_tg_check, - .me = THIS_MODULE, +static struct xt_target 
checksum_tg_reg[] __read_mostly = { + { + .name = "CHECKSUM", + .family = NFPROTO_IPV4, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CHECKSUM", + .family = NFPROTO_IPV6, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#endif }; static int __init checksum_tg_init(void) { - return xt_register_target(&checksum_tg_reg); + return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } static void __exit checksum_tg_exit(void) { - xt_unregister_target(&checksum_tg_reg); + xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } module_init(checksum_tg_init); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 0accac98dea7..0ae8d8a1216e 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = { { .name = "CLASSIFY", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING), .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, @@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init classify_tg_init(void) diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 76acecf3e757..1494b3ee30e1 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connsecmark_tg_reg __read_mostly = { - .name = "CONNSECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, +static struct xt_target connsecmark_tg_reg[] __read_mostly = { + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init connsecmark_tg_init(void) { - return xt_register_target(&connsecmark_tg_reg); + return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } static void __exit connsecmark_tg_exit(void) { - xt_unregister_target(&connsecmark_tg_reg); + xt_unregister_targets(connsecmark_tg_reg, 
ARRAY_SIZE(connsecmark_tg_reg)); } module_init(connsecmark_tg_init); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 2be2f7a7b60f..3ba94c34297c 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -313,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) xt_ct_tg_destroy(par, par->targinfo); } +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->_nfct != 0) + return XT_CONTINUE; + + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return XT_CONTINUE; +} + static struct xt_target xt_ct_tg_reg[] __read_mostly = { { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV4, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .targetsize = sizeof(struct xt_ct_target_info), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, @@ -327,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -339,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -349,49 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .table = "raw", .me = THIS_MODULE, }, -}; - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - /* Previously seen (loopback)? Ignore. 
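 *
 * [Illustrative aside, not part of the patch] skb->_nfct is non-zero
 * once conntrack state (or a previous NOTRACK verdict) has been
 * attached, e.g. when a packet loops back through the stack, and
 * overwriting it would clobber that reference. A fresh packet instead
 * gets the dummy untracked state:
 *
 *	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
 *
 * which later conntrack hooks recognise and deliberately leave alone.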
*/ - if (skb->_nfct != 0) - return XT_CONTINUE; - - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - - return XT_CONTINUE; -} - -static struct xt_target notrack_tg_reg __read_mostly = { - .name = "NOTRACK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV6, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +#endif }; static int __init xt_ct_tg_init(void) { - int ret; - - ret = xt_register_target(¬rack_tg_reg); - if (ret < 0) - return ret; - - ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - if (ret < 0) { - xt_unregister_target(¬rack_tg_reg); - return ret; - } - return 0; + return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); } static void __exit xt_ct_tg_exit(void) { xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - xt_unregister_target(¬rack_tg_reg); } module_init(xt_ct_tg_init); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 8d36303f3935..d73957592c9d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -100,21 +100,19 @@ static void idletimer_tg_work(struct work_struct *work) static void idletimer_tg_expired(struct timer_list *t) { - struct idletimer_tg *timer = from_timer(timer, t, timer); + struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } -static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, - ktime_t now) +static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); - return ALARMTIMER_NORESTART; } static int idletimer_check_sysfs_name(const char *name, unsigned int size) @@ -170,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -231,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return 0; @@ -256,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); 
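/* [Illustrative aside, not part of the patch] The conversions in this
 * file replace msecs_to_jiffies(info->timeout * 1000) with
 * secs_to_jiffies(info->timeout): same result, but without the
 * intermediate multiplication, which can overflow a 32-bit value for
 * very large timeouts. Sketch of the idiom:
 *
 *	mod_timer(&t->timer, secs_to_jiffies(timeout_secs) + jiffies);
 */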
mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -277,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -322,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -384,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -409,21 +407,23 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - timer_shutdown_sync(&info->timer->timer); - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + timer_shutdown_sync(&info->timer->timer); + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) @@ -434,52 +434,75 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - if (info->timer->timer_type & XT_IDLETIMER_ALARM) { - alarm_cancel(&info->timer->alarm); - } else { - timer_shutdown_sync(&info->timer->timer); - } - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + alarm_cancel(&info->timer->alarm); + } else { + timer_shutdown_sync(&info->timer->timer); + } + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .target = idletimer_tg_target, - .targetsize = sizeof(struct idletimer_tg_info), - .usersize = offsetof(struct idletimer_tg_info, timer), - .checkentry = 
idletimer_tg_checkentry, - .destroy = idletimer_tg_destroy, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, }, { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .revision = 1, - .target = idletimer_tg_target_v1, - .targetsize = sizeof(struct idletimer_tg_info_v1), - .usersize = offsetof(struct idletimer_tg_info_v1, timer), - .checkentry = idletimer_tg_checkentry_v1, - .destroy = idletimer_tg_destroy_v1, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, }, - - +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, +#endif }; static struct class *idletimer_tg_class; @@ -490,7 +513,7 @@ static int __init idletimer_tg_init(void) { int err; - idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer"); + idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 66b0f941d8fb..90dcf088071a 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -43,7 +43,6 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_led_info *ledinfo = par->targinfo; struct xt_led_info_internal *ledinternal = ledinfo->internal_data; - unsigned long led_delay = XT_LED_BLINK_DELAY; /* * If "always blink" is enabled, and there's still some time until the @@ -52,7 +51,7 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) if ((ledinfo->delay > 0) && ledinfo->always_blink && timer_pending(&ledinternal->timer)) led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger, - &led_delay, &led_delay, 1); + XT_LED_BLINK_DELAY, XT_LED_BLINK_DELAY, 1); else led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); @@ -73,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) static void led_timeout_callback(struct timer_list *t) { - struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t, - timer); + struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal, + t, + timer); led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); } @@ -97,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par) struct xt_led_info_internal *ledinternal; int err; - if (ledinfo->id[0] == '\0') + /* Bail out if empty string or not a string at all. 
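 *
 * [Illustrative aside, not part of the patch] ledinfo->id is a
 * fixed-size char array copied from userspace with no guarantee of NUL
 * termination, so both conditions are needed before any str*() helper
 * may run off the end of the buffer:
 *
 *	if (id[0] == '\0' ||			(empty string)
 *	    !memchr(id, '\0', sizeof(id)))	(no terminator at all)
 *		return -EINVAL;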
*/ + if (ledinfo->id[0] == '\0' || + !memchr(ledinfo->id, '\0', sizeof(ledinfo->id))) return -EINVAL; mutex_lock(&xt_led_mutex); @@ -176,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) kfree(ledinternal); } -static struct xt_target led_tg_reg __read_mostly = { - .name = "LED", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = led_tg, - .targetsize = sizeof(struct xt_led_info), - .usersize = offsetof(struct xt_led_info, internal_data), - .checkentry = led_tg_check, - .destroy = led_tg_destroy, - .me = THIS_MODULE, +static struct xt_target led_tg_reg[] __read_mostly = { + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV4, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV6, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init led_tg_init(void) { - return xt_register_target(&led_tg_reg); + return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } static void __exit led_tg_exit(void) { - xt_unregister_target(&led_tg_reg); + xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } module_init(led_tg_init); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index e660c3710a10..6dcf4bc7e30b 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -64,25 +64,39 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par) nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } -static struct xt_target nflog_tg_reg __read_mostly = { - .name = "NFLOG", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = nflog_tg_check, - .destroy = nflog_tg_destroy, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, +static struct xt_target nflog_tg_reg[] __read_mostly = { + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#endif }; static int __init nflog_tg_init(void) { - return xt_register_target(&nflog_tg_reg); + return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } static void __exit nflog_tg_exit(void) { - xt_unregister_target(&nflog_tg_reg); + xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } module_init(nflog_tg_init); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 80f6624e2355..4f49cfc27831 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -179,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) xt_rateest_put(par->net, info->est); } -static struct xt_target xt_rateest_tg_reg __read_mostly = { - .name = "RATEEST", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct 
xt_rateest_target_info), - .usersize = offsetof(struct xt_rateest_target_info, est), - .me = THIS_MODULE, +static struct xt_target xt_rateest_tg_reg[] __read_mostly = { + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV4, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV6, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#endif }; static __net_init int xt_rateest_net_init(struct net *net) @@ -214,12 +229,12 @@ static int __init xt_rateest_tg_init(void) if (err) return err; - return xt_register_target(&xt_rateest_tg_reg); + return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { - xt_unregister_target(&xt_rateest_tg_reg); + xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 353ca7801251..ff66b56a3f97 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -/* FIXME: Take multiple ranges --RR */ static int redirect_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; @@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par)); + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; + struct nf_nat_range2 range = { + .flags = mr->range[0].flags, + .min_proto = mr->range[0].min, + .max_proto = mr->range[0].max, + }; + + return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); } static struct xt_target redirect_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 498a0bf6f044..5bc5ea505eb9 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -157,7 +157,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, @@ -167,7 +167,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, @@ -175,6 +175,29 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", 
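/* [Illustrative aside, not part of the patch] This is the same
 * mechanical conversion applied throughout this series: one
 * NFPROTO_UNSPEC registration becomes an array of explicit
 * NFPROTO_IPV4 and (conditionally built) NFPROTO_IPV6 entries, e.g.:
 *
 *	static struct xt_target tgt_reg[] __read_mostly = {
 *		{ .name = "X", .family = NFPROTO_IPV4, ... },
 *	#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 *		{ .name = "X", .family = NFPROTO_IPV6, ... },
 *	#endif
 *	};
 *
 * so a module no longer claims support for families it cannot handle.
 */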
+ .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, +#endif }; static int __init secmark_tg_init(void) diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 5582dce98cae..a642ff09fc8e 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -29,25 +29,39 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .checkentry = trace_tg_check, - .destroy = trace_tg_destroy, - .me = THIS_MODULE, +static struct xt_target trace_tg_reg[] __read_mostly = { + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV4, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV6, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init trace_tg_init(void) { - return xt_register_target(&trace_tg_reg); + return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } static void __exit trace_tg_exit(void) { - xt_unregister_target(&trace_tg_reg); + xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } module_init(trace_tg_init); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e9b2181e8c42..a77088943107 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { }, { .name = "addrtype", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "addrtype", + .family = NFPROTO_IPV6, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + }, +#endif }; static int __init addrtype_mt_init(void) diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c6..c437fbd59ec1 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); 
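/* [Illustrative aside, not part of the patch] The checkentry changes in
 * this file rely on IS_ENABLED(CONFIG_CGROUP_NET_CLASSID), which
 * evaluates to a compile-time 0/1, so classid-based rules are rejected
 * at insertion time and the dead branches are discarded entirely when
 * the option is off. A minimal sketch of the pattern, assuming kernel
 * context (the function name is hypothetical):
 */
static inline int example_check_classid_supported(bool has_classid)
{
	if (has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID))
		return -EINVAL;	/* feature compiled out */
	return 0;
}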
MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index a047a545371e..908fd5f2c3c8 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_match xt_cluster_match __read_mostly = { - .name = "cluster", - .family = NFPROTO_UNSPEC, - .match = xt_cluster_mt, - .checkentry = xt_cluster_mt_checkentry, - .matchsize = sizeof(struct xt_cluster_match_info), - .destroy = xt_cluster_mt_destroy, - .me = THIS_MODULE, +static struct xt_match xt_cluster_match[] __read_mostly = { + { + .name = "cluster", + .family = NFPROTO_IPV4, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "cluster", + .family = NFPROTO_IPV6, + .match = xt_cluster_mt, + .checkentry = 
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index a047a545371e..908fd5f2c3c8 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par)
     nf_ct_netns_put(par->net, par->family);
 }
 
-static struct xt_match xt_cluster_match __read_mostly = {
-    .name = "cluster",
-    .family = NFPROTO_UNSPEC,
-    .match = xt_cluster_mt,
-    .checkentry = xt_cluster_mt_checkentry,
-    .matchsize = sizeof(struct xt_cluster_match_info),
-    .destroy = xt_cluster_mt_destroy,
-    .me = THIS_MODULE,
+static struct xt_match xt_cluster_match[] __read_mostly = {
+    {
+        .name = "cluster",
+        .family = NFPROTO_IPV4,
+        .match = xt_cluster_mt,
+        .checkentry = xt_cluster_mt_checkentry,
+        .matchsize = sizeof(struct xt_cluster_match_info),
+        .destroy = xt_cluster_mt_destroy,
+        .me = THIS_MODULE,
+    },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+    {
+        .name = "cluster",
+        .family = NFPROTO_IPV6,
+        .match = xt_cluster_mt,
+        .checkentry = xt_cluster_mt_checkentry,
+        .matchsize = sizeof(struct xt_cluster_match_info),
+        .destroy = xt_cluster_mt_destroy,
+        .me = THIS_MODULE,
+    },
+#endif
 };
 
 static int __init xt_cluster_mt_init(void)
 {
-    return xt_register_match(&xt_cluster_match);
+    return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
 }
 
 static void __exit xt_cluster_mt_fini(void)
 {
-    xt_unregister_match(&xt_cluster_match);
+    xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
 }
 
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 93cb018c3055..2aabdcea8707 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
         return -EINVAL;
 
     ret = nf_ct_netns_get(par->net, par->family);
-    if (ret < 0)
+    if (ret < 0) {
         pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
                             par->family);
+        return ret;
+    }
 
     /*
      * This filter cannot function correctly unless connection tracking
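The connbytes change is a small bug fix: the error from nf_ct_netns_get() was logged but never propagated, so a rule could be accepted even though the conntrack support it depends on failed to load. The corrected shape, sketched with a hypothetical foo_mt_check():

static int foo_mt_check(const struct xt_mtchk_param *par)
{
    int ret = nf_ct_netns_get(par->net, par->family);

    if (ret < 0) {
        pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
                            par->family);
        return ret;    /* the line the fix adds: reject the rule */
    }
    return 0;
}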
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5d04ef80a61d..848287ab79cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
     struct net *net = xt_net(par);
     const struct xt_connlimit_info *info = par->matchinfo;
-    struct nf_conntrack_tuple tuple;
-    const struct nf_conntrack_tuple *tuple_ptr = &tuple;
     const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
     enum ip_conntrack_info ctinfo;
     const struct nf_conn *ct;
@@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
     u32 key[5];
 
     ct = nf_ct_get(skb, &ctinfo);
-    if (ct != NULL) {
-        tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+    if (ct)
         zone = nf_ct_zone(ct);
-    } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
-                                  xt_family(par), net, &tuple)) {
-        goto hotdrop;
-    }
 
     if (xt_family(par) == NFPROTO_IPV6) {
         const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
         key[1] = zone->id;
     }
 
-    connections = nf_conncount_count(net, info->data, key, tuple_ptr,
-                                     zone);
+    connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key);
     if (connections == 0)
-        /* kmalloc failed, drop it entirely */
+        /* kmalloc failed or tuple couldn't be found, drop it entirely */
         goto hotdrop;
 
     return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
@@ -86,6 +78,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 {
     struct xt_connlimit_info *info = par->matchinfo;
     unsigned int keylen;
+    int ret;
 
     keylen = sizeof(u32);
     if (par->family == NFPROTO_IPV6)
@@ -93,8 +86,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
     else
         keylen += sizeof(struct in_addr);
 
+    ret = nf_ct_netns_get(par->net, par->family);
+    if (ret < 0) {
+        pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
                             par->family);
+        return ret;
+    }
+
     /* init private data */
-    info->data = nf_conncount_init(par->net, par->family, keylen);
+    info->data = nf_conncount_init(par->net, keylen);
+    if (IS_ERR(info->data))
+        nf_ct_netns_put(par->net, par->family);
 
     return PTR_ERR_OR_ZERO(info->data);
 }
@@ -103,29 +105,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
     const struct xt_connlimit_info *info = par->matchinfo;
 
-    nf_conncount_destroy(par->net, par->family, info->data);
+    nf_conncount_destroy(par->net, info->data);
+    nf_ct_netns_put(par->net, par->family);
 }
 
-static struct xt_match connlimit_mt_reg __read_mostly = {
-    .name = "connlimit",
-    .revision = 1,
-    .family = NFPROTO_UNSPEC,
-    .checkentry = connlimit_mt_check,
-    .match = connlimit_mt,
-    .matchsize = sizeof(struct xt_connlimit_info),
-    .usersize = offsetof(struct xt_connlimit_info, data),
-    .destroy = connlimit_mt_destroy,
-    .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+    {
+        .name = "connlimit",
+        .revision = 1,
+        .family = NFPROTO_IPV4,
+        .checkentry = connlimit_mt_check,
+        .match = connlimit_mt,
+        .matchsize = sizeof(struct xt_connlimit_info),
+        .usersize = offsetof(struct xt_connlimit_info, data),
+        .destroy = connlimit_mt_destroy,
+        .me = THIS_MODULE,
+    },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+    {
+        .name = "connlimit",
+        .revision = 1,
+        .family = NFPROTO_IPV6,
+        .checkentry = connlimit_mt_check,
+        .match = connlimit_mt,
+        .matchsize = sizeof(struct xt_connlimit_info),
+        .usersize = offsetof(struct xt_connlimit_info, data),
+        .destroy = connlimit_mt_destroy,
+        .me = THIS_MODULE,
+    },
+#endif
 };
 
 static int __init connlimit_mt_init(void)
 {
-    return xt_register_match(&connlimit_mt_reg);
+    return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
 }
 
 static void __exit connlimit_mt_exit(void)
 {
-    xt_unregister_match(&connlimit_mt_reg);
+    xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
 }
 
 module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index ad3c033db64e..4277084de2e7 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -151,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
     {
         .name = "CONNMARK",
         .revision = 1,
-        .family = NFPROTO_UNSPEC,
+        .family = NFPROTO_IPV4,
         .checkentry = connmark_tg_check,
         .target = connmark_tg,
         .targetsize = sizeof(struct xt_connmark_tginfo1),
@@ -161,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
     {
         .name = "CONNMARK",
         .revision = 2,
-        .family = NFPROTO_UNSPEC,
+        .family = NFPROTO_IPV4,
         .checkentry = connmark_tg_check,
         .target = connmark_tg_v2,
         .targetsize = sizeof(struct xt_connmark_tginfo2),
         .destroy = connmark_tg_destroy,
         .me = THIS_MODULE,
-    }
+    },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+    {
+        .name = "CONNMARK",
+        .revision = 1,
+        .family = NFPROTO_IPV6,
+        .checkentry = connmark_tg_check,
+        .target = connmark_tg,
+        .targetsize = sizeof(struct xt_connmark_tginfo1),
+        .destroy = connmark_tg_destroy,
+        .me = THIS_MODULE,
+    },
+    {
+        .name = "CONNMARK",
+        .revision = 2,
+        .family = NFPROTO_IPV6,
+        .checkentry = connmark_tg_check,
+        .target = connmark_tg_v2,
+        .targetsize = sizeof(struct xt_connmark_tginfo2),
+        .destroy = connmark_tg_destroy,
+        .me = THIS_MODULE,
+    },
+#endif
 };
 
 static struct xt_match connmark_mt_reg __read_mostly = {
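Beyond the API change to nf_conncount_count_skb(), the connlimit hunks add reference management: checkentry now takes a conntrack netns reference that destroy releases, and the init error path releases it too. A sketch of the acquire/release pairing, with hypothetical foo_* names (the xt_connlimit_info layout and the two-argument nf_conncount_init() are taken from the diff):

static int foo_mt_check(const struct xt_mtchk_param *par)
{
    struct xt_connlimit_info *info = par->matchinfo;
    int ret = nf_ct_netns_get(par->net, par->family);

    if (ret < 0)
        return ret;

    info->data = nf_conncount_init(par->net, sizeof(u32));
    if (IS_ERR(info->data))
        nf_ct_netns_put(par->net, par->family);    /* unwind on failure */

    return PTR_ERR_OR_ZERO(info->data);
}

static void foo_mt_destroy(const struct xt_mtdtor_param *par)
{
    const struct xt_connlimit_info *info = par->matchinfo;

    nf_conncount_destroy(par->net, info->data);
    nf_ct_netns_put(par->net, par->family);    /* mirrors checkentry */
}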
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0859b8f76764..3b507694e81e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -15,7 +15,6 @@
 #include <linux/random.h>
 #include <linux/jhash.h>
 #include <linux/slab.h>
-#include <linux/vmalloc.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/list.h>
@@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
         if (size < 16)
             size = 16;
     }
-    /* FIXME: don't use vmalloc() here or anywhere else -HW */
-    hinfo = vmalloc(struct_size(hinfo, hash, size));
+    hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL);
     if (hinfo == NULL)
         return -ENOMEM;
     *out_hinfo = hinfo;
@@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
     /* copy match config into hashtable config */
     ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
     if (ret) {
-        vfree(hinfo);
+        kvfree(hinfo);
         return ret;
     }
 
@@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
     hinfo->rnd_initialized = false;
     hinfo->name = kstrdup(name, GFP_KERNEL);
     if (!hinfo->name) {
-        vfree(hinfo);
+        kvfree(hinfo);
         return -ENOMEM;
     }
     spin_lock_init(&hinfo->lock);
@@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
                                 ops, hinfo);
     if (hinfo->pde == NULL) {
         kfree(hinfo->name);
-        vfree(hinfo);
+        kvfree(hinfo);
         return -ENOMEM;
     }
     hinfo->net = net;
@@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select
     unsigned int i;
 
     for (i = 0; i < ht->cfg.size; i++) {
+        struct hlist_head *head = &ht->hash[i];
         struct dsthash_ent *dh;
         struct hlist_node *n;
 
+        if (hlist_empty(head))
+            continue;
+
         spin_lock_bh(&ht->lock);
-        hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
+        hlist_for_each_entry_safe(dh, n, head, node) {
             if (time_after_eq(jiffies, dh->expires) || select_all)
                 dsthash_free(ht, dh);
         }
@@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
         cancel_delayed_work_sync(&hinfo->gc_work);
         htable_selective_cleanup(hinfo, true);
         kfree(hinfo->name);
-        vfree(hinfo);
+        kvfree(hinfo);
     }
 }
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 1873da3a945a..ca730cedb5d4 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -21,7 +21,7 @@ static bool
 length_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
     const struct xt_length_info *info = par->matchinfo;
-    u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
+    u32 pktlen = skb_ip_totlen(skb);
 
     return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
 }
@@ -30,8 +30,7 @@ static bool
 length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 {
     const struct xt_length_info *info = par->matchinfo;
-    const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
-                             sizeof(struct ipv6hdr);
+    u32 pktlen = skb->len;
 
     return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
 }
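The hashlimit conversion retires the old vmalloc() FIXME by switching to kvmalloc(): small tables are served from the slab allocator, oversized ones fall back to vmalloc transparently, and kvfree() releases either kind, so every error path changes mechanically. A minimal sketch under a hypothetical foo_htable structure:

#include <linux/list.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct foo_htable {
    unsigned int size;
    struct hlist_head hash[];    /* flexible array sized at alloc time */
};

static struct foo_htable *foo_htable_alloc(unsigned int size)
{
    /* struct_size() checks the header + n * entry multiplication */
    struct foo_htable *ht = kvmalloc(struct_size(ht, hash, size), GFP_KERNEL);

    if (!ht)
        return NULL;
    ht->size = size;
    return ht;
}

static void foo_htable_free(struct foo_htable *ht)
{
    kvfree(ht);    /* correct for both kmalloc- and vmalloc-backed memory */
}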
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 1ad74b5920b5..59b9d04400ca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
     return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static struct xt_target mark_tg_reg __read_mostly = {
-    .name = "MARK",
-    .revision = 2,
-    .family = NFPROTO_UNSPEC,
-    .target = mark_tg,
-    .targetsize = sizeof(struct xt_mark_tginfo2),
-    .me = THIS_MODULE,
+static struct xt_target mark_tg_reg[] __read_mostly = {
+    {
+        .name = "MARK",
+        .revision = 2,
+        .family = NFPROTO_IPV4,
+        .target = mark_tg,
+        .targetsize = sizeof(struct xt_mark_tginfo2),
+        .me = THIS_MODULE,
+    },
+#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
+    {
+        .name = "MARK",
+        .revision = 2,
+        .family = NFPROTO_ARP,
+        .target = mark_tg,
+        .targetsize = sizeof(struct xt_mark_tginfo2),
+        .me = THIS_MODULE,
+    },
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+    {
+        .name = "MARK",
+        .revision = 2,
+        .family = NFPROTO_IPV6,
+        .target = mark_tg,
+        .targetsize = sizeof(struct xt_mark_tginfo2),
+        .me = THIS_MODULE,
+    },
+#endif
 };
 
 static struct xt_match mark_mt_reg __read_mostly = {
@@ -61,12 +83,12 @@ static int __init mark_mt_init(void)
 {
     int ret;
 
-    ret = xt_register_target(&mark_tg_reg);
+    ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
     if (ret < 0)
         return ret;
     ret = xt_register_match(&mark_mt_reg);
     if (ret < 0) {
-        xt_unregister_target(&mark_tg_reg);
+        xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
         return ret;
     }
     return 0;
@@ -75,7 +97,7 @@ static int __init mark_mt_init(void)
 static void __exit mark_mt_exit(void)
 {
     xt_unregister_match(&mark_mt_reg);
-    xt_unregister_target(&mark_tg_reg);
+    xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 }
 
 module_init(mark_mt_init);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 7c6bf1c16813..0ca1cdfc4095 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -38,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 
     nfacct = nfnl_acct_find_get(par->net, info->name);
     if (nfacct == NULL) {
-        pr_info_ratelimited("accounting object `%s' does not exists\n",
-                            info->name);
+        pr_info_ratelimited("accounting object `%.*s' does not exist\n",
+                            NFACCT_NAME_MAX, info->name);
         return -ENOENT;
     }
     info->nfacct = nfacct;
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index e1990baf3a3b..dc9485854002 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
 MODULE_DESCRIPTION("Passive OS fingerprint matching.");
 MODULE_ALIAS("ipt_osf");
 MODULE_ALIAS("ip6t_osf");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index e85ce69924ae..50332888c8d2 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
          */
         return false;
 
-    filp = sk->sk_socket->file;
-    if (filp == NULL)
+    read_lock_bh(&sk->sk_callback_lock);
+    filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+    if (filp == NULL) {
+        read_unlock_bh(&sk->sk_callback_lock);
         return ((info->match ^ info->invert) &
                (XT_OWNER_UID | XT_OWNER_GID)) == 0;
+    }
 
     if (info->match & XT_OWNER_UID) {
         kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
         kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
 
         if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
              uid_lte(filp->f_cred->fsuid, uid_max)) ^
-            !(info->invert & XT_OWNER_UID))
+            !(info->invert & XT_OWNER_UID)) {
+            read_unlock_bh(&sk->sk_callback_lock);
             return false;
+        }
     }
 
     if (info->match & XT_OWNER_GID) {
@@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
             }
         }
 
-        if (match ^ !(info->invert & XT_OWNER_GID))
+        if (match ^ !(info->invert & XT_OWNER_GID)) {
+            read_unlock_bh(&sk->sk_callback_lock);
             return false;
+        }
     }
 
+    read_unlock_bh(&sk->sk_callback_lock);
     return true;
 }
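The owner match now holds sk->sk_callback_lock across the sk_socket->file dereference, since the socket can be detached from the struct sock concurrently; note how every early return in the diff drops the lock first. An alternative single-exit shape of the same discipline, using a hypothetical helper rather than the patch's code:

static bool foo_uid_match(struct sock *sk, kuid_t uid)
{
    struct file *filp;
    bool match = false;

    /* sk->sk_socket may be cleared concurrently; the lock keeps the
     * socket->file pointer and its credentials stable while we look. */
    read_lock_bh(&sk->sk_callback_lock);
    filp = sk->sk_socket ? sk->sk_socket->file : NULL;
    if (filp)
        match = uid_eq(filp->f_cred->fsuid, uid);
    read_unlock_bh(&sk->sk_callback_lock);

    return match;
}

A single unlock point avoids the class of bug the multi-return version has to guard against, at the cost of restructuring the function.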
physdev->name : NULL; if ((info->bitmask & XT_PHYSDEV_OP_ISIN && diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 7ddb9a78e3fc..588a5e6ad899 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -59,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil /* retained for backwards compatibility */ static unsigned int ip_pkt_list_tot __read_mostly; module_param(ip_pkt_list_tot, uint, 0400); -MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)"); -#define XT_RECENT_MAX_NSTAMPS 256 +#define XT_RECENT_MAX_NSTAMPS 65536 struct recent_entry { struct list_head list; @@ -69,7 +69,7 @@ struct recent_entry { union nf_inet_addr addr; u_int16_t family; u_int8_t ttl; - u_int8_t index; + u_int16_t index; u_int16_t nstamps; unsigned long stamps[]; }; @@ -80,7 +80,7 @@ struct recent_table { union nf_inet_addr mask; unsigned int refcnt; unsigned int entries; - u8 nstamps_max_mask; + u_int16_t nstamps_max_mask; struct list_head lru_list; struct list_head iphash[]; }; @@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, { struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; - char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; const char *c = buf; union nf_inet_addr addr = {}; u_int16_t family; diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 68ccbe50bb1e..600060ca940a 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy(tbl->repl.name, info->name); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index e8961094a282..b46a6a512058 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -149,6 +149,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par) { const struct xt_sctp_info *info = par->matchinfo; + if (info->flag_count > ARRAY_SIZE(info->flag_info)) + return -EINVAL; if (info->flags & ~XT_SCTP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_SCTP_VALID_FLAGS) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 7013f55f05d1..76e01f292aaf 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); @@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 11ec2abf0c72..e8991130a3de 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> +#include 
<linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> @@ -20,6 +21,8 @@ MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); +MODULE_ALIAS("ipt_icmp"); +MODULE_ALIAS("ip6t_icmp6"); /* Returns 1 if the port is matched by the range, 0 otherwise */ static inline bool @@ -161,6 +164,95 @@ static int udp_mt_check(const struct xt_mtchk_param *par) return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0; } +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code) +{ + return type == test_type && code >= min_code && code <= max_code; +} + +static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code, bool invert) +{ + return (test_type == 0xFF || + type_code_in_range(test_type, min_code, max_code, type, code)) + ^ invert; +} + +static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, + u8 type, u8 code, bool invert) +{ + return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; +} + +static bool +icmp_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmphdr *ic; + struct icmphdr _icmph; + const struct ipt_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (!ic) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + par->hotdrop = true; + return false; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags & IPT_ICMP_INV)); +} + +static bool +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmp6hdr *ic; + struct icmp6hdr _icmph; + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (!ic) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + par->hotdrop = true; + return false; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags & IP6T_ICMP_INV)); +} + +static int icmp_checkentry(const struct xt_mtchk_param *par) +{ + const struct ipt_icmp *icmpinfo = par->matchinfo; + + return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; +} + +static int icmp6_checkentry(const struct xt_mtchk_param *par) +{ + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? 
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+    const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+    return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+}
+
 static struct xt_match tcpudp_mt_reg[] __read_mostly = {
     {
         .name = "tcp",
@@ -216,6 +308,24 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
         .proto = IPPROTO_UDPLITE,
         .me = THIS_MODULE,
     },
+    {
+        .name = "icmp",
+        .match = icmp_match,
+        .matchsize = sizeof(struct ipt_icmp),
+        .checkentry = icmp_checkentry,
+        .proto = IPPROTO_ICMP,
+        .family = NFPROTO_IPV4,
+        .me = THIS_MODULE,
+    },
+    {
+        .name = "icmp6",
+        .match = icmp6_match,
+        .matchsize = sizeof(struct ip6t_icmp),
+        .checkentry = icmp6_checkentry,
+        .proto = IPPROTO_ICMPV6,
+        .family = NFPROTO_IPV6,
+        .me = THIS_MODULE,
+    },
 };
 
 static int __init tcpudp_mt_init(void)
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index 177b40d08098..117d4615d668 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -96,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
     return ret ^ data->invert;
 }
 
+static int u32_mt_checkentry(const struct xt_mtchk_param *par)
+{
+    const struct xt_u32 *data = par->matchinfo;
+    const struct xt_u32_test *ct;
+    unsigned int i;
+
+    if (data->ntests > ARRAY_SIZE(data->tests))
+        return -EINVAL;
+
+    for (i = 0; i < data->ntests; ++i) {
+        ct = &data->tests[i];
+
+        if (ct->nnums > ARRAY_SIZE(ct->location) ||
+            ct->nvalues > ARRAY_SIZE(ct->value))
+            return -EINVAL;
+    }
+
+    return 0;
+}
+
 static struct xt_match xt_u32_mt_reg __read_mostly = {
     .name = "u32",
     .revision = 0,
     .family = NFPROTO_UNSPEC,
     .match = u32_mt,
+    .checkentry = u32_mt_checkentry,
     .matchsize = sizeof(struct xt_u32),
     .me = THIS_MODULE,
 };
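Finally, the xt_socket hunks wrap the lockless read of sk->sk_mark in READ_ONCE(): the mark can be updated concurrently (for instance via setsockopt(SO_MARK)), so the reader annotation pairs with WRITE_ONCE() on the writer side to rule out torn or refetched loads. A minimal sketch with a hypothetical structure, not the socket code itself:

struct foo {
    u32 mark;    /* read locklessly by one context, written by another */
};

static u32 foo_read_mark(const struct foo *f)
{
    return READ_ONCE(f->mark);    /* pairs with WRITE_ONCE() below */
}

static void foo_set_mark(struct foo *f, u32 mark)
{
    WRITE_ONCE(f->mark, mark);
}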
