diff options
Diffstat (limited to 'net/netfilter')
125 files changed, 4325 insertions, 2043 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index df2dc21304ef..2560416218d0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -212,7 +212,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select LIBCRC32C + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -475,7 +475,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK - select LIBCRC32C + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to @@ -1180,7 +1180,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 614815a3ed73..f0aa4d7ef499 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -142,8 +142,13 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ - nf_flow_table_offload.o + nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o +ifeq ($(CONFIG_NF_FLOW_TABLE),m) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o +else ifeq ($(CONFIG_NF_FLOW_TABLE),y) +nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o +endif obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3126911f5042..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -655,11 +652,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); int ret; - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); @@ -815,12 +810,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); +err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index cb48a2b9cb9f..798c7993635e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -264,7 +264,7 @@ out: static void mtype_gc(struct timer_list *t) { - struct mtype *map = from_timer(map, t, gc); + struct mtype *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; @@ -294,7 +294,7 @@ mtype_cancel_gc(struct ip_set *set) struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); + timer_delete_sync(&map->gc); } static const struct ip_set_type_variant mtype = { diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e4fa00abde6a..5988b9bb9029 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) { + if (ip > ip_to) swap(ip, ip_to); - if (ip < map->first_ip) - return -IPSET_ERR_BITMAP_RANGE; - } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ip_to = ip; } - if (ip_to > map->last_ip) + if (ip < map->first_ip || ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3184cc6be4c9..cc20e6d56807 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,12 +53,13 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ @@ -103,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision) static bool load_settype(const char *name) { + if (!try_module_get(THIS_MODULE)) + return false; + nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); + module_put(THIS_MODULE); return true; } @@ -1133,7 +1139,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -1172,23 +1178,50 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void -ip_set_destroy_set_rcu(struct rcu_head *head) +_destroy_all_sets(struct ip_set_net *inst) { - struct ip_set *set = container_of(head, struct ip_set, rcu); + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; - ip_set_destroy_set(set); + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, @@ -1202,11 +1235,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference @@ -1214,8 +1246,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); @@ -1226,15 +1256,7 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { @@ -1255,12 +1277,12 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; @@ -2365,30 +2387,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - set->variant->cancel_gc(set); - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index cf3ce72c3de6..5251524b96af 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -64,7 +64,7 @@ struct hbucket { #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) #define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6c3f28bc59b3..13c7a08aa868 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -424,14 +428,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; @@ -549,6 +547,9 @@ list_set_cancel_gc(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); } static const struct ip_set_type_variant set_variant = { @@ -570,7 +571,7 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(struct timer_list *t) { - struct list_set *map = from_timer(map, t, gc); + struct list_set *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); @@ -610,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -626,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 2a3017b9c001..c203252e856d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -105,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select LIBCRC32C + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 98d7dbe3d787..44b2ad695c15 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -822,7 +822,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) /* Try to delete connection while not holding reference */ static void ip_vs_conn_del(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -833,7 +833,7 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp) /* Try to delete connection while holding reference */ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -846,7 +846,7 @@ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = from_timer(cp, t, timer); + struct ip_vs_conn *cp = timer_container_of(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -860,7 +860,7 @@ static void ip_vs_conn_expire(struct timer_list *t) struct ip_vs_conn *ct = cp->control; /* delete the timer if it is activated by other users */ - del_timer(&cp->timer); + timer_delete(&cp->timer); /* does anybody control me? */ if (ct) { @@ -1046,28 +1046,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - struct hlist_head *l; + unsigned int bucket; + unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) { int idx; struct ip_vs_conn *cp; - struct ip_vs_iter_state *iter = seq->private; - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + unsigned int skip = 0; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (pos-- == 0) { - iter->l = &ip_vs_conn_tab[idx]; + if (skip >= iter->skip_elems) { + iter->bucket = idx; return cp; } + + ++skip; } + + iter->skip_elems = 0; cond_resched_rcu(); } + iter->bucket = idx; return NULL; } @@ -1076,9 +1083,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { struct ip_vs_iter_state *iter = seq->private; - iter->l = NULL; rcu_read_lock(); - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; + if (*pos == 0) { + iter->skip_elems = 0; + iter->bucket = 0; + return SEQ_START_TOKEN; + } + + return ip_vs_conn_array(iter); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -1086,28 +1098,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; struct hlist_node *e; - struct hlist_head *l = iter->l; - int idx; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); + return ip_vs_conn_array(iter); /* more on same hash chain? */ e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) + if (e) { + iter->skip_elems++; return hlist_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - while (++idx < ip_vs_conn_tab_size) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - iter->l = &ip_vs_conn_tab[idx]; - return cp; - } - cond_resched_rcu(); } - iter->l = NULL; - return NULL; + + iter->skip_elems = 0; + iter->bucket++; + + return ip_vs_conn_array(iter); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1495,8 +1501,8 @@ int __init ip_vs_conn_init(void) max_avail -= 2; /* ~4 in hash row */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); - max = clamp(max, min, max_avail); - ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a2c16b501087..c7a8a08b7308 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1550,6 +1550,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ @@ -1560,7 +1561,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; - return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + + gre_flags_to_tnl_flags(flags, greh->flags); + + return gre_calc_hlen(flags); } unk: diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 143a341bbc0a..6a6fc4478533 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -94,6 +94,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; + int amemthresh; int nomem; int to_change = -1; @@ -105,7 +106,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < ipvs->sysctl_amemthresh); + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); + nomem = (availmem < amemthresh); local_bh_disable(); @@ -145,9 +147,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 1: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; @@ -155,9 +156,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 2: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; @@ -848,7 +848,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - del_timer_sync(&ipvs->dest_trash_timer); + timer_delete_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); @@ -1331,7 +1331,8 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); + struct netns_ipvs *ipvs = timer_container_of(ipvs, t, + dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1459,18 +1460,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; - /* Bind the ct retriever */ - RCU_INIT_POINTER(svc->pe, pe); - pe = NULL; - /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - if (svc->pe && svc->pe->conn_out) + if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1846,7 +1847,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) #ifdef CONFIG_SYSCTL static int -proc_do_defense_mode(struct ctl_table *table, int write, +proc_do_defense_mode(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -1873,7 +1874,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, } static int -proc_do_sync_threshold(struct ctl_table *table, int write, +proc_do_sync_threshold(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -1901,7 +1902,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, } static int -proc_do_sync_ports(struct ctl_table *table, int write, +proc_do_sync_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1924,7 +1925,8 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } -static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, + void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; @@ -1962,8 +1964,8 @@ out: return ret; } -static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, - size_t size) +static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, + void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; @@ -1983,7 +1985,7 @@ static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, return ret; } -static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, +static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2010,7 +2012,7 @@ static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, return ret; } -static int ipvs_proc_est_nice(struct ctl_table *table, int write, +static int ipvs_proc_est_nice(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -2040,7 +2042,7 @@ static int ipvs_proc_est_nice(struct ctl_table *table, int write, return ret; } -static int ipvs_proc_run_estimation(struct ctl_table *table, int write, +static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; @@ -2263,7 +2265,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -3091,12 +3092,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; - int size; + size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3132,12 +3133,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; - int size; + size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3662,10 +3663,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); - if (nla_addr_family) - udest->af = nla_get_u16(nla_addr_family); - else - udest->af = 0; + udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for the additional fields */ if (full_entry) { @@ -4270,6 +4268,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); + bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); @@ -4284,12 +4283,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - ctl_table_size = 0; - } } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4315,10 +4308,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; @@ -4341,15 +4341,22 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index f53899d12416..d8a284999544 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -35,7 +35,7 @@ #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/ip_vs.h> diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 8ceec7a2fa8f..156181a3bacd 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -123,7 +123,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -293,7 +292,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -563,10 +563,8 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblc_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblc_ctl_table = vs_vars_table; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0fb64707213f..a021e6aba3d7 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -294,7 +294,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -457,7 +456,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; @@ -749,10 +749,8 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblcr_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f100da4ba3bc..a9fd1d3fc2cb 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -340,7 +340,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_protocol_init(void) { - char protocols[64]; + char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ @@ -348,8 +348,6 @@ int __init ip_vs_protocol_init(void) strcat(protocols, (p)->name); \ } while (0) - protocols[0] = '\0'; - protocols[2] = '\0'; #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a0921adc31a9..83e452916403 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index be74c0906dda..3402675bf521 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -51,7 +51,7 @@ #include <linux/kernel.h> #include <linux/sched/signal.h> -#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ +#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */ #include <net/ip.h> #include <net/sock.h> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 65e0259178da..014f07740369 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -119,13 +119,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +134,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? */ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; flowi4_update_output(&fl4, 0, daddr, fl4.saddr); - loop = true; + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -180,7 +173,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (addr_type & IPV6_ADDR_LOOPBACK); old_rt_is_local = __ip_vs_is_local_route6( - (struct rt6_info *)skb_dst(skb)); + dst_rt6_info(skb_dst(skb))); } else #endif { @@ -318,7 +311,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); @@ -344,19 +337,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; @@ -390,10 +379,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { @@ -481,7 +470,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rt6_info *) dest_dst->dst_cache; + rt = dst_rt6_info(dest_dst->dst_cache); else { u32 cookie; @@ -501,7 +490,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); @@ -517,7 +506,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, rt_mode); if (!dst) goto err_unreach; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); } local = __ip_vs_is_local_route6(rt); @@ -553,10 +542,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { @@ -862,7 +851,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1082,11 +1071,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb, { __be16 proto = *next_protocol == IPPROTO_IPIP ? htons(ETH_P_IP) : htons(ETH_P_IPV6); - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t hdrlen; if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); hdrlen = gre_calc_hlen(tflags); gre_build_header(skb, hdrlen, tflags, proto, 0, 0); @@ -1165,11 +1154,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1288,7 +1277,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); tdev = rt->dst.dev; /* @@ -1310,11 +1299,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1590,7 +1579,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 5257d5e7eb09..06b084844700 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -23,6 +23,7 @@ static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, struct bpf_nf_link { struct bpf_link link; struct nf_hook_ops hook_ops; + netns_tracker ns_tracker; struct net *net; u32 dead; const struct nf_defrag_hook *defrag_hook; @@ -42,7 +43,7 @@ get_proto_defrag_hook(struct bpf_nf_link *link, hook = rcu_dereference(*ptr_global_hook); if (!hook) { rcu_read_unlock(); - err = request_module(mod); + err = request_module("%s", mod); if (err) return ERR_PTR(err < 0 ? err : -EINVAL); @@ -120,6 +121,7 @@ static void bpf_nf_link_release(struct bpf_link *link) if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); bpf_nf_disable_defrag(nf_link); + put_net_track(nf_link->net, &nf_link->ns_tracker); } } @@ -150,11 +152,12 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + const struct nf_defrag_hook *hook = nf_link->defrag_hook; info->netfilter.pf = nf_link->hook_ops.pf; info->netfilter.hooknum = nf_link->hook_ops.hooknum; info->netfilter.priority = nf_link->hook_ops.priority; - info->netfilter.flags = 0; + info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0; return 0; } @@ -257,6 +260,8 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + get_net_track(net, &link->ns_tracker, GFP_KERNEL); + return bpf_link_settle(&link_primer); } diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 8715617b02fe..913ede2f57f9 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -132,7 +132,7 @@ static int __nf_conncount_add(struct net *net, struct nf_conn *found_ct; unsigned int collect = 0; - if (time_is_after_eq_jiffies((unsigned long)list->last_gc)) + if ((u32)jiffies == list->last_gc) goto add_new_node; /* check the saved connections */ @@ -234,7 +234,7 @@ bool nf_conncount_gc_list(struct net *net, bool ret = false; /* don't bother if we just did GC */ - if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc))) + if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; /* don't bother if other cpu is already doing GC */ @@ -321,7 +321,6 @@ insert_tree(struct net *net, struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; - u8 keylen = data->keylen; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -333,7 +332,7 @@ restart: rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { @@ -378,7 +377,9 @@ restart: conn->tuple = *tuple; conn->zone = *zone; - memcpy(rbconn->key, key, sizeof(u32) * keylen); + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); @@ -403,7 +404,6 @@ count_tree(struct net *net, struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int hash; - u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; @@ -414,7 +414,7 @@ count_tree(struct net *net, rbconn = rb_entry(parent, struct nf_conncount_rb, node); - diff = key_diff(key, rbconn->key, keylen); + diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { @@ -524,11 +524,10 @@ unsigned int nf_conncount_count(struct net *net, } EXPORT_SYMBOL_GPL(nf_conncount_count); -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen) +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; - int ret, i; + int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || @@ -541,12 +540,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family if (!data) return ERR_PTR(-ENOMEM); - ret = nf_ct_netns_get(net, family); - if (ret < 0) { - kfree(data); - return ERR_PTR(ret); - } - for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; @@ -583,13 +576,11 @@ static void destroy_tree(struct rb_root *r) } } -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data) +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); - nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index d011d2eb0848..7be4c35e4795 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, skb, master_timeout * HZ); + nf_ct_refresh(ct, master_timeout * HZ); /* No data? */ dataoff = protoff + sizeof(struct udphdr); diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index d2492d050fe6..4a136fc3a9c0 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -32,7 +32,9 @@ * -EINVAL - Passed NULL for bpf_tuple pointer * -EINVAL - opts->reserved is not 0 * -EINVAL - netns_id is less than -1 - * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) + * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 + * -EINVAL - opts->ct_zone_id set when + opts__sz isn't NF_BPF_CT_OPTS_SZ (16) * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP * -ENONET - No network namespace found for netns_id * -ENOENT - Conntrack lookup could not find entry for tuple @@ -42,6 +44,8 @@ * Values: * IPPROTO_TCP, IPPROTO_UDP * @dir: - connection tracking tuple direction. + * @ct_zone_id - connection tracking zone id. + * @ct_zone_dir - connection tracking zone direction. * @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -51,11 +55,13 @@ struct bpf_ct_opts { s32 error; u8 l4proto; u8 dir; - u8 reserved[2]; + u16 ct_zone_id; + u8 ct_zone_dir; + u8 reserved[3]; }; enum { - NF_BPF_CT_OPTS_SZ = 12, + NF_BPF_CT_OPTS_SZ = 16, }; static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, @@ -104,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 timeout) { struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) return ERR_PTR(-EINVAL); @@ -130,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, return ERR_PTR(-ENONET); } - ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, GFP_ATOMIC); if (IS_ERR(ct)) goto out; @@ -152,12 +176,21 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) return ERR_PTR(-EPROTO); if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) @@ -174,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ERR_PTR(-ENONET); } - hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + hash = nf_conntrack_find_get(net, &ct_zone, &tuple); if (opts->netns_id >= 0) put_net(net); if (!hash) @@ -245,7 +287,7 @@ __bpf_kfunc_start_defs(); * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -279,7 +321,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -312,7 +354,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -347,7 +389,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index cfa0fe0356de..a7552a46d6ac 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -75,7 +75,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); - nf_ct_refresh(ct, skb, timeout * HZ); + nf_ct_refresh(ct, timeout * HZ); out: return NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c63868666bd9..201d3c4ec623 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { @@ -531,10 +536,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) @@ -988,6 +991,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +/** + * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow + * @ct1: conntrack in hash table to check against + * @ct2: merge candidate + * + * returns true if ct1 and ct2 happen to refer to the same flow, but + * in opposing directions, i.e. + * ct1: a:b -> c:d + * ct2: c:d -> a:b + * for both directions. If so, @ct2 should not have been created + * as the skb should have been picked up as ESTABLISHED flow. + * But ct1 was not yet committed to hash table before skb that created + * ct2 had arrived. + * + * Note we don't compare netns because ct entries in different net + * namespace cannot clash to begin with. + * + * @return: true if ct1 and ct2 are identical when swapping origin/reply. + */ +static bool +nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + u16 id1, id2; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) + return false; + + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); + if (id1 != id2) + return false; + + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); + + return id1 == id2; +} + +static int nf_ct_can_merge(const struct nf_conn *ct, + const struct nf_conn *loser_ct) +{ + return nf_ct_match(ct, loser_ct) || + nf_ct_match_reverse(ct, loser_ct); +} + /* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) @@ -999,11 +1052,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, loser_ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_dying(ct)) - return NF_DROP; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { + if (nf_ct_can_merge(ct, loser_ct)) { struct net *net = nf_ct_net(ct); nf_conntrack_get(&ct->ct_general); @@ -1090,7 +1139,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. * - * If there is one, @skb (and the assocated, unconfirmed conntrack) has + * If there is one, @skb (and the associated, unconfirmed conntrack) has * to be dropped. In case @skb is retransmitted, next conntrack lookup * will find the already-existing entry. * @@ -1440,8 +1489,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) const struct nf_conntrack_l4proto *l4proto; u8 protonum = nf_ct_protonum(ct); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) - return false; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; @@ -1500,12 +1547,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { - nf_ct_offload_timeout(tmp); - if (!nf_conntrack_max95) - continue; - } - if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); @@ -1724,7 +1765,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) - return (struct nf_conntrack_tuple_hash *)ct; + return ERR_CAST(ct); if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); @@ -2024,7 +2065,7 @@ repeat: goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) + if (ret == NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; @@ -2045,9 +2086,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, u32 extra_jiffies, - bool do_acct) + unsigned int bytes) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) @@ -2060,8 +2100,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: - if (do_acct) - nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); + if (bytes) + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -2153,80 +2193,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } -static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - const struct nf_nat_hook *nat_hook; - struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - unsigned int status; - int dataoff; - u16 l3num; - u8 l4num; - - l3num = nf_ct_l3num(ct); - - dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); - if (dataoff <= 0) - return NF_DROP; - - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple)) - return NF_DROP; - - if (ct->status & IPS_SRC_NAT) { - memcpy(tuple.src.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, - sizeof(tuple.src.u3.all)); - tuple.src.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; - } - - if (ct->status & IPS_DST_NAT) { - memcpy(tuple.dst.u3.all, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, - sizeof(tuple.dst.u3.all)); - tuple.dst.u.all = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; - } - - h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); - if (!h) - return NF_ACCEPT; - - /* Store status bits of the conntrack that is clashing to re-do NAT - * mangling according to what it has been done already to this packet. - */ - status = ct->status; - - nf_ct_put(ct); - ct = nf_ct_tuplehash_to_ctrack(h); - nf_ct_set(skb, ct, ctinfo); - - nat_hook = rcu_dereference(nf_nat_hook); - if (!nat_hook) - return NF_ACCEPT; - - if (status & IPS_SRC_NAT) { - unsigned int verdict = nat_hook->manip_pkt(skb, ct, - NF_NAT_MANIP_SRC, - IP_CT_DIR_ORIGINAL); - if (verdict != NF_ACCEPT) - return verdict; - } - - if (status & IPS_DST_NAT) { - unsigned int verdict = nat_hook->manip_pkt(skb, ct, - NF_NAT_MANIP_DST, - IP_CT_DIR_ORIGINAL); - if (verdict != NF_ACCEPT) - return verdict; - } - - return NF_ACCEPT; -} - /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). */ @@ -2290,17 +2256,6 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) if (!ct) return NF_ACCEPT; - if (!nf_ct_is_confirmed(ct)) { - int ret = __nf_conntrack_update(net, skb, ct, ctinfo); - - if (ret != NF_ACCEPT) - return ret; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return NF_ACCEPT; - } - return nf_confirm_cthelper(skb, ct, ctinfo); } @@ -2558,12 +2513,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) @@ -2757,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 69948e1d6974..af68c64acaab 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -162,6 +162,14 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, return ret; } +static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (local64_read(&e->timestamp)) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif +} + int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 portid, int report) { @@ -186,6 +194,8 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; + nf_ct_ecache_tstamp_refresh(e); + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, @@ -297,6 +307,18 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) } } +static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + u64 ts = 0; + + if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + ts = ktime_get_real_ns(); + + local64_set(&e->timestamp, ts); +#endif +} + bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); @@ -326,6 +348,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { + nf_ct_ecache_tstamp_new(ct, e); e->ctmask = ctmask; e->expmask = expmask; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 21fa550966f0..cfc2daa3fc7f 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { - struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); + struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); @@ -118,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; @@ -214,11 +214,11 @@ nf_ct_find_expectation(struct net *net, if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; - } else if (del_timer(&exp->timeout)) { + } else if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } - /* Undo exp->master refcnt increase, if del_timer() failed */ + /* Undo exp->master refcnt increase, if timer_delete() failed */ nf_ct_put(exp->master); return NULL; @@ -520,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } @@ -550,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net, if (!net_eq(nf_ct_exp_net(exp), net)) continue; - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5a9bce24f3c3..14f73872f647 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, if (info->timeout > 0) { pr_debug("nf_ct_ras: set RAS connection timeout to " "%u seconds\n", info->timeout); - nf_ct_refresh(ct, skb, info->timeout * HZ); + nf_ct_refresh(ct, info->timeout * HZ); /* Set expect timeout */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, skb, 30 * HZ); + nf_ct_refresh(ct, 30 * HZ); return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3b846cbdc050..2cc0fde23344 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -357,11 +357,11 @@ nla_put_failure: static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return 0; ret = -1; @@ -369,20 +369,37 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif -#ifdef CONFIG_NF_CONNTRACK_LABELS +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + + if (e) { + u64 ts = local64_read(&e->timestamp); + + if (ts) + return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, + cpu_to_be64(ts), CTA_TIMESTAMP_PAD); + } +#endif + return 0; +} + static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -391,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) return 0; return nla_total_size(sizeof(labels->bits)); } +#endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) @@ -411,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -#else -#define ctnetlink_dump_labels(a, b) (0) -#define ctnetlink_label_size(a) (0) -#endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -652,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } -#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { @@ -667,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK - int len, ret; + int ret; - ret = security_secid_to_secctx(ct->secmark, NULL, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, NULL); + if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ - + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ + + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif @@ -690,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) return 0; #endif } +#endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) @@ -720,6 +734,9 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ +#endif ; } @@ -841,6 +858,10 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif + + if (ctnetlink_dump_event_timestamp(skb, ct)) + goto nla_put_failure; + nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); @@ -1560,6 +1581,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, [CTA_STATUS_MASK] = { .type = NLA_U32 }, + [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1579,9 +1601,6 @@ static int ctnetlink_flush_conntrack(struct net *net, }; if (ctnetlink_needs_filter(family, cda)) { - if (cda[CTA_FILTER]) - return -EOPNOTSUPP; - filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1610,14 +1629,14 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (err < 0) return err; - if (cda[CTA_TUPLE_ORIG]) + if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, family, &zone); - else if (cda[CTA_TUPLE_REPLY]) + else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, family, &zone); else { - u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; + u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, @@ -3420,7 +3439,8 @@ static int ctnetlink_del_expect(struct sk_buff *skb, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } @@ -3428,7 +3448,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_expect_put(exp); @@ -3457,7 +3477,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { - if (!del_timer(&x->timeout)) + if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + @@ -3875,7 +3895,7 @@ static int __init ctnetlink_init(void) { int ret; - NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); + NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index e2db1f4ec2df..ebc4f733bb2e 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -525,7 +525,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; @@ -533,7 +533,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, /* pull again, including possible 48 bit sequences and subtype header */ dh = dccp_header_pointer(skb, dataoff, dh, &_dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 1020d67600a9..327b8059025d 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = { [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 + [ICMPV6_MLD2_REPORT - 130] = 1, + [ICMPV6_MRDISC_ADV - 130] = 1, + [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 4cc97f971264..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index ae493599a3ef..0c1d086e96cb 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -14,7 +14,7 @@ #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d0eac27f6ba0..ca748f8dbff1 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; @@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee98ce5b816..6c4cff10357d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,9 +22,6 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> -#ifdef CONFIG_LWTUNNEL -#include <net/netfilter/nf_hooks_lwtunnel.h> -#endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; @@ -101,69 +98,87 @@ struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; + unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, + struct ct_iter_state *st) { - struct ct_iter_state *st = seq->private; + struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int i; - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} + for (i = st->bucket; i < st->htable_size; i++) { + unsigned int skip = 0; -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; +restart: + hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct hlist_nulls_node *tmp = n; + + if (!net_eq(net, nf_ct_net(ct))) + continue; + + if (++skip <= st->skip_elems) + continue; + + /* h should be returned, skip to nulls marker. */ + while (!is_a_nulls(tmp)) + tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return NULL; + /* check if h is still linked to hash[i] */ + if (get_nulls_value(tmp) != i) { + skip = 0; + goto restart; + } + + st->skip_elems = skip; + st->bucket = i; + return h; } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); + skip = 0; + if (get_nulls_value(n) != i) + goto restart; - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; + st->skip_elems = 0; + } + + st->bucket = i; + return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); + + if (*pos == 0) { + st->skip_elems = 0; + st->bucket = 0; + } else if (st->skip_elems) { + /* resume from last dumped entry */ + st->skip_elems--; + } + + return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct ct_iter_state *st = s->private; + struct net *net = seq_file_net(s); + (*pos)++; - return ct_get_next(s, v); + return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) @@ -175,17 +190,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) @@ -527,7 +541,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_count); static unsigned int nf_conntrack_htable_size_user __read_mostly; static int -nf_conntrack_hash_sysctl(struct ctl_table *table, int write, +nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -612,22 +626,19 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif -#ifdef CONFIG_LWTUNNEL - NF_SYSCTL_CT_LWTUNNEL, -#endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -663,7 +674,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", @@ -948,16 +961,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_LWTUNNEL - [NF_SYSCTL_CT_LWTUNNEL] = { - .procname = "nf_hooks_lwtunnel", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = nf_hooks_lwtunnel_sysctl_handler, - }, -#endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -966,9 +969,10 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, @@ -1122,7 +1126,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return ¤t->net_xmit.nf_dup_skb_recursion; +} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c new file mode 100644 index 000000000000..4a5f5195f2d2 --- /dev/null +++ b/net/netfilter/nf_flow_table_bpf.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable Flow Table Helpers for XDP hook + * + * These are called from the XDP programs. + * Note that it is allowed to break compatibility for these functions since + * the interface they are exposed through to BPF programs is explicitly + * unstable. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <net/netfilter/nf_flow_table.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/xdp.h> + +/* bpf_flowtable_opts - options for bpf flowtable helpers + * @error: out parameter, set for any encountered error + */ +struct bpf_flowtable_opts { + s32 error; +}; + +enum { + NF_BPF_FLOWTABLE_OPTS_SZ = 4, +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in nf_flow_table BTF"); + +__bpf_kfunc_start_defs(); + +static struct flow_offload_tuple_rhash * +bpf_xdp_flow_tuple_lookup(struct net_device *dev, + struct flow_offload_tuple *tuple, __be16 proto) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *nf_flow_table; + struct flow_offload *nf_flow; + + nf_flow_table = nf_flowtable_by_dev(dev); + if (!nf_flow_table) + return ERR_PTR(-ENOENT); + + tuplehash = flow_offload_lookup(nf_flow_table, tuple); + if (!tuplehash) + return ERR_PTR(-ENOENT); + + nf_flow = container_of(tuplehash, struct flow_offload, + tuplehash[tuplehash->tuple.dir]); + flow_offload_refresh(nf_flow_table, nf_flow, false); + + return tuplehash; +} + +__bpf_kfunc struct flow_offload_tuple_rhash * +bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple, + struct bpf_flowtable_opts *opts, u32 opts_len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + struct flow_offload_tuple tuple = { + .iifidx = fib_tuple->ifindex, + .l3proto = fib_tuple->family, + .l4proto = fib_tuple->l4_protocol, + .src_port = fib_tuple->sport, + .dst_port = fib_tuple->dport, + }; + struct flow_offload_tuple_rhash *tuplehash; + __be16 proto; + + if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) { + opts->error = -EINVAL; + return NULL; + } + + switch (fib_tuple->family) { + case AF_INET: + tuple.src_v4.s_addr = fib_tuple->ipv4_src; + tuple.dst_v4.s_addr = fib_tuple->ipv4_dst; + proto = htons(ETH_P_IP); + break; + case AF_INET6: + tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src; + tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst; + proto = htons(ETH_P_IPV6); + break; + default: + opts->error = -EAFNOSUPPORT; + return NULL; + } + + tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto); + if (IS_ERR(tuplehash)) { + opts->error = PTR_ERR(tuplehash); + return NULL; + } + + return tuplehash; +} + +__diag_pop() + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(nf_ft_kfunc_set) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_KFUNCS_END(nf_ft_kfunc_set) + +static const struct btf_kfunc_id_set nf_flow_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_ft_kfunc_set, +}; + +int nf_flow_register_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_flow_kfunc_set); +} +EXPORT_SYMBOL_GPL(nf_flow_register_bpf); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index a0571339239c..9441ac3d8c1a 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -77,12 +77,8 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { - const struct rt6_info *rt; - - if (flow_tuple->l3proto == NFPROTO_IPV6) { - rt = (const struct rt6_info *)flow_tuple->dst_cache; - return rt6_get_cookie(rt); - } + if (flow_tuple->l3proto == NFPROTO_IPV6) + return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } @@ -165,42 +161,86 @@ void flow_offload_route_init(struct flow_offload *flow, } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +static inline bool nf_flow_has_expired(const struct flow_offload *flow) { + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) +{ + struct ip_ct_tcp *tcp = &ct->proto.tcp; + + spin_lock_bh(&ct->lock); + if (tcp->state != tcp_state) + tcp->state = tcp_state; + + /* syn packet triggers the TCP reopen case from conntrack. */ + if (tcp->state == TCP_CONNTRACK_CLOSE) + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + /* Conntrack state is outdated due to offload bypass. + * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks + * TCP reset validation will fail. + */ tcp->seen[0].td_maxwin = 0; + tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; + tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; + spin_unlock_bh(&ct->lock); } -static void flow_offload_fixup_ct(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct flow_offload *flow) { + struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); + bool expired, closing = false; + u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { - struct nf_tcp_net *tn = nf_tcp_pernet(net); - - flow_offload_fixup_tcp(&ct->proto.tcp); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + u8 tcp_state; + + /* Enter CLOSE state if fin/rst packet has been seen, this + * allows TCP reopen from conntrack. Otherwise, pick up from + * the last seen TCP state. + */ + closing = test_bit(NF_FLOW_CLOSING, &flow->flags); + if (closing) { + flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); + timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); + expired = false; + } else { + tcp_state = READ_ONCE(ct->proto.tcp.state); + flow_offload_fixup_tcp(ct, tcp_state); + timeout = READ_ONCE(tn->timeouts[tcp_state]); + expired = nf_flow_has_expired(flow); + } + offload_timeout = READ_ONCE(tn->offload_timeout); - timeout = tn->timeouts[ct->proto.tcp.state]; - timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { - struct nf_udp_net *tn = nf_udp_pernet(net); + const struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; - timeout = tn->timeouts[state]; - timeout -= tn->offload_timeout; + timeout = READ_ONCE(tn->timeouts[state]); + expired = nf_flow_has_expired(flow); + offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } + if (expired) + timeout -= offload_timeout; + if (timeout < 0) timeout = 0; - if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) - WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); + if (closing || + nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) @@ -298,7 +338,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - nf_ct_offload_timeout(flow->ct); + nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); @@ -320,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, else return; - if (likely(!nf_flowtable_hw_offload(flow_table))) + if (likely(!nf_flowtable_hw_offload(flow_table)) || + test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return nf_flow_timeout_delta(flow->timeout) <= 0; -} - static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -347,8 +383,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - set_bit(NF_FLOW_TEARDOWN, &flow->flags); - flow_offload_fixup_ct(flow->ct); + if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -418,15 +454,118 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, return flow_table->type->gc && flow_table->type->gc(flow); } +/** + * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry + * @ct: Flowtable offloaded tcp ct + * + * Return: number of seconds when ct entry should expire. + */ +static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) +{ + u8 state = READ_ONCE(ct->proto.tcp.state); + + switch (state) { + case TCP_CONNTRACK_SYN_SENT: + case TCP_CONNTRACK_SYN_RECV: + return 0; + case TCP_CONNTRACK_ESTABLISHED: + return NF_CT_DAY; + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_CLOSE_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + return 5 * 60 * HZ; + case TCP_CONNTRACK_CLOSE: + return 0; + } + + return 0; +} + +/** + * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry + * @ct: Flowtable offloaded ct + * + * Datapath lookups in the conntrack table will evict nf_conn entries + * if they have expired. + * + * Once nf_conn entries have been offloaded, nf_conntrack might not see any + * packets anymore. Thus ct->timeout is no longer refreshed and ct can + * be evicted. + * + * To avoid the need for an additional check on the offload bit for every + * packet processed via nf_conntrack_in(), set an arbitrary timeout large + * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT + * from the packet path via nf_ct_is_expired(). + */ +static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) +{ + static const u32 min_timeout = 5 * 60 * HZ; + u32 expires = nf_ct_expires(ct); + + /* normal case: large enough timeout, nothing to do. */ + if (likely(expires >= min_timeout)) + return; + + /* must check offload bit after this, we do not hold any locks. + * flowtable and ct entries could have been removed on another CPU. + */ + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return; + + /* load ct->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + + if (nf_ct_is_confirmed(ct) && + test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + u8 l4proto = nf_ct_protonum(ct); + u32 new_timeout = true; + + switch (l4proto) { + case IPPROTO_UDP: + new_timeout = NF_CT_DAY; + break; + case IPPROTO_TCP: + new_timeout = nf_flow_table_tcp_timeout(ct); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* Update to ct->timeout from nf_conntrack happens + * without holding ct->lock. + * + * Use cmpxchg to ensure timeout extension doesn't + * happen when we race with conntrack datapath. + * + * The inverse -- datapath updating ->timeout right + * after this -- is fine, datapath is authoritative. + */ + if (new_timeout) { + new_timeout += nfct_time_stamp; + cmpxchg(&ct->timeout, expires, new_timeout); + } + } + + nf_ct_put(ct); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { + bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - nf_flow_custom_gc(flow_table, flow)) + nf_flow_custom_gc(flow_table, flow)) { flow_offload_teardown(flow); + teardown = true; + } else if (!teardown) { + nf_flow_table_extend_ct_timeout(flow->ct); + } - if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); @@ -435,6 +574,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && + test_bit(NF_FLOW_HW, &flow->flags) && + !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { + nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } @@ -674,8 +817,14 @@ static int __init nf_flow_table_module_init(void) if (ret) goto out_offload; + ret = nf_flow_register_bpf(); + if (ret) + goto out_bpf; + return 0; +out_bpf: + nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff..b0f199171932 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -17,11 +17,15 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return NF_ACCEPT; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade76409..8cd4cf7ae211 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; tcph = (void *)(skb_network_header(skb) + thoff); - if (unlikely(tcph->fin || tcph->rst)) { + if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } + if ((tcph->fin || tcph->rst) && + !test_bit(NF_FLOW_CLOSING, &flow->flags)) + set_bit(NF_FLOW_CLOSING, &flow->flags); + return 0; } @@ -157,7 +161,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; @@ -273,13 +277,17 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return false; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; @@ -287,7 +295,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +325,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; @@ -432,7 +441,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -444,7 +453,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); @@ -727,7 +736,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; @@ -739,7 +748,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index a010b25076ca..e06bc36f49fe 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; + struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; @@ -1192,7 +1192,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, int err; if (!nf_flowtable_hw_offload(flowtable)) - return 0; + return nf_flow_offload_xdp_setup(flowtable, dev, cmd); if (dev->netdev_ops->ndo_setup_tc) err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c new file mode 100644 index 000000000000..e1252d042699 --- /dev/null +++ b/net/netfilter/nf_flow_table_xdp.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> + +struct flow_offload_xdp_ft { + struct list_head head; + struct nf_flowtable *ft; + struct rcu_head rcuhead; +}; + +struct flow_offload_xdp { + struct hlist_node hnode; + unsigned long net_device_addr; + struct list_head head; +}; + +#define NF_XDP_HT_BITS 4 +static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS); +static DEFINE_MUTEX(nf_xdp_hashtable_lock); + +/* caller must hold rcu read lock */ +struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev) +{ + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp *iter; + + hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + struct flow_offload_xdp_ft *ft_elem; + + /* The user is supposed to insert a given net_device + * just into a single nf_flowtable so we always return + * the first element here. + */ + ft_elem = list_first_or_null_rcu(&iter->head, + struct flow_offload_xdp_ft, + head); + return ft_elem ? ft_elem->ft : NULL; + } + } + + return NULL; +} + +static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + struct flow_offload_xdp_ft *ft_elem; + + ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT); + if (!ft_elem) + return -ENOMEM; + + ft_elem->ft = ft; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (!elem) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT); + if (!elem) + goto err_unlock; + + elem->net_device_addr = key; + INIT_LIST_HEAD(&elem->head); + hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key); + } + list_add_tail_rcu(&ft_elem->head, &elem->head); + + mutex_unlock(&nf_xdp_hashtable_lock); + + return 0; + +err_unlock: + mutex_unlock(&nf_xdp_hashtable_lock); + kfree(ft_elem); + + return -ENOMEM; +} + +static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft, + const struct net_device *dev) +{ + struct flow_offload_xdp *iter, *elem = NULL; + unsigned long key = (unsigned long)dev; + + mutex_lock(&nf_xdp_hashtable_lock); + + hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) { + if (key == iter->net_device_addr) { + elem = iter; + break; + } + } + + if (elem) { + struct flow_offload_xdp_ft *ft_elem, *ft_next; + + list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) { + if (ft_elem->ft == ft) { + list_del_rcu(&ft_elem->head); + kfree_rcu(ft_elem, rcuhead); + } + } + + if (list_empty(&elem->head)) + hash_del_rcu(&elem->hnode); + else + elem = NULL; + } + + mutex_unlock(&nf_xdp_hashtable_lock); + + if (elem) { + synchronize_rcu(); + kfree(elem); + } +} + +int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + switch (cmd) { + case FLOW_BLOCK_BIND: + return nf_flowtable_by_dev_insert(flowtable, dev); + case FLOW_BLOCK_UNBIND: + nf_flowtable_by_dev_remove(flowtable, dev); + return 0; + } + + WARN_ON_ONCE(1); + return 0; +} diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 00e89ffd78f6..2d890dd04ff8 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -3,6 +3,9 @@ #include <linux/sysctl.h> #include <net/lwtunnel.h> #include <net/netfilter/nf_hooks_lwtunnel.h> +#include <linux/netfilter.h> + +#include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { @@ -25,7 +28,7 @@ static inline int nf_hooks_lwtunnel_set(int enable) } #ifdef CONFIG_SYSCTL -int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, +int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int proc_nf_hooks_lwtunnel_enabled = 0; @@ -50,4 +53,71 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} +#else +int __init netfilter_lwtunnel_init(void) { return 0; } +void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179f0..25403023060b 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 370f8231385c..6dd0de33eebd 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -395,7 +395,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -406,10 +406,9 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; -static int nf_log_proc_dostring(struct ctl_table *table, int write, +static int nf_log_proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -514,7 +513,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 58402226045e..86d5fc5d28e3 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -216,7 +216,9 @@ nf_log_dump_tcp_header(struct nf_log_buf *m, /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->ae) + nf_log_buf_add(m, "AE "); if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) @@ -516,7 +518,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ @@ -526,7 +528,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ + /* maxlen = 230+ 91 + 230 + 255 = 806 */ } static noinline_for_stack void diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 016c816d91cb..f391cd267922 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -183,7 +183,35 @@ hash_by_src(const struct net *net, return reciprocal_scale(hash, nf_nat_htable_size); } -/* Is this tuple already taken? (not by us) */ +/** + * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry + * @tuple: proposed NAT binding + * @ignored_conntrack: our (unconfirmed) conntrack entry + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple in either direction. + * + * Example: + * INITIATOR -> NAT/PAT -> RESPONDER + * + * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). + * Then, later, NAT/PAT itself also connects to RESPONDER. + * + * This will not work if the SNAT done earlier has same IP:PORT source pair. + * + * Conntrack table has: + * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * and new locally originating connection wants: + * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT + * + * ... which would mean incoming packets cannot be distinguished between + * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). + * + * @return: true if the proposed NAT mapping collides with an existing entry. + */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) @@ -200,6 +228,100 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_allow_clash(const struct nf_conn *ct) +{ + return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; +} + +/** + * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry + * @tuple: proposed NAT binding + * @ignored_ct: our (unconfirmed) conntrack entry + * + * Same as nf_nat_used_tuple, but also check for rare clash in reverse + * direction. Should be called only when @tuple has not been altered, i.e. + * @ignored_conntrack will not be subject to NAT. + * + * @return: true if the proposed NAT mapping collides with existing entry. + */ +static noinline bool +nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_ct) +{ + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; + const struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + if (!nf_nat_used_tuple(tuple, ignored_ct)) + return false; + + if (!nf_nat_allow_clash(ignored_ct)) + return true; + + /* Initial choice clashes with existing conntrack. + * Check for (rare) reverse collision. + * + * This can happen when new packets are received in both directions + * at the exact same time on different CPUs. + * + * Without SMP, first packet creates new conntrack entry and second + * packet is resolved as established reply packet. + * + * With parallel processing, both packets could be picked up as + * new and both get their own ct entry allocated. + * + * If ignored_conntrack and colliding ct are not subject to NAT then + * pretend the tuple is available and let later clash resolution + * handle this at insertion time. + * + * Without it, the 'reply' packet has its source port rewritten + * by nat engine. + */ + if (READ_ONCE(ignored_ct->status) & uses_nat) + return true; + + net = nf_ct_net(ignored_ct); + zone = nf_ct_zone(ignored_ct); + + thash = nf_conntrack_find_get(net, zone, tuple); + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } + + ct = nf_ct_tuplehash_to_ctrack(thash); + + /* NB: IP_CT_DIR_ORIGINAL should be impossible because + * nf_nat_used_tuple() handles origin collisions. + * + * Handle remote chance other CPU confirmed its ct right after. + */ + if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) + goto out; + + /* clashing connection subject to NAT? Retry with new tuple. */ + if (READ_ONCE(ct->status) & uses_nat) + goto out; + + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && + nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { + taken = false; + goto out; + } +out: + nf_ct_put(ct); + return taken; +} + static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | @@ -611,7 +733,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { - if (!nf_nat_used_tuple(orig_tuple, ct)) { + if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } @@ -974,10 +1096,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], range->flags |= NF_NAT_RANGE_MAP_IPS; } - if (tb[CTA_NAT_V4_MAXIP]) - range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); - else - range->max_addr.ip = range->min_addr.ip; + range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], + range->min_addr.ip); return 0; } @@ -1104,7 +1224,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); - nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); + nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; @@ -1208,7 +1328,6 @@ static const struct nf_nat_hook nat_hook = { #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif - .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 5b140c12b7df..3fa3f5dfb264 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> #include <linux/proc_fs.h> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5fa3d3540c93..24c71ecb2179 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -26,12 +26,14 @@ #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) #define NFT_SET_MAX_ANONLEN 16 +/* limit compaction to avoid huge kmalloc/krealloc sizes. */ +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem)) + unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); -static LIST_HEAD(nf_tables_destroy_list); static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); static DEFINE_SPINLOCK(nf_tables_gc_list_lock); @@ -122,7 +124,6 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); -static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_trans_gc_work(struct work_struct *work); static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); @@ -146,6 +147,8 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; + + bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, @@ -153,14 +156,18 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, { struct nft_trans *trans; - trans = kzalloc(sizeof(struct nft_trans) + size, gfp); + trans = kzalloc(size, gfp); if (trans == NULL) return NULL; INIT_LIST_HEAD(&trans->list); - INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; - trans->ctx = *ctx; + + trans->net = ctx->net; + trans->table = ctx->table; + trans->seq = ctx->seq; + trans->flags = ctx->flags; + trans->report = ctx->report; return trans; } @@ -171,10 +178,26 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } +static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + case NFT_MSG_NEWSET: + return container_of(trans, struct nft_trans_binding, nft_trans); + } + + return NULL; +} + static void nft_trans_list_del(struct nft_trans *trans) { + struct nft_trans_binding *trans_binding; + list_del(&trans->list); - list_del(&trans->binding_list); + + trans_binding = nft_trans_get_binding(trans); + if (trans_binding) + list_del(&trans_binding->binding_list); } static void nft_trans_destroy(struct nft_trans *trans) @@ -236,7 +259,7 @@ static void __nft_chain_trans_bind(const struct nft_ctx *ctx, nft_trans_chain_bound(trans) = bind; break; case NFT_MSG_NEWRULE: - if (trans->ctx.chain == chain) + if (nft_trans_rule_chain(trans) == chain) nft_trans_rule_bound(trans) = bind; break; } @@ -277,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct nf_hook_ops *ops, *next; + + list_for_each_entry_safe(ops, next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } +} + +static void nft_netdev_hook_free(struct nft_hook *hook) +{ + nft_netdev_hook_free_ops(hook); + kfree(hook); +} + +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); +} + +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) +{ + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -369,24 +427,132 @@ static void nf_tables_unregister_hook(struct net *net, return __nf_tables_unregister_hook(net, table, chain, false); } +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b) +{ + /* NB: the ->bound equality check is defensive, at this time we only merge + * a new nft_trans_elem transaction request with the transaction tail + * element, but a->bound != b->bound would imply a NEWRULE transaction + * is queued in-between. + * + * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents + * huge krealloc() requests. + */ + return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS; +} + +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, + struct nft_trans_elem *tail, + struct nft_trans_elem *trans, + gfp_t gfp) +{ + unsigned int nelems, old_nelems = tail->nelems; + struct nft_trans_elem *new_trans; + + if (!nft_trans_collapse_set_elem_allowed(tail, trans)) + return false; + + /* "cannot happen", at this time userspace element add + * requests always allocate a new transaction element. + * + * This serves as a reminder to adjust the list_add_tail + * logic below in case this ever changes. + */ + if (WARN_ON_ONCE(trans->nelems != 1)) + return false; + + if (check_add_overflow(old_nelems, trans->nelems, &nelems)) + return false; + + /* krealloc might free tail which invalidates list pointers */ + list_del_init(&tail->nft_trans.list); + + new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); + if (!new_trans) { + list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); + return false; + } + + /* + * new_trans->nft_trans.list contains garbage, but + * list_add_tail() doesn't care. + */ + new_trans->nelems = nelems; + new_trans->elems[old_nelems] = trans->elems[0]; + list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list); + + return true; +} + +static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, + struct nft_trans *trans, gfp_t gfp) +{ + struct nft_trans *tail; + + if (list_empty(&nft_net->commit_list)) + return false; + + tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list); + + if (tail->msg_type != trans->msg_type) + return false; + + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_collapse_set_elem(nft_net, + nft_trans_container_elem(tail), + nft_trans_container_elem(trans), gfp); + } + + return false; +} + static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); + struct nft_trans_binding *binding; + struct nft_trans_set *trans_set; + + list_add_tail(&trans->list, &nft_net->commit_list); + + binding = nft_trans_get_binding(trans); + if (!binding) + return; switch (trans->msg_type) { case NFT_MSG_NEWSET: + trans_set = nft_trans_container_set(trans); + if (!nft_trans_set_update(trans) && nft_set_is_anonymous(nft_trans_set(trans))) - list_add_tail(&trans->binding_list, &nft_net->binding_list); + list_add_tail(&binding->binding_list, &nft_net->binding_list); + + list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list); break; case NFT_MSG_NEWCHAIN: if (!nft_trans_chain_update(trans) && nft_chain_binding(nft_trans_chain(trans))) - list_add_tail(&trans->binding_list, &nft_net->binding_list); + list_add_tail(&binding->binding_list, &nft_net->binding_list); break; } +} - list_add_tail(&trans->list, &nft_net->commit_list); +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, + gfp_t gfp) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + + WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && + trans->msg_type != NFT_MSG_DELSETELEM); + + might_alloc(gfp); + + if (nft_trans_try_collapse(nft_net, trans, gfp)) { + kfree(trans); + return; + } + + nft_trans_commit_list_add_tail(net, trans); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -416,11 +582,28 @@ static int nft_deltable(struct nft_ctx *ctx) return err; } -static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +static struct nft_trans * +nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type) { + struct nft_trans_chain *trans_chain; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (!trans) + return NULL; + + trans_chain = nft_trans_container_chain(trans); + INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list); + trans_chain->chain = ctx->chain; + + return trans; +} + +static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc_chain(ctx, msg_type); if (trans == NULL) return ERR_PTR(-ENOMEM); @@ -432,7 +615,6 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } - nft_trans_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; @@ -505,6 +687,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; + nft_trans_rule_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; @@ -560,12 +743,17 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set, const struct nft_set_desc *desc) { + struct nft_trans_set *trans_set; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); if (trans == NULL) return -ENOMEM; + trans_set = nft_trans_container_set(trans); + INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); + INIT_LIST_HEAD(&trans_set->list_trans_newset); + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); @@ -594,6 +782,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +811,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -626,6 +821,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -1200,6 +1396,26 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) __NFT_TABLE_F_WAS_AWAKEN | \ __NFT_TABLE_F_WAS_ORPHAN) +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(nft_trans_chain(trans))))) + return true; + } + + return false; +} + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -1226,7 +1442,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ - if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, @@ -1587,15 +1803,15 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, return nft_flush_table(&ctx); } -static void nf_tables_table_destroy(struct nft_ctx *ctx) +static void nf_tables_table_destroy(struct nft_table *table) { - if (WARN_ON(ctx->table->use > 0)) + if (WARN_ON(table->use > 0)) return; - rhltable_destroy(&ctx->table->chains_ht); - kfree(ctx->table->name); - kfree(ctx->table->udata); - kfree(ctx->table); + rhltable_destroy(&table->chains_ht); + kfree(table->name); + kfree(table->udata); + kfree(table); } void nft_register_chain_type(const struct nft_chain_type *ctype) @@ -1742,7 +1958,8 @@ nla_put_failure: return -ENOSPC; } -static int nft_dump_basechain_hook(struct sk_buff *skb, int family, +static int nft_dump_basechain_hook(struct sk_buff *skb, + const struct net *net, int family, const struct nft_base_chain *basechain, const struct list_head *hook_list) { @@ -1767,19 +1984,21 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (!hook_list) hook_list = &basechain->hook_list; - list_for_each_entry(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (!first) first = hook; - if (nla_put_string(skb, NFTA_DEVICE_NAME, - hook->ops.dev->name)) + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + nla_put(skb, NFTA_HOOK_DEV, + first->ifnamelen, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1818,7 +2037,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, family, basechain, hook_list)) + if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -2000,14 +2219,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return ERR_PTR(-EINVAL); + return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -2021,18 +2240,19 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct nft_trans *trans) +static void nft_chain_stats_replace(struct nft_trans_chain *trans) { - struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); + const struct nft_trans *t = &trans->nft_trans_binding.nft_trans; + struct nft_base_chain *chain = nft_base_chain(trans->chain); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) return; - nft_trans_chain_stats(trans) = - rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + trans->stats = + rcu_replace_pointer(chain->stats, trans->stats, + lockdep_commit_lock_is_held(t->net)); - if (!nft_trans_chain_stats(trans)) + if (!trans->stats) static_branch_inc(&nft_counters_enabled); } @@ -2050,9 +2270,9 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->blob_next); } -void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = ctx->chain; + const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) @@ -2064,11 +2284,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); - if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -2089,34 +2309,43 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { + struct nf_hook_ops *ops; struct net_device *dev; - char ifname[IFNAMSIZ]; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + if (!hook) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&hook->ops_list); + + err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); + if (err < 0) + goto err_hook_free; + + hook->ifnamelen = nla_len(attr); - nla_strscpy(ifname, attr, IFNAMSIZ); /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. */ - dev = __dev_get_by_name(net, ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -2126,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (this->ops.dev == hook->ops.dev) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -2156,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); - kfree(hook); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -2174,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -2317,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } @@ -2370,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2378,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2412,9 +2645,8 @@ int nft_chain_add(struct nft_table *table, struct nft_chain *chain) static u64 chain_id; -static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy, u32 flags, - struct netlink_ext_ack *extack) +static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, + u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2430,6 +2662,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; @@ -2447,10 +2682,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { + if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); - return PTR_ERR(stats); + return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } @@ -2550,7 +2785,7 @@ err_chain_add: err_trans: nft_use_dec_restore(&table->use); err_destroy_chain: - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(chain); return err; } @@ -2564,7 +2799,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; - struct nft_stats *stats = NULL; + struct nft_stats __percpu *stats = NULL; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -2596,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { - h->ops.pf = basechain->ops.pf; - h->ops.hooknum = basechain->ops.hooknum; - h->ops.priority = basechain->ops.priority; - h->ops.priv = basechain->ops.priv; - h->ops.hook = basechain->ops.hook; + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } } } else { @@ -2631,6 +2868,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } } + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { @@ -2641,11 +2885,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; + + unregister = true; } } - unregister = true; - if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; @@ -2653,15 +2897,14 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { - err = PTR_ERR(stats); + if (IS_ERR_PCPU(stats)) { + err = PTR_ERR_PCPU(stats); goto err_hooks; } } err = -ENOMEM; - trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN, - sizeof(struct nft_trans_chain)); + trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN); if (trans == NULL) goto err_trans; @@ -2687,7 +2930,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = -EEXIST; list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && - tmp->ctx.table == table && + tmp->table == table && nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { @@ -2716,10 +2959,12 @@ err_trans: err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { - if (unregister) - nf_unregister_net_hook(ctx->net, &h->ops); + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } list_del(&h->list); - kfree_rcu(h, rcu); + nft_netdev_hook_free_rcu(h); } module_put(hook.type->owner); } @@ -2736,13 +2981,11 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net, struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { - struct nft_chain *chain = trans->ctx.chain; - if (trans->msg_type == NFT_MSG_NEWCHAIN && - chain->table == table && + nft_trans_chain(trans)->table == table && id == nft_trans_chain_id(trans) && - nft_active_genmask(chain, genmask)) - return chain; + nft_active_genmask(nft_trans_chain(trans), genmask)) + return nft_trans_chain(trans); } return ERR_PTR(-ENOENT); } @@ -2845,7 +3088,7 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, extack); } - return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); + return nf_tables_addchain(&ctx, family, policy, flags, extack); } static int nft_delchain_hook(struct nft_ctx *ctx, @@ -2860,6 +3103,9 @@ static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_trans *trans; int err; + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) @@ -2874,8 +3120,7 @@ static int nft_delchain_hook(struct nft_ctx *ctx, list_move(&hook->list, &chain_del_list); } - trans = nft_trans_alloc(ctx, NFT_MSG_DELCHAIN, - sizeof(struct nft_trans_chain)); + trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); if (!trans) { err = -ENOMEM; goto err_chain_del_hook; @@ -2944,7 +3189,8 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { @@ -3026,7 +3272,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3058,9 +3304,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -3197,25 +3447,37 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; + rcu_read_lock(); + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (!type) - return -ENOENT; + if (!type) { + err = -ENOENT; + goto out_unlock; + } - if (!type->inner_ops) - return -EOPNOTSUPP; + if (!type->inner_ops) { + err = -EOPNOTSUPP; + goto out_unlock; + } err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) - goto err_nla_parse; + goto out_unlock; info->attr = nla; info->ops = type->inner_ops; + /* No module reference will be taken on type->owner. + * Presence of type->inner_ops implies that the expression + * is builtin, so it cannot go away. + */ + rcu_read_unlock(); return 0; -err_nla_parse: +out_unlock: + rcu_read_unlock(); return err; } @@ -3287,7 +3549,7 @@ err_expr_parse: return ERR_PTR(err); } -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; @@ -3295,7 +3557,7 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) return -EINVAL; dst->ops = src->ops; - err = src->ops->clone(dst, src); + err = src->ops->clone(dst, src, gfp); if (err < 0) return err; @@ -3314,13 +3576,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *__nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry_rcu(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list, + lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } @@ -3328,13 +3592,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -3635,7 +3900,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ +/* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) @@ -3662,7 +3927,7 @@ nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, return ERR_CAST(chain); } - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return ERR_CAST(rule); @@ -3771,16 +4036,27 @@ void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) kfree(rule); } +/* can only be used if rule is no longer visible to dumps */ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { + lockdep_commit_lock_is_held(ctx->net); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. + */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; - const struct nft_data *data; struct nft_rule *rule; int err; @@ -3798,7 +4074,10 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; - err = expr->ops->validate(ctx, expr, &data); + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ + err = expr->ops->validate(ctx, expr); if (err < 0) return err; } @@ -3841,6 +4120,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3864,17 +4146,21 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -3943,7 +4229,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nft_rule_lookup(chain, handle); + rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); @@ -3965,7 +4251,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); @@ -4136,7 +4422,7 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE && - trans->ctx.chain == chain && + nft_trans_rule_chain(trans) == chain && id == nft_trans_rule_id(trans)) return nft_trans_rule(trans); } @@ -4182,7 +4468,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { if (PTR_ERR(rule) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) @@ -4331,6 +4617,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { @@ -4342,7 +4630,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; -static struct nft_set *nft_set_lookup(const struct nft_table *table, +static struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -4350,7 +4639,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -4378,17 +4668,16 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net, { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); - struct nft_trans *trans; + struct nft_trans_set *trans; - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET) { - struct nft_set *set = nft_trans_set(trans); + /* its likely the id we need is at the tail, not at start */ + list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { + struct nft_set *set = trans->set; - if (id == nft_trans_set_id(trans) && - set->table == table && - nft_active_genmask(set, genmask)) - return set; - } + if (id == trans->set_id && + set->table == table && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -4401,7 +4690,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nft_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; @@ -4480,7 +4769,7 @@ int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return -ERANGE; ms *= NSEC_PER_MSEC; - *result = nsecs_to_jiffies64(ms); + *result = nsecs_to_jiffies64(ms) ? : !!ms; return 0; } @@ -4516,6 +4805,35 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, return 0; } +static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) +{ + if (ops->usize) + return ops->usize(size); + + return size; +} + +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. + */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4586,7 +4904,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (!nest) goto nla_put_failure; if (set->size && - nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + nla_put_be32(skb, NFTA_SET_DESC_SIZE, + htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && @@ -4595,6 +4914,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) @@ -4777,7 +5099,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); @@ -4828,7 +5150,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -4842,12 +5164,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; @@ -4954,6 +5276,15 @@ static bool nft_set_is_same(const struct nft_set *set, return true; } +static u32 nft_set_kernel_size(const struct nft_set_ops *ops, + const struct nft_set_desc *desc) +{ + if (ops->ksize) + return ops->ksize(desc->size); + + return desc->size; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -5113,7 +5444,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5136,6 +5467,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; + if (desc.size) + desc.size = nft_set_kernel_size(set->ops, &desc); + err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5158,6 +5492,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (IS_ERR(ops)) return PTR_ERR(ops); + if (desc.size) + desc.size = nft_set_kernel_size(ops, &desc); + udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); @@ -5315,7 +5652,7 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(table, attr, genmask); + set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { @@ -5363,6 +5700,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5374,7 +5716,8 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -5407,6 +5750,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5454,6 +5798,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5471,6 +5822,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5480,6 +5832,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5506,6 +5859,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { + lockdep_commit_lock_is_held(ctx->net); + switch (phase) { case NFT_TRANS_PREPARE_ERROR: nft_set_trans_unbind(ctx, set); @@ -5566,12 +5921,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { - .len = sizeof(u64), - .align = __alignof__(u64), - }, - [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(u64), - .align = __alignof__(u64), + .len = sizeof(struct nft_timeout), + .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -5673,8 +6024,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), - set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, - set->dlen) < 0) + nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && @@ -5691,25 +6041,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); + u64 set_timeout = READ_ONCE(set->timeout); + __be64 msecs = 0; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - u64 expires, now = get_jiffies_64(); + if (set_timeout != timeout) { + msecs = nf_jiffies64_to_msecs(timeout); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } - expires = *nft_set_ext_expiration(ext); - if (time_before64(now, expires)) - expires -= now; - else - expires = 0; + if (timeout > 0) { + u64 expires, now = get_jiffies_64(); - if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - nf_jiffies64_to_msecs(expires), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); + if (time_before64(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + nf_jiffies64_to_msecs(expires), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5744,6 +6101,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -5854,6 +6214,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -6124,7 +6485,7 @@ static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); @@ -6275,17 +6636,21 @@ err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } -static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { + struct nft_trans_elem *te; struct nft_trans *trans; - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; - nft_trans_elem_set(trans) = set; + te = nft_trans_container_elem(trans); + te->nelems = 1; + te->set = set; + return trans; } @@ -6368,13 +6733,14 @@ struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, nft_set_ext_data(ext), data, set->dlen) < 0) goto err_ext_check; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + nft_set_ext_timeout(ext)->timeout = timeout; + if (expiration == 0) - *nft_set_ext_expiration(ext) += timeout; + expiration = timeout; + + nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration; } - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) - *nft_set_ext_timeout(ext) = timeout; return elem; @@ -6406,28 +6772,52 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } /* Drop references and destroy. Called from gc, dynset and abort path. */ -void nft_set_elem_destroy(const struct nft_set *set, - const struct nft_elem_priv *elem_priv, - bool destroy_expr) +static void __nft_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - struct nft_ctx ctx = { - .net = read_pnet(&set->net), - .family = set->table->family, - }; nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) - nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem_priv); } + +/* Drop references and destroy. Called from gc and dynset. */ +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) +{ + struct nft_ctx ctx = { + .net = read_pnet(&set->net), + .family = set->table->family, + }; + + __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); +} EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Drop references and destroy. Called from abort path. */ +static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + /* skip update request, see nft_trans_elems_new_abort() */ + if (!te->elems[i].priv) + continue; + + __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); + } +} + /* Destroy element. References have been already dropped in the preparation * path via nft_setelem_data_deactivate(). */ @@ -6443,6 +6833,15 @@ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, kfree(elem_priv); } +static void nft_trans_elems_destroy(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) + nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); +} + int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]) { @@ -6454,7 +6853,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, if (!expr) goto err_expr; - err = nft_expr_clone(expr, set->exprs[i]); + err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); if (err < 0) { kfree(expr); goto err_expr; @@ -6493,7 +6892,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx, for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - err = nft_expr_clone(expr, expr_array[i]); + err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); if (err < 0) goto err_elem_expr_setup; @@ -6549,7 +6948,7 @@ static int nft_setelem_catchall_insert(const struct net *net, } } - catchall = kmalloc(sizeof(*catchall), GFP_KERNEL); + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); if (!catchall) return -ENOMEM; @@ -6593,12 +6992,44 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } } +static void nft_trans_elem_update(const struct nft_set *set, + const struct nft_trans_one_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_elem_update *update = elem->update; + + if (update->flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); + + if (update->flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); +} + +static void nft_trans_elems_add(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + struct nft_trans_one_elem *elem = &te->elems[i]; + + if (elem->update) + nft_trans_elem_update(te->set, elem); + else + nft_setelem_activate(ctx->net, te->set, elem->priv); + + nf_tables_setelem_notify(ctx, te->set, elem->priv, + NFT_MSG_NEWSETELEM); + kfree(elem->update); + } +} + static int nft_setelem_catchall_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) @@ -6681,6 +7112,26 @@ static void nft_setelem_remove(const struct net *net, set->ops->remove(net, set, elem_priv); } +static void nft_trans_elems_remove(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + WARN_ON_ONCE(te->elems[i].update); + + nf_tables_setelem_notify(ctx, te->set, + te->elems[i].priv, + te->nft_trans.msg_type); + + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } + } +} + static bool nft_setelem_valid_key_end(const struct nft_set *set, struct nlattr **nla, u32 flags) { @@ -6700,6 +7151,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set, return true; } +static u32 nft_set_maxsize(const struct nft_set *set) +{ + u32 maxsize, delta; + + if (!set->size) + return UINT_MAX; + + if (set->ops->adjust_maxsize) + delta = set->ops->adjust_maxsize(set); + else + delta = 0; + + if (check_add_overflow(set->size, set->ndeact, &maxsize)) + return UINT_MAX; + + if (check_add_overflow(maxsize, delta, &maxsize)) + return UINT_MAX; + + return maxsize; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { @@ -6785,17 +7257,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { - timeout = READ_ONCE(set->timeout); + timeout = set->timeout; } expiration = 0; if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (timeout == 0) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], &expiration); if (err) return err; + + if (expiration > timeout) + return -ERANGE; } if (nla[NFTA_SET_ELEM_EXPR]) { @@ -6881,16 +7359,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_parse_key_end; } - if (timeout > 0) { - err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); + if (set->flags & NFT_SET_TIMEOUT) { + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; - - if (timeout != READ_ONCE(set->timeout)) { - err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); - if (err < 0) - goto err_parse_key_end; - } } if (num_exprs) { @@ -7028,8 +7500,40 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) goto err_element_clash; - else if (!(nlmsg_flags & NLM_F_EXCL)) + else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; + if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { + struct nft_elem_update update = { }; + + if (timeout != nft_set_ext_timeout(ext2)->timeout) { + update.timeout = timeout; + if (expiration == 0) + expiration = timeout; + + update.flags |= NFT_TRANS_UPD_TIMEOUT; + } + if (expiration) { + update.expiration = expiration; + update.flags |= NFT_TRANS_UPD_EXPIRATION; + } + + if (update.flags) { + struct nft_trans_one_elem *ue; + + ue = &nft_trans_container_elem(trans)->elems[0]; + + ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); + if (!ue->update) { + err = -ENOMEM; + goto err_element_clash; + } + + ue->priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + goto err_elem_free; + } + } + } } else if (err == -ENOTEMPTY) { /* ENOTEMPTY reports overlapping between this element * and an existing one. @@ -7040,7 +7544,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned int max = set->size ? set->size + set->ndeact : UINT_MAX; + unsigned int max = nft_set_maxsize(set); if (!atomic_add_unless(&set->nelems, 1, max)) { err = -ENFILE; @@ -7048,8 +7552,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - nft_trans_elem_priv(trans) = elem.priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; err_set_full: @@ -7152,6 +7656,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -7176,6 +7690,55 @@ void nft_setelem_data_deactivate(const struct net *net, nft_use_dec(&(*nft_set_ext_obj(ext))->use); } +/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. + * No notifications and no ndeact changes. + * + * Returns true if set had been added to (i.e., elements need to be removed again). + */ +static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + bool removed = false; + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update) { + kfree(te->elems[i].update); + te->elems[i].update = NULL; + /* Update request, so do not release this element */ + te->elems[i].priv = NULL; + continue; + } + + if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + atomic_dec(&te->set->nelems); + + removed = true; + } + + return removed; +} + +/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */ +static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { + nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); + nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); + } + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + te->set->ndeact--; + } +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -7255,8 +7818,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); - nft_trans_elem_priv(trans) = elem.priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; fail_ops: @@ -7275,10 +7838,15 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_ATOMIC); + struct_size_t(struct nft_trans_elem, elems, 1), + GFP_ATOMIC); if (!trans) return -ENOMEM; @@ -7287,8 +7855,9 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem_priv(trans) = elem_priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_container_elem(trans)->nelems = 1; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; } @@ -7299,15 +7868,13 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, { struct nft_trans *trans; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_KERNEL); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (!trans) return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); - nft_trans_elem_set(trans) = set; - nft_trans_elem_priv(trans) = elem_priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; } @@ -7320,7 +7887,8 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -7338,6 +7906,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -7369,7 +7938,7 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); @@ -7573,7 +8142,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7589,9 +8158,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -7612,9 +8185,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, struct nft_trans *trans; int err = -ENOMEM; - if (!try_module_get(type->owner)) - return -ENOENT; - + /* caller must have obtained type->owner reference. */ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) @@ -7682,12 +8253,16 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - type = __nft_obj_type_get(objtype, family); - if (WARN_ON_ONCE(!type)) - return -ENOENT; + if (!obj->ops->update) + return 0; + + type = nft_obj_type_get(net, objtype, family); + if (WARN_ON_ONCE(IS_ERR(type))) + return PTR_ERR(type); nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + /* type->owner reference is put when transaction object is released. */ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } @@ -7873,6 +8448,19 @@ cont: return skb->len; } +static int nf_tables_dumpreset_obj(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_obj(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -7889,12 +8477,18 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb) if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - ctx->reset = true; - return 0; } +static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) +{ + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_obj_start(cb); +} + static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -7904,9 +8498,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ -static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const nla[]) +/* Caller must hold rcu read lock or transaction mutex */ +static struct sk_buff * +nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); @@ -7915,72 +8510,109 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; - bool reset = false; u32 objtype; int err; - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_obj_start, - .dump = nf_tables_dump_obj, - .done = nf_tables_dump_obj_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE]) - return -EINVAL; + return ERR_PTR(-EINVAL); table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); - return PTR_ERR(obj); + return ERR_CAST(obj); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; + err = nf_tables_fill_obj_info(skb2, net, portid, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + family, table, obj, reset); + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } - if (reset) { - const struct nftables_pernet *nft_net; - char *buf; + return skb2; +} - nft_net = nft_pernet(net); - buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct sk_buff *skb2; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_obj_start, + .dump = nf_tables_dump_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; - audit_log_nfcfg(buf, - family, - 1, - AUDIT_NFT_OP_OBJ_RESET, - GFP_ATOMIC); - kfree(buf); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, - family, table, obj, reset); - if (err < 0) - goto err_fill_obj_info; + skb2 = nf_tables_getobj_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, info->net, portid); +} -err_fill_obj_info: - kfree_skb(skb2); - return err; +static int nf_tables_getobj_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dumpreset_obj_start, + .dump = nf_tables_dumpreset_obj, + .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getobj_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_OBJ_TABLE]), + (char *)nla_data(nla[NFTA_OBJ_TABLE]), + nft_net->base_seq); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); + kfree(buf); + + return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) @@ -8135,12 +8767,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -8199,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; @@ -8253,21 +8888,24 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + } } return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -8279,9 +8917,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -8294,34 +8936,58 @@ nft_flowtable_type_get(struct net *net, u8 family) } /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, - struct nft_flowtable *flowtable, - struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + nf_unregister_net_hook(net, ops); + flowtable->data.type->setup(&flowtable->data, ops->dev, FLOW_BLOCK_UNBIND); } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); +} + +static int nft_register_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) +{ + int err; + + err = flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_BIND); + if (err < 0) + return err; + + err = nf_register_net_hook(net, ops); + if (!err) + return 0; + + flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_UNBIND); + return err; } static int nft_register_flowtable_net_hooks(struct net *net, @@ -8329,8 +8995,9 @@ static int nft_register_flowtable_net_hooks(struct net *net, struct list_head *hook_list, struct nft_flowtable *flowtable) { - struct nft_hook *hook, *hook2, *next; + struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; list_for_each_entry(hook, hook_list, list) { @@ -8338,42 +9005,33 @@ static int nft_register_flowtable_net_hooks(struct net *net, if (!nft_is_active_next(net, ft)) continue; - list_for_each_entry(hook2, &ft->hook_list, list) { - if (hook->ops.dev == hook2->ops.dev && - hook->ops.pf == hook2->ops.pf) { - err = -EEXIST; - goto err_unregister_net_hooks; - } + if (nft_hook_list_find(&ft->hook_list, hook)) { + err = -EEXIST; + goto err_unregister_net_hooks; } } - err = flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_BIND); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -8385,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list) list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } @@ -8396,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; @@ -8409,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } } @@ -8453,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -8491,7 +9153,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -8602,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); - kfree(this); + nft_netdev_hook_free(this); } } @@ -8685,7 +9347,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { @@ -8755,8 +9417,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list) { - if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -8897,7 +9561,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); @@ -8962,10 +9626,8 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); - kfree(hook); + nft_netdev_hook_free_rcu(hook); } kfree(flowtable->name); module_put(flowtable->data.type->owner); @@ -8998,46 +9660,190 @@ nla_put_failure: return -EMSGSIZE; } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, - struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static void +nf_tables_device_notify(const struct nft_table *table, int attr, + const char *name, const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + struct net *net = dev_net(dev); + struct nlmsghdr *nlh; + struct sk_buff *skb; + u16 flags = 0; + + if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + goto err; + + event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) + goto err; + + if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || + nla_put_string(skb, attr, name) || + nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + goto err; + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, + nlmsg_report(nlh), GFP_KERNEL); + return; +err: + if (skb) + kfree_skb(skb); + nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); +} + +void +nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, + chain->name, hook, dev, event); +} + +static void +nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, + ft->name, hook, dev, event); +} + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable, bool changename) { + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - /* flow_offload_netdev_event() cleans up entries for us. */ - nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_ops(dev_net(dev), + flowtable, ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->dev = dev; + if (nft_register_flowtable_ops(dev_net(dev), + flowtable, ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } + return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); + struct nft_flowtable *flowtable; + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + if (nft_flowtable_event(event, dev, + flowtable, changename)) + return 1; + } + } + return 0; } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; - struct nft_table *table; + int ret = NOTIFY_DONE; struct net *net; - if (event != NETDEV_UNREGISTER) - return 0; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); - list_for_each_entry(table, &nft_net->tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_flowtable_event(event, dev, false)) { + ret = NOTIFY_BAD; } +out_unlock: mutex_unlock(&nft_net->commit_mutex); - - return NOTIFY_DONE; + return ret; } static struct notifier_block nf_tables_flowtable_notifier = { @@ -9258,7 +10064,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj, + .call = nf_tables_getobj_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, @@ -9320,51 +10126,53 @@ static int nf_tables_validate(struct net *net) * * We defer the drop policy until the transaction has been finalized. */ -static void nft_chain_commit_drop_policy(struct nft_trans *trans) +static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_policy(trans) != NF_DROP) + if (trans->policy != NF_DROP) return; - if (!nft_is_base_chain(trans->ctx.chain)) + if (!nft_is_base_chain(trans->chain)) return; - basechain = nft_base_chain(trans->ctx.chain); + basechain = nft_base_chain(trans->chain); basechain->policy = NF_DROP; } -static void nft_chain_commit_update(struct nft_trans *trans) +static void nft_chain_commit_update(struct nft_trans_chain *trans) { + struct nft_table *table = trans->nft_trans_binding.nft_trans.table; struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)) { - rhltable_remove(&trans->ctx.table->chains_ht, - &trans->ctx.chain->rhlhead, + if (trans->name) { + rhltable_remove(&table->chains_ht, + &trans->chain->rhlhead, nft_chain_ht_params); - swap(trans->ctx.chain->name, nft_trans_chain_name(trans)); - rhltable_insert_key(&trans->ctx.table->chains_ht, - trans->ctx.chain->name, - &trans->ctx.chain->rhlhead, + swap(trans->chain->name, trans->name); + rhltable_insert_key(&table->chains_ht, + trans->chain->name, + &trans->chain->rhlhead, nft_chain_ht_params); } - if (!nft_is_base_chain(trans->ctx.chain)) + if (!nft_is_base_chain(trans->chain)) return; nft_chain_stats_replace(trans); - basechain = nft_base_chain(trans->ctx.chain); + basechain = nft_base_chain(trans->chain); - switch (nft_trans_chain_policy(trans)) { + switch (trans->policy) { case NF_DROP: case NF_ACCEPT: - basechain->policy = nft_trans_chain_policy(trans); + basechain->policy = trans->policy; break; } } -static void nft_obj_commit_update(struct nft_trans *trans) +static void nft_obj_commit_update(const struct nft_ctx *ctx, + struct nft_trans *trans) { struct nft_object *newobj; struct nft_object *obj; @@ -9372,18 +10180,25 @@ static void nft_obj_commit_update(struct nft_trans *trans) obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); - if (obj->ops->update) - obj->ops->update(obj, newobj); + if (WARN_ON_ONCE(!obj->ops->update)) + return; - nft_obj_destroy(&trans->ctx, newobj); + obj->ops->update(obj, newobj); + nft_obj_destroy(ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) { + struct nft_ctx ctx = { + .net = trans->net, + }; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: - nf_tables_table_destroy(&trans->ctx); + nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: free_percpu(nft_trans_chain_stats(trans)); @@ -9394,25 +10209,23 @@ static void nft_commit_release(struct nft_trans *trans) if (nft_trans_chain_update(trans)) nft_hooks_destroy(&nft_trans_chain_hooks(trans)); else - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: - nft_set_destroy(&trans->ctx, nft_trans_set(trans)); + nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: - nf_tables_set_elem_destroy(&trans->ctx, - nft_trans_elem_set(trans), - nft_trans_elem_priv(trans)); + nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: - nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); + nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: @@ -9424,18 +10237,19 @@ static void nft_commit_release(struct nft_trans *trans) } if (trans->put_net) - put_net(trans->ctx.net); + put_net(trans->net); kfree(trans); } static void nf_tables_trans_destroy_work(struct work_struct *w) { + struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work); struct nft_trans *trans, *next; LIST_HEAD(head); spin_lock(&nf_tables_destroy_list_lock); - list_splice_init(&nf_tables_destroy_list, &head); + list_splice_init(&nft_net->destroy_list, &head); spin_unlock(&nf_tables_destroy_list_lock); if (list_empty(&head)) @@ -9449,9 +10263,11 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) } } -void nf_tables_trans_destroy_flush_work(void) +void nf_tables_trans_destroy_flush_work(struct net *net) { - flush_work(&trans_destroy_work); + struct nftables_pernet *nft_net = nft_pernet(net); + + flush_work(&nft_net->destroy_work); } EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); @@ -9543,10 +10359,10 @@ static void nf_tables_commit_chain_prepare_cancel(struct net *net) struct nft_trans *trans, *next; list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { - struct nft_chain *chain = trans->ctx.chain; - if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { + struct nft_chain *chain = nft_trans_rule_chain(trans); + kvfree(chain->blob_next); chain->blob_next = NULL; } @@ -9904,16 +10720,16 @@ static void nf_tables_commit_release(struct net *net) trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); - get_net(trans->ctx.net); + get_net(trans->net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); - schedule_work(&trans_destroy_work); + schedule_work(&nft_net->destroy_work); mutex_unlock(&nft_net->commit_mutex); } @@ -9981,9 +10797,24 @@ static void nf_tables_commit_audit_free(struct list_head *adl) } } +/* nft audit emits the number of elements that get added/removed/updated, + * so NEW/DELSETELEM needs to increment based on the total elem count. + */ +static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_container_elem(trans)->nelems; + } + + return 1; +} + static void nf_tables_commit_audit_collect(struct list_head *adl, - struct nft_table *table, u32 op) + const struct nft_trans *trans, u32 op) { + const struct nft_table *table = trans->table; struct nft_audit_data *adp; list_for_each_entry(adp, adl, list) { @@ -9993,7 +10824,7 @@ static void nf_tables_commit_audit_collect(struct list_head *adl, WARN_ONCE(1, "table=%s not expected in commit list", table->name); return; found: - adp->entries++; + adp->entries += nf_tables_commit_audit_entrycount(trans); if (!adp->op || adp->op > op) adp->op = op; } @@ -10048,12 +10879,15 @@ static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); + const struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct nft_trans_binding *trans_binding; struct nft_trans *trans, *next; unsigned int base_seq, gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + struct nft_ctx ctx; LIST_HEAD(adl); int err; @@ -10062,7 +10896,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } - list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL); + + list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) { + trans = &trans_binding->nft_trans; switch (trans->msg_type) { case NFT_MSG_NEWSET: if (!nft_trans_set_update(trans) && @@ -10080,6 +10917,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return -EINVAL; } break; + default: + WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type); + break; } } @@ -10095,9 +10935,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) /* 1. Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { + struct nft_table *table = trans->table; int ret; - ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table); + ret = nf_tables_commit_audit_alloc(&adl, table); if (ret) { nf_tables_commit_chain_prepare_cancel(net); nf_tables_commit_audit_free(&adl); @@ -10105,7 +10946,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { - chain = trans->ctx.chain; + chain = nft_trans_rule_chain(trans); ret = nf_tables_commit_chain_prepare(net, chain); if (ret < 0) { @@ -10138,68 +10979,71 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) net->nft.gencursor = nft_gencursor_next(net); list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { - nf_tables_commit_audit_collect(&adl, trans->ctx.table, - trans->msg_type); + struct nft_table *table = trans->table; + + nft_ctx_update(&ctx, trans); + + nf_tables_commit_audit_collect(&adl, trans, trans->msg_type); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) { + if (!(table->flags & __NFT_TABLE_F_UPDATE)) { nft_trans_destroy(trans); break; } - if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT) - nf_tables_table_disable(net, trans->ctx.table); + if (table->flags & NFT_TABLE_F_DORMANT) + nf_tables_table_disable(net, table); - trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; + table->flags &= ~__NFT_TABLE_F_UPDATE; } else { - nft_clear(net, trans->ctx.table); + nft_clear(net, table); } - nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: - list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, trans->msg_type); + list_del_rcu(&table->list); + nf_tables_table_notify(&ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_chain_commit_update(trans); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN, + nft_chain_commit_update(nft_trans_container_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, &nft_trans_chain_hooks(trans)); list_splice(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { - nft_chain_commit_drop_policy(trans); - nft_clear(net, trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN, NULL); + nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); + nft_clear(net, nft_trans_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, &nft_trans_chain_hooks(trans)); - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } } else { - nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, + nft_chain_del(nft_trans_chain(trans)); + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL); - nf_tables_unregister_hook(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain); + nf_tables_unregister_hook(ctx.net, ctx.table, + nft_trans_chain(trans)); } break; case NFT_MSG_NEWRULE: - nft_clear(trans->ctx.net, nft_trans_rule(trans)); - nf_tables_rule_notify(&trans->ctx, - nft_trans_rule(trans), + nft_clear(net, nft_trans_rule(trans)); + nf_tables_rule_notify(&ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); nft_trans_destroy(trans); @@ -10207,17 +11051,16 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); - nf_tables_rule_notify(&trans->ctx, - nft_trans_rule(trans), + nf_tables_rule_notify(&ctx, nft_trans_rule(trans), trans->msg_type); - nft_rule_expr_deactivate(&trans->ctx, - nft_trans_rule(trans), + nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: + list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { struct nft_set *set = nft_trans_set(trans); @@ -10233,9 +11076,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) - nft_use_dec(&trans->ctx.table->use); + nft_use_dec(&table->use); } - nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + nf_tables_set_notify(&ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); break; @@ -10243,16 +11086,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DESTROYSET: nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); - nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + nf_tables_set_notify(&ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: - te = (struct nft_trans_elem *)trans->data; + te = nft_trans_container_elem(trans); + + nft_trans_elems_add(&ctx, te); - nft_setelem_activate(net, te->set, te->elem_priv); - nf_tables_setelem_notify(&trans->ctx, te->set, - te->elem_priv, - NFT_MSG_NEWSETELEM); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10262,16 +11103,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: - te = (struct nft_trans_elem *)trans->data; - - nf_tables_setelem_notify(&trans->ctx, te->set, - te->elem_priv, - trans->msg_type); - nft_setelem_remove(net, te->set, te->elem_priv); - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) { - atomic_dec(&te->set->nelems); - te->set->ndeact--; - } + te = nft_trans_container_elem(trans); + + nft_trans_elems_remove(&ctx, te); + if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10280,13 +11115,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - nft_obj_commit_update(trans); - nf_tables_obj_notify(&trans->ctx, + nft_obj_commit_update(&ctx, trans); + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); } else { nft_clear(net, nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); nft_trans_destroy(trans); @@ -10295,14 +11130,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), + nf_tables_obj_notify(&ctx, nft_trans_obj(trans), trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_trans_flowtable(trans)->data.flags = nft_trans_flowtable_flags(trans); - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), NFT_MSG_NEWFLOWTABLE); @@ -10310,7 +11145,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable(trans)->hook_list); } else { nft_clear(net, nft_trans_flowtable(trans)); - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, NFT_MSG_NEWFLOWTABLE); @@ -10320,19 +11155,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, + nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -10370,28 +11207,31 @@ static void nf_tables_module_autoload(struct net *net) static void nf_tables_abort_release(struct nft_trans *trans) { + struct nft_ctx ctx = { }; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_NEWTABLE: - nf_tables_table_destroy(&trans->ctx); + nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) nft_hooks_destroy(&nft_trans_chain_hooks(trans)); else - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_NEWRULE: - nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(&trans->ctx, nft_trans_set(trans)); + nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem_priv(trans), true); + nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_NEWOBJ: - nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); + nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) @@ -10423,46 +11263,56 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + struct nft_ctx ctx = { + .net = net, + }; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { + struct nft_table *table = trans->table; + + nft_ctx_update(&ctx, trans); + switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) { + if (!(table->flags & __NFT_TABLE_F_UPDATE)) { nft_trans_destroy(trans); break; } - if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) { - nf_tables_table_disable(net, trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; - } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) { - trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT; + if (table->flags & __NFT_TABLE_F_WAS_DORMANT) { + nf_tables_table_disable(net, table); + table->flags |= NFT_TABLE_F_DORMANT; + } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) { + table->flags &= ~NFT_TABLE_F_DORMANT; } - if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_ORPHAN) { - trans->ctx.table->flags &= ~NFT_TABLE_F_OWNER; - trans->ctx.table->nlpid = 0; + if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) { + table->flags &= ~NFT_TABLE_F_OWNER; + table->nlpid = 0; } - trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; + table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { - list_del_rcu(&trans->ctx.table->list); + list_del_rcu(&table->list); } break; case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: - nft_clear(trans->ctx.net, trans->ctx.table); + nft_clear(trans->net, table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); @@ -10471,11 +11321,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - nft_use_dec_restore(&trans->ctx.table->use); - nft_chain_del(trans->ctx.chain); - nf_tables_unregister_hook(trans->ctx.net, - trans->ctx.table, - trans->ctx.chain); + nft_use_dec_restore(&table->use); + nft_chain_del(nft_trans_chain(trans)); + nf_tables_unregister_hook(trans->net, table, + nft_trans_chain(trans)); } break; case NFT_MSG_DELCHAIN: @@ -10484,8 +11333,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_splice(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); } else { - nft_use_inc_restore(&trans->ctx.table->use); - nft_clear(trans->ctx.net, trans->ctx.chain); + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_chain(trans)); } nft_trans_destroy(trans); break; @@ -10494,30 +11343,31 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - nft_use_dec_restore(&trans->ctx.chain->use); + nft_use_dec_restore(&nft_trans_rule_chain(trans)->use); list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_expr_deactivate(&trans->ctx, + nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: - nft_use_inc_restore(&trans->ctx.chain->use); - nft_clear(trans->ctx.net, nft_trans_rule(trans)); - nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); - if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_use_inc_restore(&nft_trans_rule_chain(trans)->use); + nft_clear(trans->net, nft_trans_rule(trans)); + nft_rule_expr_activate(&ctx, nft_trans_rule(trans)); + if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: + list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { nft_trans_destroy(trans); break; } - nft_use_dec_restore(&trans->ctx.table->use); + nft_use_dec_restore(&table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; @@ -10527,10 +11377,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: - nft_use_inc_restore(&trans->ctx.table->use); - nft_clear(trans->ctx.net, nft_trans_set(trans)); + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_set(trans)); if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) - nft_map_activate(&trans->ctx, nft_trans_set(trans)); + nft_map_activate(&ctx, nft_trans_set(trans)); nft_trans_destroy(trans); break; @@ -10539,10 +11389,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - te = (struct nft_trans_elem *)trans->data; - nft_setelem_remove(net, te->set, te->elem_priv); - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - atomic_dec(&te->set->nelems); + te = nft_trans_container_elem(trans); + if (!nft_trans_elems_new_abort(&ctx, te)) { + nft_trans_destroy(trans); + break; + } if (te->set->ops->abort && list_empty(&te->set->pending_update)) { @@ -10552,12 +11403,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: - te = (struct nft_trans_elem *)trans->data; + te = nft_trans_container_elem(trans); - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - te->set->ndeact--; + nft_trans_elems_destroy_abort(&ctx, te); if (te->set->ops->abort && list_empty(&te->set->pending_update)) { @@ -10568,27 +11416,29 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); + nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { - nft_use_dec_restore(&trans->ctx.table->use); + nft_use_dec_restore(&table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: - nft_use_inc_restore(&trans->ctx.table->use); - nft_clear(trans->ctx.net, nft_trans_obj(trans)); + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { - nft_use_dec_restore(&trans->ctx.table->use); + nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -10598,14 +11448,16 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); } else { - nft_use_inc_restore(&trans->ctx.table->use); - nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + nft_use_inc_restore(&table->use); + nft_clear(trans->net, nft_trans_flowtable(trans)); } nft_trans_destroy(trans); break; } } + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + nft_set_abort_update(&set_update_list); synchronize_rcu(); @@ -10616,12 +11468,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - - return 0; + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, @@ -10634,6 +11481,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -10698,146 +11556,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); -/* - * Loop detection - walk through the ruleset beginning at the destination chain - * of a new jump until either the source chain is reached (loop) or all - * reachable chains have been traversed. - * - * The loop check is performed whenever a new jump verdict is added to an - * expression or verdict map or a verdict map is bound to a new chain. - */ - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain); - -static int nft_check_loops(const struct nft_ctx *ctx, - const struct nft_set_ext *ext) -{ - const struct nft_data *data; - int ret; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - ret = nf_tables_check_loops(ctx, data->verdict.chain); - break; - default: - ret = 0; - break; - } - - return ret; -} - -static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_elem_priv *elem_priv) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - return nft_check_loops(ctx, ext); -} - -static int nft_set_catchall_loops(const struct nft_ctx *ctx, - struct nft_set *set) -{ - u8 genmask = nft_genmask_next(ctx->net); - struct nft_set_elem_catchall *catchall; - struct nft_set_ext *ext; - int ret = 0; - - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) - continue; - - ret = nft_check_loops(ctx, ext); - if (ret < 0) - return ret; - } - - return ret; -} - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain) -{ - const struct nft_rule *rule; - const struct nft_expr *expr, *last; - struct nft_set *set; - struct nft_set_binding *binding; - struct nft_set_iter iter; - - if (ctx->chain == chain) - return -ELOOP; - - if (fatal_signal_pending(current)) - return -EINTR; - - list_for_each_entry(rule, &chain->rules, list) { - nft_rule_for_each_expr(expr, last, rule) { - struct nft_immediate_expr *priv; - const struct nft_data *data; - int err; - - if (strcmp(expr->ops->type->name, "immediate")) - continue; - - priv = nft_expr_priv(expr); - if (priv->dreg != NFT_REG_VERDICT) - continue; - - data = &priv->data; - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - err = nf_tables_check_loops(ctx, - data->verdict.chain); - if (err < 0) - return err; - break; - default: - break; - } - } - } - - list_for_each_entry(set, &ctx->table->sets, list) { - if (!nft_is_active_next(ctx->net, set)) - continue; - if (!(set->flags & NFT_SET_MAP) || - set->dtype != NFT_DATA_VERDICT) - continue; - - list_for_each_entry(binding, &set->bindings, list) { - if (!(binding->flags & NFT_SET_MAP) || - binding->chain != chain) - continue; - - iter.genmask = nft_genmask_next(ctx->net); - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_loop_check_setelem; - - set->ops->walk(ctx, set, &iter); - if (!iter.err) - iter.err = nft_set_catchall_loops(ctx, set); - - if (iter.err < 0) - return iter.err; - } - } - - return 0; -} - /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * @@ -10916,10 +11634,11 @@ static int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) +int nft_parse_register_load(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *sreg, u32 len) { - u32 reg; - int err; + int err, invalid_reg; + u32 reg, next_register; err = nft_parse_register(attr, ®); if (err < 0) @@ -10929,11 +11648,36 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) if (err < 0) return err; + next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg; + + /* Can't happen: nft_validate_register_load() should have failed */ + if (WARN_ON_ONCE(next_register > NFT_REG32_NUM)) + return -EINVAL; + + /* find first register that did not see an earlier store. */ + invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg); + + /* invalid register within the range that we're loading from? */ + if (invalid_reg < next_register) + return -ENODATA; + *sreg = reg; return 0; } EXPORT_SYMBOL_GPL(nft_parse_register_load); +static void nft_saw_register_store(const struct nft_ctx *__ctx, + int reg, unsigned int len) +{ + unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE); + struct nft_ctx *ctx = (struct nft_ctx *)__ctx; + + if (WARN_ON_ONCE(len == 0 || reg < 0)) + return; + + bitmap_set(ctx->reg_inited, reg, registers); +} + static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, @@ -10950,13 +11694,16 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { - err = nf_tables_check_loops(ctx, data->verdict.chain); + err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } - return 0; + break; default: + if (type != NFT_DATA_VALUE) + return -EINVAL; + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) @@ -10965,10 +11712,11 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, sizeof_field(struct nft_regs, data)) return -ERANGE; - if (data != NULL && type != NFT_DATA_VALUE) - return -EINVAL; - return 0; + break; } + + nft_saw_register_store(ctx, reg, len); + return 0; } int nft_parse_register_store(const struct nft_ctx *ctx, @@ -11234,27 +11982,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) -{ - struct nft_rule *rule, *nr; - - if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); - nft_use_dec(&ctx->chain->use); - nf_tables_rule_release(ctx, rule); - } - nft_chain_del(ctx->chain); - nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx); - - return 0; -} -EXPORT_SYMBOL_GPL(__nft_release_basechain); - static void __nft_release_hook(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable; @@ -11263,7 +11990,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } @@ -11324,12 +12052,11 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; nft_chain_del(chain); nft_use_dec(&table->use); - nf_tables_chain_destroy(&ctx); + nf_tables_chain_destroy(chain); } - nf_tables_table_destroy(&ctx); + nf_tables_table_destroy(table); } static void __nft_release_tables(struct net *net) @@ -11367,8 +12094,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nf_tables_destroy_list)) - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(net); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && @@ -11410,6 +12136,8 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->destroy_list); + INIT_LIST_HEAD(&nft_net->commit_set_list); INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); @@ -11417,6 +12145,7 @@ static int __net_init nf_tables_init_net(struct net *net) nft_net->base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; + INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); return 0; } @@ -11439,18 +12168,23 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); + cancel_work_sync(&nft_net->destroy_work); __nft_release_tables(net); nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->destroy_list)); } static void nf_tables_exit_batch(struct list_head *net_exit_list) @@ -11471,6 +12205,14 @@ static int __init nf_tables_module_init(void) { int err; + BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0); + BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0); + err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; @@ -11535,7 +12277,6 @@ static void __exit nf_tables_module_exit(void) nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); - cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a48d5f0e2f3e..6557a4018c09 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,25 +21,22 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> -#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86) - +#ifdef CONFIG_MITIGATION_RETPOLINE static struct static_key_false nf_tables_skip_direct_calls; -static bool nf_skip_indirect_calls(void) +static inline bool nf_skip_indirect_calls(void) { return static_branch_likely(&nf_tables_skip_direct_calls); } -static void __init nf_skip_indirect_calls_enable(void) +static inline void __init nf_skip_indirect_calls_enable(void) { if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) static_branch_enable(&nf_tables_skip_direct_calls); } #else -static inline bool nf_skip_indirect_calls(void) { return false; } - static inline void nf_skip_indirect_calls_enable(void) { } -#endif +#endif /* CONFIG_MITIGATION_RETPOLINE */ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, @@ -256,7 +253,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) const struct net *net = nft_net(pkt); const struct nft_expr *expr, *last; const struct nft_rule_dp *rule; - struct nft_regs regs = {}; + struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 12ab78fa5d84..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -513,38 +520,38 @@ static void nft_flow_rule_offload_abort(struct net *net, int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_chain(trans->ctx.chain, NULL, + err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); @@ -564,46 +571,46 @@ int nft_flow_rule_offload_commit(struct net *net) u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) + if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); - err = nft_flow_offload_chain(trans->ctx.chain, &policy, + err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) { + if (trans->flags & NLM_F_REPLACE || + !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans->ctx.chain, + err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; @@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index a83637e3f455..ae3fe87195ab 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } @@ -317,7 +369,7 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, net_get_random_once(&trace_key, sizeof(trace_key)); info->skbid = (u32)siphash_3u32(hash32_ptr(skb), - skb_get_hash(skb), + skb_get_hash_net(nft_net(pkt), skb), skb->skb_iif, &trace_key); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c9fbe0f707b5..ac77fc21632d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) @@ -402,31 +403,36 @@ replay_abort: { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!try_module_get(ss->owner)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); - return kfree_skb(skb); + return consume_skb(skb); } nfnl_unlock(subsys_id); + if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); + } + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -512,7 +518,7 @@ replay_abort: err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, - ss->cb[cb_id].policy, NULL); + ss->cb[cb_id].policy, &extack); if (err < 0) goto ack; @@ -562,7 +568,7 @@ done: if (status & NFNL_BATCH_REPLAY) { ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { @@ -573,6 +579,9 @@ done: } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } else if (nlh->nlmsg_flags & NLM_F_ACK) { + memset(&extack, 0, sizeof(extack)); + nfnl_err_add(&err_list, nlh, 0, &extack); } } else { enum nfnl_abort_action abort_action; @@ -585,7 +594,7 @@ done: err = ss->abort(net, oskb, abort_action); if (err == -EAGAIN) { nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); status |= NFNL_BATCH_FAILURE; goto replay_abort; @@ -593,7 +602,7 @@ done: } nfnl_err_deliver(&err_list, oskb); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index f466af4f8531..eab4f476b47f 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -366,8 +366,7 @@ static int cttimeout_default_set(struct sk_buff *skb, __u8 l4num; int ret; - if (!cda[CTA_TIMEOUT_L3PROTO] || - !cda[CTA_TIMEOUT_L4PROTO] || + if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 134e05d31061..bfcb9cd335bf 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -381,7 +381,7 @@ static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ - if (del_timer(&inst->timer)) + if (timer_delete(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst); @@ -390,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst) static void nfulnl_timer(struct timer_list *t) { - struct nfulnl_instance *inst = from_timer(inst, t, timer); + struct nfulnl_instance *inst = timer_container_of(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 00f4bd21c59b..8b7b39d8a109 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -169,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } @@ -323,7 +325,7 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { + if (!hooks || i >= hooks->num_hook_entries) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; @@ -468,18 +470,18 @@ static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) return 0; } -static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { - u32 seclen = 0; + int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) - security_secid_to_secctx(skb->secmark, secdata, &seclen); - + seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; @@ -538,6 +540,14 @@ nla_put_failure: return -1; } +static int nf_queue_checksum_help(struct sk_buff *entskb) +{ + if (skb_csum_is_sctp(entskb)) + return skb_crc32c_csum_help(entskb); + + return skb_checksum_help(entskb); +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -557,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; - char *secdata = NULL; - u32 seclen = 0; + struct lsm_context ctx = { NULL, 0, 0 }; + int seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) @@ -600,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) + nf_queue_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); @@ -632,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { - seclen = nfqnl_get_sk_secctx(entskb, &secdata); + seclen = nfqnl_get_sk_secctx(entskb, &ctx); + if (seclen < 0) + return NULL; if (seclen) size += nla_total_size(seclen); } @@ -772,7 +784,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; - if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) @@ -800,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return skb; nla_put_failure: @@ -809,8 +821,8 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return NULL; } @@ -818,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - const struct nf_conn *ct = (void *)skb_nfct(entry->skb); + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + unsigned long status; + unsigned int use; - if (ct && ((ct->status & flags) == IPS_DYING)) + if (!ct) + return false; + + status = READ_ONCE(ct->status); + if ((status & flags) == IPS_DYING) return true; + + if (status & IPS_CONFIRMED) + return false; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * We can't be solved with serialization here because one clone could + * have been queued for local delivery. + */ + use = refcount_read(&ct->ct_general.use); + if (likely(use == 1)) + return false; + + /* Can't decrement further? Exclusive ownership. */ + if (!refcount_dec_not_one(&ct->ct_general.use)) + return false; + + skb_set_nfct(entry->skb, 0); + /* No nf_ct_put(): we already decremented .use and it cannot + * drop down to 0. + */ + return true; #endif return false; } @@ -981,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index ca857afbf061..d550910aabec 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -17,6 +17,7 @@ struct nft_bitwise { u8 sreg; + u8 sreg2; u8 dreg; enum nft_bitwise_ops op:8; u8 len; @@ -25,8 +26,8 @@ struct nft_bitwise { struct nft_data data; }; -static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, - const struct nft_bitwise *priv) +static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) { unsigned int i; @@ -60,28 +61,72 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, } } +static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] & src2[i]; +} + +static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] | src2[i]; +} + +static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] ^ src2[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg]; + const u32 *src = ®s->data[priv->sreg], *src2; u32 *dst = ®s->data[priv->dreg]; - switch (priv->op) { - case NFT_BITWISE_BOOL: - nft_bitwise_eval_bool(dst, src, priv); - break; - case NFT_BITWISE_LSHIFT: + if (priv->op == NFT_BITWISE_MASK_XOR) { + nft_bitwise_eval_mask_xor(dst, src, priv); + return; + } + if (priv->op == NFT_BITWISE_LSHIFT) { nft_bitwise_eval_lshift(dst, src, priv); - break; - case NFT_BITWISE_RSHIFT: + return; + } + if (priv->op == NFT_BITWISE_RSHIFT) { nft_bitwise_eval_rshift(dst, src, priv); - break; + return; + } + + src2 = priv->sreg2 ? ®s->data[priv->sreg2] : priv->data.data; + + if (priv->op == NFT_BITWISE_AND) { + nft_bitwise_eval_and(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_OR) { + nft_bitwise_eval_or(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_XOR) { + nft_bitwise_eval_xor(dst, src, src2, priv); + return; } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_SREG2] = { .type = NLA_U32 }, [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, @@ -90,8 +135,8 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init_bool(struct nft_bitwise *priv, - const struct nlattr *const tb[]) +static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { struct nft_data_desc mask = { .type = NFT_DATA_VALUE, @@ -105,7 +150,8 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, }; int err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, int err; if (tb[NFTA_BITWISE_MASK] || - tb[NFTA_BITWISE_XOR]) + tb[NFTA_BITWISE_XOR] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_DATA]) @@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, return 0; } +static int nft_bitwise_init_bool(const struct nft_ctx *ctx, + struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) || + (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2])) + return -EINVAL; + + if (tb[NFTA_BITWISE_DATA]) { + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = priv->len, + }; + + err = nft_data_init(NULL, &priv->data, &desc, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + } else { + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2], + &priv->sreg2, priv->len); + if (err < 0) + return err; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -171,7 +253,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, priv->len); if (err < 0) return err; @@ -185,32 +267,40 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { - case NFT_BITWISE_BOOL: + case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: break; default: return -EOPNOTSUPP; } } else { - priv->op = NFT_BITWISE_BOOL; + priv->op = NFT_BITWISE_MASK_XOR; } switch(priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_init_bool(priv, tb); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_init_mask_xor(priv, tb); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_init_shift(priv, tb); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_init_bool(ctx, priv, tb); + break; } return err; } -static int nft_bitwise_dump_bool(struct sk_buff *skb, - const struct nft_bitwise *priv) +static int nft_bitwise_dump_mask_xor(struct sk_buff *skb, + const struct nft_bitwise *priv) { if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) @@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (priv->sreg2) { + if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2)) + return -1; + } else { + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + } + + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -248,13 +353,18 @@ static int nft_bitwise_dump(struct sk_buff *skb, return -1; switch (priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_dump_bool(skb, priv); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_dump_mask_xor(skb, priv); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: err = nft_bitwise_dump_shift(skb, priv); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_dump_bool(skb, priv); + break; } return err; @@ -269,7 +379,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; - if (priv->op != NFT_BITWISE_BOOL) + if (priv->op != NFT_BITWISE_MASK_XOR) return -EOPNOTSUPP; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || @@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, track->regs[priv->dreg].bitwise && track->regs[priv->dreg].bitwise->ops == expr->ops && priv->sreg == bitwise->sreg && + priv->sreg2 == bitwise->sreg2 && priv->dreg == bitwise->dreg && priv->op == bitwise->op && priv->len == bitwise->len && @@ -365,7 +476,7 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); int err; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, sizeof(u32)); if (err < 0) return err; @@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -406,7 +518,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb, return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32)))) return -1; - if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL))) + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR))) return -1; data.data[0] = priv->mask; @@ -501,7 +613,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx, return &nft_bitwise_ops; if (tb[NFTA_BITWISE_OP] && - ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL) + ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR) return &nft_bitwise_ops; return &nft_bitwise_fast_ops; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index f6e791a68101..af9206a3afd1 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -5,7 +5,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -139,7 +139,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 274b6f7e6bb5..846d48ba8965 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,67 +318,74 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - struct nft_hook *hook, *found = NULL; - int n = 0; - - if (event != NETDEV_UNREGISTER) - return; + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; + struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev == dev) - found = hook; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - n++; - } - if (!found) - return; - - if (n > 1) { - nf_unregister_net_hook(ctx->net, &found->ops); - list_del_rcu(&found->list); - kfree_rcu(found, rcu); - return; - } + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), ops); - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ - __nft_release_basechain(ctx); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_chain_device_notify(&basechain->chain, + hook, dev, event); + break; + } + return 0; } -static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; - struct nft_chain *chain, *nr; + struct nft_chain *chain; struct nft_table *table; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; - if (event != NETDEV_UNREGISTER && - event != NETDEV_CHANGENAME) - return NOTIFY_DONE; - - nft_net = nft_pernet(ctx.net); - mutex_lock(&nft_net->commit_mutex); + nft_net = nft_pernet(dev_net(dev)); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) continue; - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { + list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; @@ -387,13 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this, basechain->ops.hooknum != NF_INET_INGRESS) continue; - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; } } - mutex_unlock(&nft_net->commit_mutex); + return 0; +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; - return NOTIFY_DONE; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nft_net = nft_pernet(dev_net(dev)); + mutex_lock(&nft_net->commit_mutex); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; + } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; + } +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index cd4652259095..2605f43737bc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -83,7 +83,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -222,7 +222,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -323,7 +323,7 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index d3d11dede545..72711d62fddf 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -228,7 +228,7 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return 0; } -static void nft_compat_wait_for_destructors(void) +static void nft_compat_wait_for_destructors(struct net *net) { /* xtables matches or targets can have side effects, e.g. * creation/destruction of /proc files. @@ -236,7 +236,7 @@ static void nft_compat_wait_for_destructors(void) * work queue. If we have pending invocations we thus * need to wait for those to finish. */ - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(net); } static int @@ -262,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { @@ -350,8 +350,7 @@ nla_put_failure: } static int nft_target_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; @@ -516,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); return xt_check_match(&par, size, proto, inv); } @@ -536,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *m = expr->ops->data; int ret; - priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; @@ -611,8 +610,7 @@ static int nft_match_large_dump(struct sk_buff *skb, } static int nft_match_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; @@ -810,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -900,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index de9d1980df69..92b984fa8175 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -210,12 +210,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx, nft_connlimit_do_destroy(ctx, priv); } -static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); if (!priv_dst->list) return -ENOMEM; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index dccc68a5135a..cc7325329496 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -17,6 +17,11 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_counter { + u64_stats_t bytes; + u64_stats_t packets; +}; + +struct nft_counter_tot { s64 bytes; s64 packets; }; @@ -25,25 +30,24 @@ struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; -static DEFINE_PER_CPU(seqcount_t, nft_counter_seq); +static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); - - write_seqcount_begin(myseq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - this_cpu->bytes += pkt->skb->len; - this_cpu->packets++; + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->bytes, pkt->skb->len); + u64_stats_inc(&this_cpu->packets); + u64_stats_update_end(nft_sync); - write_seqcount_end(myseq); local_bh_enable(); } @@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[], if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); + this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + u64_stats_set(&this_cpu->packets, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + u64_stats_set(&this_cpu->bytes, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } - preempt_enable(); + priv->counter = cpu_stats; return 0; } @@ -104,35 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx, } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - this_cpu->packets -= total->packets; - this_cpu->bytes -= total->bytes; + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, -total->packets); + u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { struct nft_counter *this_cpu; - const seqcount_t *myseq; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - myseq = per_cpu_ptr(&nft_counter_seq, cpu); + struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); + this_cpu = per_cpu_ptr(priv->counter, cpu); do { - seq = read_seqcount_begin(myseq); - bytes = this_cpu->bytes; - packets = this_cpu->packets; - } while (read_seqcount_retry(myseq, seq)); + seq = u64_stats_fetch_begin(nft_sync); + bytes = u64_stats_read(&this_cpu->bytes); + packets = u64_stats_read(&this_cpu->packets); + } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; @@ -143,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -226,25 +235,23 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC); + cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - this_cpu->packets = total.packets; - this_cpu->bytes = total.bytes; - preempt_enable(); + this_cpu = raw_cpu_ptr(cpu_stats); + u64_stats_set(&this_cpu->packets, total.packets); + u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; @@ -262,18 +269,18 @@ static void nft_counter_offload_stats(struct nft_expr *expr, const struct flow_stats *stats) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; - preempt_disable(); + local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - write_seqcount_begin(myseq); - this_cpu->packets += stats->pkts; - this_cpu->bytes += stats->bytes; - write_seqcount_end(myseq); - preempt_enable(); + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, stats->pkts); + u64_stats_add(&this_cpu->bytes, stats->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } void nft_counter_init_seqcount(void) @@ -281,7 +288,7 @@ void nft_counter_init_seqcount(void) int cpu; for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); + u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); } struct nft_expr_type nft_counter_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 452ed94c3a4d..d526e69a2a2b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -230,6 +230,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; + int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ @@ -250,10 +251,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(refcount_read(&ct->ct_general.use) == 1)) { - refcount_inc(&ct->ct_general.use); + __refcount_inc(&ct->ct_general.use, &oldcnt); + if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { + refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ @@ -606,7 +608,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; @@ -929,7 +931,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, */ values = nf_ct_timeout_data(timeout); if (values) - nf_ct_refresh(ct, pkt->skb, values[0]); + nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index e5739a59ebf1..0573f96ce079 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -40,7 +40,7 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index c09dba57354c..88922e0e8e83 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, for (i = 0; i < priv->num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) return -1; elem_expr->size += priv->expr_array[i]->ops->size; @@ -56,7 +56,7 @@ static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; - timeout = priv->timeout ? : set->timeout; + timeout = priv->timeout ? : READ_ONCE(set->timeout); elem_priv = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], @@ -94,9 +94,10 @@ void nft_dynset_eval(const struct nft_expr *expr, if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { if (priv->op == NFT_DYNSET_OP_UPDATE && - nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - timeout = priv->timeout ? : set->timeout; - *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; + nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { + timeout = priv->timeout ? : READ_ONCE(set->timeout); + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout); } nft_set_elem_update_expr(ext, regs, pkt); @@ -215,7 +216,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return err; } - err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, set->klen); if (err < 0) return err; @@ -226,7 +227,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA], + err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA], &priv->sreg_data, set->dlen); if (err < 0) return err; @@ -312,12 +313,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (priv->num_exprs) nft_dynset_ext_add_expr(priv); - if (set->flags & NFT_SET_TIMEOUT) { - if (timeout || set->timeout) { - nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); - nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); - } - } + if (set->flags & NFT_SET_TIMEOUT && + (timeout || READ_ONCE(set->timeout))) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); priv->timeout = timeout; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 6eb571d0c3fd..c74012c99125 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -5,7 +5,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> @@ -85,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; - unsigned int start; bool found = false; __be32 info; int optlen; @@ -93,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; - start = sizeof(struct iphdr); optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) @@ -103,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, /* Copy the options since __ip_options_compile() modifies * the options. */ - if (skb_copy_bits(skb, start, opt->__data, optlen)) + if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; @@ -118,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, found = target == IPOPT_SSRR ? opt->is_strictroute : !opt->is_strictroute; if (found) - *offset = opt->srr + start; + *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; - *offset = opt->rr + start; + *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; - *offset = opt->router_alert + start; + *offset = opt->router_alert; found = true; break; default: @@ -588,7 +586,7 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->flags = flags; priv->op = op; - return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg, + return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg, priv->len); } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 37cfe6dd712d..96e02a83c045 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -26,8 +26,7 @@ const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { }; EXPORT_SYMBOL(nft_fib_policy); -int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_fib *priv = nft_expr_priv(expr); unsigned int hooks; @@ -35,11 +34,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); - if (priv->flags & NFTA_FIB_F_IIF) { - hooks |= (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD); - } + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index ab9576098701..225ff293cd50 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -8,7 +8,8 @@ #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> -#include <net/ip.h> /* for ipv4 options. */ +#include <net/ip.h> +#include <net/inet_dscp.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> @@ -174,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; @@ -235,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos); + fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; @@ -288,6 +289,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family) return false; } +static void flow_offload_ct_tcp(struct nf_conn *ct) +{ + /* conntrack will not see all packets, disable tcp window validation. */ + spin_lock_bh(&ct->lock); + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + spin_unlock_bh(&ct->lock); +} + static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -355,11 +365,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto err_flow_alloc; flow_offload_route_init(flow, &route); - - if (tcph) { - ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - } + if (tcph) + flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); @@ -380,8 +387,7 @@ out: } static int nft_flow_offload_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); @@ -409,8 +415,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 358e742afad7..152a9fb4d23a 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -52,7 +52,7 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); } @@ -178,12 +178,12 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); if (err < 0) return err; - return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, addr_len); } @@ -204,8 +204,7 @@ nla_put_failure: } static int nft_fwd_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) | (1 << NF_NETDEV_EGRESS)); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 92d47e469204..5d034bbb6913 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; u32 h; - h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb), + priv->modulus); regs->data[priv->dreg] = h + priv->offset; } @@ -91,7 +92,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len); if (err < 0) return err; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 6475c7abc1fe..02ee5fb69871 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -221,7 +221,7 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } - nf_tables_chain_destroy(&chain_ctx); + nf_tables_chain_destroy(chain); break; default: break; @@ -244,8 +244,7 @@ nla_put_failure: } static int nft_immediate_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 928312d01eb1..c4569d4b9228 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@ #include <linux/ip.h> #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { @@ -210,35 +217,71 @@ static int nft_inner_parse(const struct nft_inner *priv, struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { - struct nft_inner_tun_ctx ctx = {}; u32 off = pkt->inneroff; if (priv->flags & NFT_INNER_HDRSIZE && - nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) return -1; if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { - if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) return -1; } else if (priv->flags & NFT_INNER_TH) { - ctx.inner_thoff = off; - ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + tun_ctx->inner_thoff = off; + tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; } - *tun_ctx = ctx; tun_ctx->type = priv->type; + tun_ctx->cookie = (unsigned long)pkt->skb; pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { + local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + return false; + } + *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); + + return true; +} + +static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); + if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) + *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); +} + static bool nft_inner_parse_needed(const struct nft_inner *priv, const struct nft_pktinfo *pkt, - const struct nft_inner_tun_ctx *tun_ctx) + struct nft_inner_tun_ctx *tun_ctx) { if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) return true; + if (!nft_inner_restore_tun_ctx(pkt, tun_ctx)) + return true; + if (priv->type != tun_ctx->type) return true; @@ -248,27 +291,29 @@ static bool nft_inner_parse_needed(const struct nft_inner *priv, static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; - if (nft_inner_parse_needed(priv, pkt, tun_ctx) && - nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0) + if (nft_inner_parse_needed(priv, pkt, &tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: - nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; case NFT_INNER_EXPR_META: - nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; default: WARN_ON_ONCE(1); goto err; } + nft_inner_save_tun_ctx(pkt, &tun_ctx); + return; err: regs->verdict.code = NFT_BREAK; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 8e6d7eaf9dc8..de1b6066bfa8 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -102,12 +102,12 @@ static void nft_last_destroy(const struct nft_ctx *ctx, kfree(priv->last); } -static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); - priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); + priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index cefa25e0dbb0..21d26b79b460 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -150,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx, } static int nft_limit_clone(struct nft_limit_priv *priv_dst, - const struct nft_limit_priv *priv_src) + const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; @@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; @@ -223,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, &priv->limit); } -static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; - return nft_limit_clone(&priv_dst->limit, &priv_src->limit); + return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; @@ -281,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, priv); } -static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); - return nft_limit_clone(priv_dst, priv_src); + return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5defe6e4fd98..e35588137995 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -163,7 +163,7 @@ static int nft_log_init(const struct nft_ctx *ctx, nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { - priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); if (priv->prefix == NULL) return -ENOMEM; nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a0055f510e31..63ef832b8aa7 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -113,7 +113,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, set->klen); if (err < 0) return err; @@ -132,7 +132,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], - &priv->dreg, NULL, set->dtype, + &priv->dreg, NULL, + nft_set_datatype(set), set->dlen); if (err < 0) return err; @@ -205,8 +206,7 @@ nla_put_failure: } static int nft_lookup_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **d) + const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); struct nft_set_iter iter; @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 8a14aaca93bb..868bd4d73555 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -27,8 +27,7 @@ static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { }; static int nft_masq_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -52,13 +51,13 @@ static int nft_masq_init(const struct nft_ctx *ctx, priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ba0d3683a45d..05cd1e6e6a2f 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -581,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) } static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -600,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, } int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -657,7 +655,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; @@ -839,6 +837,9 @@ static int nft_meta_inner_init(const struct nft_ctx *ctx, struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; + if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) + return -EINVAL; + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: @@ -951,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_SECMARK_CTX] == NULL) return -EINVAL; - priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); if (!priv->ctx) return -ENOMEM; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 808f5802c270..6e21f72c5b57 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -137,8 +137,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { }; static int nft_nat_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; @@ -214,13 +213,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) @@ -234,13 +233,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 7d29db7c2ac0..bd058babfc82 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL); + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); if (!priv->counter) return -ENOMEM; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 509011b1ef59..09da7a3f9f96 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -143,7 +143,7 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg, set->klen); if (err < 0) return err; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 7fec57ff736f..1c0b493ef0a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -108,8 +108,7 @@ nla_put_failure: } static int nft_osf_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { unsigned int hooks; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 0a689c8e0295..7dfc5343dae4 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -45,36 +45,27 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; @@ -154,12 +145,12 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } -static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) { - unsigned int len = priv->offset + priv->len; + unsigned int boundary = offset + len; /* data past ether src/dst requested, copy needed */ - if (len > offsetof(struct ethhdr, h_proto)) + if (boundary > offsetof(struct ethhdr, h_proto)) return true; return false; @@ -183,7 +174,7 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; if (skb_vlan_tag_present(skb) && - nft_payload_need_vlan_copy(priv)) { + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -659,6 +650,10 @@ static int nft_payload_inner_init(const struct nft_ctx *ctx, struct nft_payload *priv = nft_expr_priv(expr); u32 base; + if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || + !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) + return -EINVAL; + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_TUN_HEADER: @@ -810,21 +805,79 @@ struct nft_payload_set { u8 csum_flags; }; +/* This is not struct vlan_hdr. */ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = ®s->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); @@ -851,6 +904,9 @@ static void nft_payload_set_eval(const struct nft_expr *expr, ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER && priv->base != NFT_PAYLOAD_INNER_HEADER) || skb->ip_summed != CHECKSUM_PARTIAL)) { + if (offset + priv->len > skb->len) + goto err; + fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); @@ -928,7 +984,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, } priv->csum_type = csum_type; - return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, + return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg, priv->len); } diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index b2b8127c8d43..344fe311878f 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -69,8 +69,7 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr, } static int nft_queue_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -136,7 +135,7 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], + err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM], &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 3ba12a7471b0..df0798da2329 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -19,10 +19,16 @@ struct nft_quota { }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); @@ -233,7 +239,7 @@ static void nft_quota_destroy(const struct nft_ctx *ctx, return nft_quota_do_destroy(ctx, priv); } -static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_quota *priv_dst = nft_expr_priv(dst); struct nft_quota *priv_src = nft_expr_priv(src); @@ -241,7 +247,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) priv_dst->quota = priv_src->quota; priv_dst->flags = priv_src->flags; - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); if (!priv_dst->consumed) return -ENOMEM; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 51ae64cd268f..ea382f7bbd78 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -83,7 +83,7 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr goto err2; } - err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg, desc_from.len); if (err < 0) goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a58bd8d291ff..95eedad85c83 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -27,8 +27,7 @@ static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { }; static int nft_redir_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { int err; @@ -51,13 +50,13 @@ static int nft_redir_init(const struct nft_ctx *ctx, plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX], + err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index ed2e668474d6..196a92c7ea09 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -24,8 +24,7 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 973fa31a9dd6..49020e67304a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -61,8 +61,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, } static int nft_reject_inet_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c index 7865cd8b11bb..2558ce1505d9 100644 --- a/net/netfilter/nft_reject_netdev.c +++ b/net/netfilter/nft_reject_netdev.c @@ -145,8 +145,7 @@ out: } static int nft_reject_netdev_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); } diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 24d977138572..dc50b9a5bd68 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; - memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; @@ -160,8 +160,7 @@ nla_put_failure: return -1; } -static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a16835d..12390d2e994f 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -88,13 +88,15 @@ bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, } static struct nft_bitmap_elem * -nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, +nft_bitmap_elem_find(const struct net *net, + const struct nft_set *set, struct nft_bitmap_elem *this, u8 genmask) { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(net)->commit_mutex)) { if (memcmp(nft_set_ext_key(&be->ext), nft_set_ext_key(&this->ext), set->klen) || !nft_set_elem_active(&be->ext, genmask)) @@ -132,7 +134,7 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); u32 idx, off; - be = nft_bitmap_elem_find(set, new, genmask); + be = nft_bitmap_elem_find(net, set, new, genmask); if (be) { *elem_priv = &be->priv; return -EEXIST; @@ -172,7 +174,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -201,7 +203,7 @@ nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, nft_bitmap_location(set, elem->key.val.data, &idx, &off); - be = nft_bitmap_elem_find(set, this, genmask); + be = nft_bitmap_elem_find(net, set, this, genmask); if (!be) return NULL; @@ -222,8 +224,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b34236..abb0c8ec6371 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -24,11 +24,13 @@ struct nft_rhash { struct rhashtable ht; struct delayed_work gc_work; + u32 wq_gc_seq; }; struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -199,7 +201,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +288,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -309,7 +309,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, nft_setelem_expr_foreach(expr, elem_expr, size) { if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) return true; } @@ -340,6 +341,10 @@ static void nft_rhash_gc(struct work_struct *work) if (!gc) goto done; + /* Elements never collected use a zero gc worker sequence number. */ + if (unlikely(++priv->wq_gc_seq == 0)) + priv->wq_gc_seq++; + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -357,6 +362,14 @@ static void nft_rhash_gc(struct work_struct *work) goto try_later; } + /* rhashtable walk is unstable, already seen in this gc run? + * Then, skip this element. In case of (unlikely) sequence + * wraparound and stale element wq_gc_seq, next gc run will + * just find this expired element. + */ + if (he->wq_gc_seq == priv->wq_gc_seq) + continue; + if (nft_set_elem_is_dead(&he->ext)) goto dead_elem; @@ -373,6 +386,8 @@ dead_elem: if (!gc) goto try_later; + /* annotate gc sequence for this attempt. */ + he->wq_gc_seq = priv->wq_gc_seq; nft_trans_gc_elem_add(gc, he); } @@ -599,7 +614,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -649,11 +664,10 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, int i; for (i = 0; i < priv->buckets; i++) { - hlist_for_each_entry_rcu(he, &priv->table[i], node) { + hlist_for_each_entry_rcu(he, &priv->table[i], node, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de5090246..c5855069bdab 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -434,7 +434,7 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, res_map = scratch->map + (map_index ? m->bsize_max : 0); fill_map = scratch->map + (map_index ? 0 : m->bsize_max); - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -504,6 +504,7 @@ out: * pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation + * @m: storage containing active/existing elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: timestamp to check for expired elements @@ -517,17 +518,15 @@ out: */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp, gfp_t gfp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - struct nft_pipapo *priv = nft_set_priv(set); unsigned long *res_map, *fill_map = NULL; - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; int i; - m = priv->clone; if (m->bsize_max == 0) return ret; @@ -543,7 +542,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; } - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -612,9 +611,11 @@ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = rcu_dereference(priv->match); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, nft_genmask_cur(net), get_jiffies_64(), GFP_ATOMIC); if (IS_ERR(e)) @@ -662,7 +663,10 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f, check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; - new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL); + if (rules_alloc > (INT_MAX / sizeof(*new_mt))) + return -ENOMEM; + + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; @@ -682,6 +686,30 @@ out_free: return 0; } + +/** + * lt_calculate_size() - Get storage size for lookup table with overflow check + * @groups: Amount of bit groups + * @bb: Number of bits grouped together in lookup table buckets + * @bsize: Size of each bucket in lookup table, in longs + * + * Return: allocation size including alignment overhead, negative on overflow + */ +static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, + unsigned int bsize) +{ + ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); + + if (check_mul_overflow(ret, bsize, &ret)) + return -1; + if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) + return -1; + if (ret > INT_MAX) + return -1; + + return ret; +} + /** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables @@ -700,6 +728,7 @@ static int pipapo_resize(struct nft_pipapo_field *f, long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; unsigned int new_bucket_size, copy; int group, bucket, err; + ssize_t lt_size; if (rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; @@ -718,10 +747,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * - new_bucket_size * sizeof(*new_lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); + if (lt_size < 0) + return -ENOMEM; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; @@ -906,7 +936,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { unsigned int groups, bb; unsigned long *new_lt; - size_t lt_size; + ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); @@ -916,15 +946,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. @@ -935,7 +967,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -1211,7 +1243,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, scratch = kzalloc_node(struct_size(scratch, map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL, cpu_to_node(i)); + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have @@ -1247,6 +1279,40 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return 0; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); + +/** + * pipapo_maybe_clone() - Build clone for pending data changes, if not existing + * @set: nftables API set representation + * + * Return: newly created or existing clone, if any. NULL on allocation failure + */ +static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + + if (priv->clone) + return priv->clone; + + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = pipapo_clone(m); + + return priv->clone; +} + /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace @@ -1263,8 +1329,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; u64 tstamp = nft_net_tstamp(net); @@ -1272,12 +1337,15 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const u8 *start_p, *end_p; int i, bsize_max, err = 0; + if (!m) + return -ENOMEM; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; - dup = pipapo_get(net, set, start, genmask, tstamp, GFP_KERNEL); + dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1299,7 +1367,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp, + dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, GFP_KERNEL); } @@ -1332,8 +1400,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Insert */ - priv->dirty = true; - bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { @@ -1384,7 +1450,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * - * Return: copy of matching data passed as 'old', error pointer on failure + * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { @@ -1392,9 +1458,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) - return ERR_PTR(-ENOMEM); + return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; @@ -1416,13 +1482,15 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; + ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * - src->bsize * sizeof(*dst->lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); + if (lt_size < 0) + goto out_lt; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; @@ -1434,8 +1502,12 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) src->groups * NFT_PIPAPO_BUCKETS(src->bb)); if (src->rules > 0) { + if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) + goto out_mt; + dst->mt = kvmalloc_array(src->rules_alloc, - sizeof(*src->mt), GFP_KERNEL); + sizeof(*src->mt), + GFP_KERNEL_ACCOUNT); if (!dst->mt) goto out_mt; @@ -1466,7 +1538,7 @@ out_scratch: free_percpu(new->scratch); kfree(new); - return ERR_PTR(-ENOMEM); + return NULL; } /** @@ -1698,8 +1770,6 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { - priv->dirty = true; - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; @@ -1777,57 +1847,30 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *old; - - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + struct nft_pipapo_match *old; - if (!priv->dirty) + if (!priv->clone) return; - new_clone = pipapo_clone(priv->clone); - if (IS_ERR(new_clone)) - return; + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); - priv->dirty = false; + old = rcu_replace_pointer(priv->match, priv->clone, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = NULL; - old = rcu_access_pointer(priv->match); - rcu_assign_pointer(priv->match, priv->clone); if (old) call_rcu(&old->rcu, pipapo_reclaim_match); - - priv->clone = new_clone; -} - -static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) -{ -#ifdef CONFIG_PROVE_LOCKING - const struct net *net = read_pnet(&set->net); - - return lockdep_is_held(&nft_pernet(net)->commit_mutex); -#else - return true; -#endif } static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *m; - - if (!priv->dirty) - return; - - m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); - new_clone = pipapo_clone(m); - if (IS_ERR(new_clone)) + if (!priv->clone) return; - - priv->dirty = false; - pipapo_free_match(priv->clone); - priv->clone = new_clone; + priv->clone = NULL; } /** @@ -1847,56 +1890,42 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** - * pipapo_deactivate() - Check that element is in set, mark as inactive + * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation - * @data: Input key data - * @ext: nftables API extension pointer, used to check for end element - * - * This is a convenience function that can be called from both - * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same - * operation. + * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ -static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, - const u8 *data, const struct nft_set_ext *ext) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net), - nft_net_tstamp(net), GFP_KERNEL); + /* removal must occur on priv->clone, if we are low on memory + * we have no choice and must fail the removal request. + */ + if (!m) + return NULL; + + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); if (IS_ERR(e)) return NULL; nft_set_elem_change_active(net, set, &e->ext); - return e; -} - -/** - * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * - * Return: deactivated element if found, NULL otherwise. - */ -static struct nft_elem_priv * -nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - - return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); + return &e->priv; } /** - * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data @@ -2077,6 +2106,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2089,46 +2120,37 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** - * nft_pipapo_walk() - Walk over elements + * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation + * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first - * field. + * field. @m is protected either by RCU read lock or by transaction mutex. */ -static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_pipapo_match *m, + struct nft_set_iter *iter) { - struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; - rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) - m = rcu_dereference(priv->match); - else - m = priv->clone; - - if (unlikely(!m)) - goto out; - for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; @@ -2143,19 +2165,51 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) - goto out; + return; cont: iter->count++; } +} -out: - rcu_read_unlock(); +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * Test if destructive action is needed or not, clone active backend if needed + * and call the real function to work on the data. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + + switch (iter->type) { + case NFT_ITER_UPDATE: + m = pipapo_maybe_clone(set); + if (!m) { + iter->err = -ENOMEM; + return; + } + + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_READ: + rcu_read_lock(); + m = rcu_dereference(priv->match); + nft_pipapo_do_walk(ctx, set, m, iter); + rcu_read_unlock(); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } /** @@ -2264,21 +2318,10 @@ static int nft_pipapo_init(const struct nft_set *set, f->mt = NULL; } - /* Create an initial clone of matching data for next insertion */ - priv->clone = pipapo_clone(m); - if (IS_ERR(priv->clone)) { - err = PTR_ERR(priv->clone); - goto out_free; - } - - priv->dirty = false; - rcu_assign_pointer(priv->match, m); return 0; -out_free: - free_percpu(m->scratch); out_scratch: kfree(m); @@ -2323,33 +2366,18 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; - int cpu; m = rcu_dereference_protected(priv->match, true); - if (m) { - rcu_barrier(); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(m, cpu); - free_percpu(m->scratch); - pipapo_free_fields(m); - kfree(m); - priv->match = NULL; - } if (priv->clone) { - m = priv->clone; - - nft_set_pipapo_match_destroy(ctx, set, m); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(priv->clone, cpu); - free_percpu(priv->clone->scratch); - - pipapo_free_fields(priv->clone); - kfree(priv->clone); + nft_set_pipapo_match_destroy(ctx, set, priv->clone); + pipapo_free_match(priv->clone); priv->clone = NULL; + } else { + nft_set_pipapo_match_destroy(ctx, set, m); } + + pipapo_free_match(m); } /** diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 24cd1ff73f98..4a2ff85ce1c4 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -155,14 +155,12 @@ struct nft_pipapo_match { * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; - bool dirty; unsigned long last_gc; }; @@ -280,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) return size; } +/** + * pipapo_resmap_init() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Initialize all bits covered by the first field to one, so that after + * the first step, only the matching bits of the first bit group remain. + * + * If other fields have a large bitmap, set remainder of res_map to 0. + */ +static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + for (i = 0; i < f->bsize; i++) + res_map[i] = ULONG_MAX; + + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} #endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index d08407d589ea..be7c16c79f71 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); @@ -1036,6 +1037,7 @@ nothing: /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables @@ -1051,7 +1053,8 @@ nothing: * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ -static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, +static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, + unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) @@ -1060,7 +1063,7 @@ static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, int i, ret = -1, b; if (first) - memset(map, 0xff, bsize * sizeof(*map)); + pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) @@ -1111,6 +1114,25 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation @@ -1137,8 +1159,14 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, bool map_index; int i, ret = 0; - if (unlikely(!irq_fpu_usable())) - return nft_pipapo_lookup(net, set, key, ext); + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + bool fallback_res = nft_pipapo_lookup(net, set, key, ext); + + local_bh_enable(); + return fallback_res; + } m = rcu_dereference(priv->match); @@ -1153,6 +1181,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); + local_bh_enable(); return false; } @@ -1161,7 +1190,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); - /* Starting map doesn't need to be set for this implementation */ + pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); @@ -1186,7 +1215,7 @@ next_match: } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } @@ -1202,7 +1231,7 @@ next_match: } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } @@ -1233,6 +1262,7 @@ out: if (i % 2) scratch->map_index = !map_index; kernel_fpu_end(); + local_bh_enable(); return ret >= 0; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e53..2e8ef16ff191 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { @@ -752,6 +750,46 @@ static void nft_rbtree_gc_init(const struct nft_set *set) priv->last_gc = jiffies; } +/* rbtree stores ranges as singleton elements, each range is composed of two + * elements ... + */ +static u32 nft_rbtree_ksize(u32 size) +{ + return size * 2; +} + +/* ... hide this detail to userspace. */ +static u32 nft_rbtree_usize(u32 size) +{ + if (!size) + return 0; + + return size / 2; +} + +static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + const void *key; + + node = rb_last(&priv->root); + if (!node) + return 0; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_rbtree_interval_end(rbe)) + return 0; + + key = nft_set_ext_key(&rbe->ext); + if (memchr(key, 1, set->klen)) + return 0; + + /* this is the all-zero no-match element. */ + return 1; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -770,5 +808,8 @@ const struct nft_set_type nft_set_rbtree_type = { .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, + .ksize = nft_rbtree_ksize, + .usize = nft_rbtree_usize, + .adjust_maxsize = nft_rbtree_adjust_maxsize, }, }; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index f30163e2ca62..35d0409b0095 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,7 +9,8 @@ struct nft_socket { enum nft_socket_keys key:8; - u8 level; + u8 level; /* cgroupv2 level to extract */ + u8 level_user; /* cgroupv2 level provided by userspace */ u8 len; union { u8 dreg; @@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo memcpy(dest, &cgid, sizeof(u64)); return true; } + +/* process context only, uses current->nsproxy. */ +static noinline int nft_socket_cgroup_subtree_level(void) +{ + struct cgroup *cgrp = cgroup_get_from_path("/"); + int level; + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + level = cgrp->level; + + cgroup_put(cgrp); + + if (level > 255) + return -ERANGE; + + if (WARN_ON_ONCE(level < 0)) + return -EINVAL; + + return level; +} #endif static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) @@ -110,13 +133,13 @@ static void nft_socket_eval(const struct nft_expr *expr, *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; case NFT_SOCKET_WILDCARD: if (!sk_fullsock(sk)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } nft_socket_wildcard(pkt, regs, sk, dest); break; @@ -124,7 +147,7 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFT_SOCKET_CGROUPV2: if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; #endif @@ -133,6 +156,7 @@ static void nft_socket_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +out_put_sk: if (sk != skb->sk) sock_gen_put(sk); } @@ -173,9 +197,10 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_SOCK_CGROUP_DATA case NFT_SOCKET_CGROUPV2: { unsigned int level; + int err; if (!tb[NFTA_SOCKET_LEVEL]) return -EINVAL; @@ -184,6 +209,17 @@ static int nft_socket_init(const struct nft_ctx *ctx, if (level > 255) return -EOPNOTSUPP; + err = nft_socket_cgroup_subtree_level(); + if (err < 0) + return err; + + priv->level_user = level; + + level += err; + /* Implies a giant cgroup tree */ + if (WARN_ON_ONCE(level > 255)) + return -EOPNOTSUPP; + priv->level = level; len = sizeof(u64); break; @@ -208,7 +244,7 @@ static int nft_socket_dump(struct sk_buff *skb, if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; if (priv->key == NFT_SOCKET_CGROUPV2 && - nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user))) return -1; return 0; } @@ -239,8 +275,7 @@ static bool nft_socket_reduce(struct nft_regs_track *track, } static int nft_socket_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 1d737f89dfc1..5d3e51825985 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -248,8 +248,7 @@ static void nft_synproxy_eval(const struct nft_expr *expr, } static int nft_synproxy_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 71412adb73d4..50481280abd2 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -254,14 +254,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR], &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], + err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT], &priv->sreg_port, sizeof(u16)); if (err < 0) return err; @@ -313,8 +313,7 @@ static int nft_tproxy_dump(struct sk_buff *skb, } static int nft_tproxy_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index f735d79d8be5..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -174,8 +174,8 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; + IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; - __be16 flags; }; struct nft_tunnel_obj { @@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); - opts->flags = TUNNEL_VXLAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } @@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); - opts->flags = TUNNEL_ERSPAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } @@ -333,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); - opts->flags = TUNNEL_GENEVE_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } @@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct nft_tunnel_opts *opts) { struct nlattr *nla; - __be16 type = 0; int err, rem; + u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); @@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) @@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; @@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); - info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); @@ -483,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) - info.key.tun_flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) - info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) - info.key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); - if (tb[NFTA_TUNNEL_KEY_TTL]) - info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]); - else - info.key.ttl = U8_MAX; + info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX); if (tb[NFTA_TUNNEL_KEY_OPTS]) { err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], @@ -503,13 +506,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return err; } - md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, + GFP_KERNEL_ACCOUNT); if (!md) return -ENOMEM; memcpy(&md->u.tun_info, &info, sizeof(info)); #ifdef CONFIG_DST_CACHE - err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL); + err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); if (err < 0) { metadata_dst_free(md); return err; @@ -583,7 +587,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!nest) return -1; - if (opts->flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; @@ -591,7 +595,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto failure; @@ -613,15 +617,15 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_GENEVE_OPT) { + } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, @@ -630,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; @@ -658,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb, { u32 flags = 0; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; - if (!(info->key.tun_flags & TUNNEL_CSUM)) + if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; - if (info->key.tun_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 1c866757db55..3210cfc966ab 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, @@ -229,8 +230,7 @@ static int nft_xfrm_get_dump(struct sk_buff *skb, return 0; } -static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index da5d929c7c85..709840612f0d 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1269,7 +1269,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) - if (strcmp(t->name, name) == 0) + if (strcmp(t->name, name) == 0 && owner == t->me) return t; module_put(owner); diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index c8a639f56168..9d99f5a3d176 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -63,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) return 0; } -static struct xt_target checksum_tg_reg __read_mostly = { - .name = "CHECKSUM", - .family = NFPROTO_UNSPEC, - .target = checksum_tg, - .targetsize = sizeof(struct xt_CHECKSUM_info), - .table = "mangle", - .checkentry = checksum_tg_check, - .me = THIS_MODULE, +static struct xt_target checksum_tg_reg[] __read_mostly = { + { + .name = "CHECKSUM", + .family = NFPROTO_IPV4, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CHECKSUM", + .family = NFPROTO_IPV6, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#endif }; static int __init checksum_tg_init(void) { - return xt_register_target(&checksum_tg_reg); + return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } static void __exit checksum_tg_exit(void) { - xt_unregister_target(&checksum_tg_reg); + xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } module_init(checksum_tg_init); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 0accac98dea7..0ae8d8a1216e 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = { { .name = "CLASSIFY", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING), .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, @@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init classify_tg_init(void) diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 76acecf3e757..1494b3ee30e1 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connsecmark_tg_reg __read_mostly = { - .name = "CONNSECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, +static struct xt_target connsecmark_tg_reg[] __read_mostly = { + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init connsecmark_tg_init(void) { - return xt_register_target(&connsecmark_tg_reg); + return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } static void __exit connsecmark_tg_exit(void) { - xt_unregister_target(&connsecmark_tg_reg); + xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } module_init(connsecmark_tg_init); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 2be2f7a7b60f..3ba94c34297c 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -313,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) xt_ct_tg_destroy(par, par->targinfo); } +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->_nfct != 0) + return XT_CONTINUE; + + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return XT_CONTINUE; +} + static struct xt_target xt_ct_tg_reg[] __read_mostly = { { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV4, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .targetsize = sizeof(struct xt_ct_target_info), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, @@ -327,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -339,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -349,49 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .table = "raw", .me = THIS_MODULE, }, -}; - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - /* Previously seen (loopback)? Ignore. */ - if (skb->_nfct != 0) - return XT_CONTINUE; - - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - - return XT_CONTINUE; -} - -static struct xt_target notrack_tg_reg __read_mostly = { - .name = "NOTRACK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV6, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +#endif }; static int __init xt_ct_tg_init(void) { - int ret; - - ret = xt_register_target(¬rack_tg_reg); - if (ret < 0) - return ret; - - ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - if (ret < 0) { - xt_unregister_target(¬rack_tg_reg); - return ret; - } - return 0; + return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); } static void __exit xt_ct_tg_exit(void) { xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - xt_unregister_target(¬rack_tg_reg); } module_init(xt_ct_tg_init); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index db720efa811d..d73957592c9d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -100,21 +100,19 @@ static void idletimer_tg_work(struct work_struct *work) static void idletimer_tg_expired(struct timer_list *t) { - struct idletimer_tg *timer = from_timer(timer, t, timer); + struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } -static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, - ktime_t now) +static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); - return ALARMTIMER_NORESTART; } static int idletimer_check_sysfs_name(const char *name, unsigned int size) @@ -170,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -231,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return 0; @@ -256,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -277,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -322,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -384,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -409,21 +407,23 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - timer_shutdown_sync(&info->timer->timer); - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + timer_shutdown_sync(&info->timer->timer); + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) @@ -434,52 +434,75 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) mutex_lock(&list_mutex); - if (--info->timer->refcnt == 0) { - pr_debug("deleting timer %s\n", info->label); - - list_del(&info->timer->entry); - if (info->timer->timer_type & XT_IDLETIMER_ALARM) { - alarm_cancel(&info->timer->alarm); - } else { - timer_shutdown_sync(&info->timer->timer); - } - cancel_work_sync(&info->timer->work); - sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); - kfree(info->timer->attr.attr.name); - kfree(info->timer); - } else { + if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); + mutex_unlock(&list_mutex); + return; } + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); mutex_unlock(&list_mutex); + + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + alarm_cancel(&info->timer->alarm); + } else { + timer_shutdown_sync(&info->timer->timer); + } + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .target = idletimer_tg_target, - .targetsize = sizeof(struct idletimer_tg_info), - .usersize = offsetof(struct idletimer_tg_info, timer), - .checkentry = idletimer_tg_checkentry, - .destroy = idletimer_tg_destroy, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, }, { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .revision = 1, - .target = idletimer_tg_target_v1, - .targetsize = sizeof(struct idletimer_tg_info_v1), - .usersize = offsetof(struct idletimer_tg_info_v1, timer), - .checkentry = idletimer_tg_checkentry_v1, - .destroy = idletimer_tg_destroy_v1, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, }, - - +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, +#endif }; static struct class *idletimer_tg_class; diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 36c9720ad8d6..90dcf088071a 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -72,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) static void led_timeout_callback(struct timer_list *t) { - struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t, - timer); + struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal, + t, + timer); led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); } @@ -96,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par) struct xt_led_info_internal *ledinternal; int err; - if (ledinfo->id[0] == '\0') + /* Bail out if empty string or not a string at all. */ + if (ledinfo->id[0] == '\0' || + !memchr(ledinfo->id, '\0', sizeof(ledinfo->id))) return -EINVAL; mutex_lock(&xt_led_mutex); @@ -175,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) kfree(ledinternal); } -static struct xt_target led_tg_reg __read_mostly = { - .name = "LED", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = led_tg, - .targetsize = sizeof(struct xt_led_info), - .usersize = offsetof(struct xt_led_info, internal_data), - .checkentry = led_tg_check, - .destroy = led_tg_destroy, - .me = THIS_MODULE, +static struct xt_target led_tg_reg[] __read_mostly = { + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV4, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV6, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init led_tg_init(void) { - return xt_register_target(&led_tg_reg); + return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } static void __exit led_tg_exit(void) { - xt_unregister_target(&led_tg_reg); + xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } module_init(led_tg_init); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index e660c3710a10..6dcf4bc7e30b 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -64,25 +64,39 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par) nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } -static struct xt_target nflog_tg_reg __read_mostly = { - .name = "NFLOG", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = nflog_tg_check, - .destroy = nflog_tg_destroy, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, +static struct xt_target nflog_tg_reg[] __read_mostly = { + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#endif }; static int __init nflog_tg_init(void) { - return xt_register_target(&nflog_tg_reg); + return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } static void __exit nflog_tg_exit(void) { - xt_unregister_target(&nflog_tg_reg); + xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } module_init(nflog_tg_init); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 80f6624e2355..4f49cfc27831 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -179,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) xt_rateest_put(par->net, info->est); } -static struct xt_target xt_rateest_tg_reg __read_mostly = { - .name = "RATEEST", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct xt_rateest_target_info), - .usersize = offsetof(struct xt_rateest_target_info, est), - .me = THIS_MODULE, +static struct xt_target xt_rateest_tg_reg[] __read_mostly = { + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV4, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV6, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#endif }; static __net_init int xt_rateest_net_init(struct net *net) @@ -214,12 +229,12 @@ static int __init xt_rateest_tg_init(void) if (err) return err; - return xt_register_target(&xt_rateest_tg_reg); + return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { - xt_unregister_target(&xt_rateest_tg_reg); + xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 498a0bf6f044..5bc5ea505eb9 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -157,7 +157,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, @@ -167,7 +167,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, @@ -175,6 +175,29 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, +#endif }; static int __init secmark_tg_init(void) diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 5582dce98cae..a642ff09fc8e 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -29,25 +29,39 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .checkentry = trace_tg_check, - .destroy = trace_tg_destroy, - .me = THIS_MODULE, +static struct xt_target trace_tg_reg[] __read_mostly = { + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV4, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV6, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init trace_tg_init(void) { - return xt_register_target(&trace_tg_reg); + return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } static void __exit trace_tg_exit(void) { - xt_unregister_target(&trace_tg_reg); + xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } module_init(trace_tg_init); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e9b2181e8c42..a77088943107 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { }, { .name = "addrtype", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "addrtype", + .family = NFPROTO_IPV6, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + }, +#endif }; static int __init addrtype_mt_init(void) diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c6..c437fbd59ec1 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index a047a545371e..908fd5f2c3c8 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_match xt_cluster_match __read_mostly = { - .name = "cluster", - .family = NFPROTO_UNSPEC, - .match = xt_cluster_mt, - .checkentry = xt_cluster_mt_checkentry, - .matchsize = sizeof(struct xt_cluster_match_info), - .destroy = xt_cluster_mt_destroy, - .me = THIS_MODULE, +static struct xt_match xt_cluster_match[] __read_mostly = { + { + .name = "cluster", + .family = NFPROTO_IPV4, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "cluster", + .family = NFPROTO_IPV6, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init xt_cluster_mt_init(void) { - return xt_register_match(&xt_cluster_match); + return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } static void __exit xt_cluster_mt_fini(void) { - xt_unregister_match(&xt_cluster_match); + xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 93cb018c3055..2aabdcea8707 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) + if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); + return ret; + } /* * This filter cannot function correctly unless connection tracking diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 5d04ef80a61d..0189f8b6b0bd 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -86,6 +86,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; unsigned int keylen; + int ret; keylen = sizeof(u32); if (par->family == NFPROTO_IPV6) @@ -93,8 +94,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) else keylen += sizeof(struct in_addr); + ret = nf_ct_netns_get(par->net, par->family); + if (ret < 0) { + pr_info_ratelimited("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + /* init private data */ - info->data = nf_conncount_init(par->net, par->family, keylen); + info->data = nf_conncount_init(par->net, keylen); + if (IS_ERR(info->data)) + nf_ct_netns_put(par->net, par->family); return PTR_ERR_OR_ZERO(info->data); } @@ -103,29 +113,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; - nf_conncount_destroy(par->net, par->family, info->data); + nf_conncount_destroy(par->net, info->data); + nf_ct_netns_put(par->net, par->family); } -static struct xt_match connlimit_mt_reg __read_mostly = { - .name = "connlimit", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .usersize = offsetof(struct xt_connlimit_info, data), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { - xt_unregister_match(&connlimit_mt_reg); + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index ad3c033db64e..4277084de2e7 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -151,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), @@ -161,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 2, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static struct xt_match connmark_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 0859b8f76764..3b507694e81e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -15,7 +15,6 @@ #include <linux/random.h> #include <linux/jhash.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/list.h> @@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, if (size < 16) size = 16; } - /* FIXME: don't use vmalloc() here or anywhere else -HW */ - hinfo = vmalloc(struct_size(hinfo, hash, size)); + hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; @@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) { - vfree(hinfo); + kvfree(hinfo); return ret; } @@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, hinfo->rnd_initialized = false; hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); @@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, ops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } hinfo->net = net; @@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select unsigned int i; for (i = 0; i < ht->cfg.size; i++) { + struct hlist_head *head = &ht->hash[i]; struct dsthash_ent *dh; struct hlist_node *n; + if (hlist_empty(head)) + continue; + spin_lock_bh(&ht->lock); - hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { + hlist_for_each_entry_safe(dh, n, head, node) { if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } @@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) cancel_delayed_work_sync(&hinfo->gc_work); htable_selective_cleanup(hinfo, true); kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); } } diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 1ad74b5920b5..59b9d04400ca 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static struct xt_target mark_tg_reg __read_mostly = { - .name = "MARK", - .revision = 2, - .family = NFPROTO_UNSPEC, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, +static struct xt_target mark_tg_reg[] __read_mostly = { + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV4, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_ARP, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV6, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif }; static struct xt_match mark_mt_reg __read_mostly = { @@ -61,12 +83,12 @@ static int __init mark_mt_init(void) { int ret; - ret = xt_register_target(&mark_tg_reg); + ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&mark_mt_reg); if (ret < 0) { - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); return ret; } return 0; @@ -75,7 +97,7 @@ static int __init mark_mt_init(void) static void __exit mark_mt_exit(void) { xt_unregister_match(&mark_mt_reg); - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } module_init(mark_mt_init); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index ef93e0d3bee0..588a5e6ad899 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -59,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil /* retained for backwards compatibility */ static unsigned int ip_pkt_list_tot __read_mostly; module_param(ip_pkt_list_tot, uint, 0400); -MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)"); -#define XT_RECENT_MAX_NSTAMPS 256 +#define XT_RECENT_MAX_NSTAMPS 65536 struct recent_entry { struct list_head list; @@ -69,7 +69,7 @@ struct recent_entry { union nf_inet_addr addr; u_int16_t family; u_int8_t ttl; - u_int8_t index; + u_int16_t index; u_int16_t nstamps; unsigned long stamps[]; }; @@ -80,7 +80,7 @@ struct recent_table { union nf_inet_addr mask; unsigned int refcnt; unsigned int entries; - u8 nstamps_max_mask; + u_int16_t nstamps_max_mask; struct list_head lru_list; struct list_head iphash[]; }; diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 5d1fb7018dba..600060ca940a 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strscpy_pad(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy(tbl->repl.name, info->name); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ |