Diffstat (limited to 'net/netfilter')
-rw-r--r--   net/netfilter/ipset/ip_set_hash_gen.h     |   8
-rw-r--r--   net/netfilter/ipvs/ip_vs_conn.c           |   4
-rw-r--r--   net/netfilter/ipvs/ip_vs_core.c           |  11
-rw-r--r--   net/netfilter/ipvs/ip_vs_ctl.c            |   6
-rw-r--r--   net/netfilter/ipvs/ip_vs_est.c            |  16
-rw-r--r--   net/netfilter/ipvs/ip_vs_ftp.c            |   4
-rw-r--r--   net/netfilter/nf_conntrack_ecache.c       |   2
-rw-r--r--   net/netfilter/nf_conntrack_netlink.c      |  39
-rw-r--r--   net/netfilter/nf_conntrack_standalone.c   |   3
-rw-r--r--   net/netfilter/nf_tables_api.c             |  47
-rw-r--r--   net/netfilter/nfnetlink.c                 |   2
-rw-r--r--   net/netfilter/nft_flow_offload.c          |   4
-rw-r--r--   net/netfilter/nft_payload.c               |  20
-rw-r--r--   net/netfilter/nft_set_hash.c              | 100
-rw-r--r--   net/netfilter/nft_set_pipapo.c            |  96
-rw-r--r--   net/netfilter/nft_set_pipapo.h            |   8
-rw-r--r--   net/netfilter/nft_set_pipapo_avx2.c       | 142
-rw-r--r--   net/netfilter/nft_set_pipapo_avx2.h       |   4
-rw-r--r--   net/netfilter/nft_set_rbtree.c            |  35
19 files changed, 348 insertions(+), 203 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 5251524b96af..5e4453e9ef8e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -63,7 +63,7 @@ struct hbucket {
     : jhash_size((htable_bits) - HTABLE_REGION_BITS))
 #define ahash_sizeof_regions(htable_bits)  \
     (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
-#define ahash_region(n, htable_bits)  \
+#define ahash_region(n)  \
     ((n) / jhash_size(HTABLE_REGION_BITS))
 #define ahash_bucket_start(h, htable_bits)  \
     ((htable_bits) < HTABLE_REGION_BITS ? 0 \
@@ -702,7 +702,7 @@ retry:
 #endif
     key = HKEY(data, h->initval, htable_bits);
     m = __ipset_dereference(hbucket(t, key));
-    nr = ahash_region(key, htable_bits);
+    nr = ahash_region(key);
     if (!m) {
         m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize,
@@ -852,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
     rcu_read_lock_bh();
     t = rcu_dereference_bh(h->table);
     key = HKEY(value, h->initval, t->htable_bits);
-    r = ahash_region(key, t->htable_bits);
+    r = ahash_region(key);
     atomic_inc(&t->uref);
     elements = t->hregion[r].elements;
     maxelem = t->maxelem;
@@ -1050,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
     rcu_read_lock_bh();
     t = rcu_dereference_bh(h->table);
     key = HKEY(value, h->initval, t->htable_bits);
-    r = ahash_region(key, t->htable_bits);
+    r = ahash_region(key);
     atomic_inc(&t->uref);
     rcu_read_unlock_bh();
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 965f3c8e5089..37ebb0cb62b8 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -885,7 +885,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
          * conntrack cleanup for the net.
          */
         smp_rmb();
-        if (ipvs->enable)
+        if (READ_ONCE(ipvs->enable))
             ip_vs_conn_drop_conntrack(cp);
     }
@@ -1439,7 +1439,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
         cond_resched_rcu();

         /* netns clean up started, abort delayed work */
-        if (!ipvs->enable)
+        if (!READ_ONCE(ipvs->enable))
             break;
     }
     rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c7a8a08b7308..5ea7ab8bf4dc 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1353,9 +1353,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
     if (unlikely(!skb_dst(skb)))
         return NF_ACCEPT;

-    if (!ipvs->enable)
-        return NF_ACCEPT;
-
     ip_vs_fill_iph_skb(af, skb, false, &iph);
 #ifdef CONFIG_IP_VS_IPV6
     if (af == AF_INET6) {
@@ -1940,7 +1937,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
         return NF_ACCEPT;
     }
     /* ipvs enabled in this netns ? */
-    if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+    if (unlikely(sysctl_backup_only(ipvs)))
         return NF_ACCEPT;

     ip_vs_fill_iph_skb(af, skb, false, &iph);
@@ -2108,7 +2105,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
     int r;

     /* ipvs enabled in this netns ? */
-    if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+    if (unlikely(sysctl_backup_only(ipvs)))
         return NF_ACCEPT;

     if (state->pf == NFPROTO_IPV4) {
@@ -2295,7 +2292,7 @@ static int __net_init __ip_vs_init(struct net *net)
         return -ENOMEM;

     /* Hold the beast until a service is registered */
-    ipvs->enable = 0;
+    WRITE_ONCE(ipvs->enable, 0);
     ipvs->net = net;
     /* Counters used for creating unique names */
     ipvs->gen = atomic_read(&ipvs_netns_cnt);
@@ -2367,7 +2364,7 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
         ipvs = net_ipvs(net);
         ip_vs_unregister_hooks(ipvs, AF_INET);
         ip_vs_unregister_hooks(ipvs, AF_INET6);
-        ipvs->enable = 0;    /* Disable packet reception */
+        WRITE_ONCE(ipvs->enable, 0);    /* Disable packet reception */
         smp_wmb();
         ip_vs_sync_net_cleanup(ipvs);
     }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6a6fc4478533..4c8fa22be88a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -256,7 +256,7 @@ static void est_reload_work_handler(struct work_struct *work)
         struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

         /* netns clean up started, abort delayed work */
-        if (!ipvs->enable)
+        if (!READ_ONCE(ipvs->enable))
             goto unlock;
         if (!kd)
             continue;
@@ -1483,9 +1483,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,

     *svc_p = svc;

-    if (!ipvs->enable) {
+    if (!READ_ONCE(ipvs->enable)) {
         /* Now there is a service - full throttle */
-        ipvs->enable = 1;
+        WRITE_ONCE(ipvs->enable, 1);

         /* Start estimation for first time */
         ip_vs_est_reload_start(ipvs);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 15049b826732..93a925f1ed9b 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -231,7 +231,7 @@ static int ip_vs_estimation_kthread(void *data)
 void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
 {
     /* Ignore reloads before first service is added */
-    if (!ipvs->enable)
+    if (!READ_ONCE(ipvs->enable))
         return;
     ip_vs_est_stopped_recalc(ipvs);
     /* Bump the kthread configuration genid */
@@ -306,7 +306,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
     int i;

     if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
-        ipvs->enable && ipvs->est_max_threads)
+        READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
         return -EINVAL;

     mutex_lock(&ipvs->est_mutex);
@@ -343,7 +343,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
     }

     /* Start kthread tasks only when services are present */
-    if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+    if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
         ret = ip_vs_est_kthread_start(ipvs, kd);
         if (ret < 0)
             goto out;
@@ -486,7 +486,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
     struct ip_vs_estimator *est = &stats->est;
     int ret;

-    if (!ipvs->est_max_threads && ipvs->enable)
+    if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
         ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

     est->ktid = -1;
@@ -663,7 +663,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
             /* Wait for cpufreq frequency transition */
             wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50);
-            if (!ipvs->enable || kthread_should_stop())
+            if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
                 goto stop;
         }
@@ -681,7 +681,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
     rcu_read_unlock();
     local_bh_enable();

-    if (!ipvs->enable || kthread_should_stop())
+    if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
         goto stop;
     cond_resched();
@@ -757,7 +757,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
     mutex_lock(&ipvs->est_mutex);
     for (id = 1; id < ipvs->est_kt_count; id++) {
         /* netns clean up started, abort */
-        if (!ipvs->enable)
+        if (!READ_ONCE(ipvs->enable))
             goto unlock2;
         kd = ipvs->est_kt_arr[id];
         if (!kd)
@@ -787,7 +787,7 @@ last_kt:
     id = ipvs->est_kt_count;

 next_kt:
-    if (!ipvs->enable || kthread_should_stop())
+    if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
         goto unlock;
     id--;
     if (id < 0)
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index d8a284999544..206c6700e200 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -53,6 +53,7 @@ enum {
     IP_VS_FTP_EPSV,
 };

+static bool exiting_module;
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
  * First port is set to the default port.
@@ -605,7 +606,7 @@ static void __ip_vs_ftp_exit(struct net *net)
 {
     struct netns_ipvs *ipvs = net_ipvs(net);

-    if (!ipvs)
+    if (!ipvs || !exiting_module)
         return;

     unregister_ip_vs_app(ipvs, &ip_vs_ftp);
@@ -627,6 +628,7 @@ static int __init ip_vs_ftp_init(void)
  */
 static void __exit ip_vs_ftp_exit(void)
 {
+    exiting_module = true;
     unregister_pernet_subsys(&ip_vs_ftp_ops);
     /* rcu_barrier() is called by netns */
 }
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index af68c64acaab..81baf2082604 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -301,7 +301,7 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
         net->ct.ecache_dwork_pending = true;
     } else if (state == NFCT_ECACHE_DESTROY_SENT) {
         if (!hlist_nulls_empty(&cnet->ecache.dying_list))
-            mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+            mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0);
         else
             net->ct.ecache_dwork_pending = false;
     }
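A note on the ipvs->enable conversion above: the flag is flipped by the control plane and checked locklessly from packet processing and the estimator kthreads, so the accesses are annotated to rule out load/store tearing. A minimal sketch of the idiom follows; the flag and function names are illustrative, not from the patch:

/* Sketch of the READ_ONCE()/WRITE_ONCE() pattern applied to ipvs->enable:
 * one side writes the flag, many sides read it without a lock. The macros
 * stop the compiler from tearing, fusing or re-reading the access;
 * ordering against other data still needs barriers (the IPVS code pairs
 * the flag with smp_wmb()/smp_rmb()).
 */
#include <linux/compiler.h>

static int pkt_enabled;

static void control_enable(void)
{
    WRITE_ONCE(pkt_enabled, 1);    /* paired with READ_ONCE() below */
}

static bool datapath_active(void)
{
    return READ_ONCE(pkt_enabled);
}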
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 50fd6809380f..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -60,7 +60,7 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("List and change connection tracking table");

 struct ctnetlink_list_dump_ctx {
-    struct nf_conn *last;
+    unsigned long last_id;
     unsigned int cpu;
     bool done;
 };
@@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
     return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
 }

-static int ctnetlink_done_list(struct netlink_callback *cb)
-{
-    struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-
-    if (ctx->last)
-        nf_ct_put(ctx->last);
-
-    return 0;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 static int ctnetlink_dump_one_entry(struct sk_buff *skb,
                                     struct netlink_callback *cb,
@@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
     if (l3proto && nf_ct_l3num(ct) != l3proto)
         return 0;

-    if (ctx->last) {
-        if (ct != ctx->last)
+    if (ctx->last_id) {
+        if (ctnetlink_get_id(ct) != ctx->last_id)
             return 0;

-        ctx->last = NULL;
+        ctx->last_id = 0;
     }

     /* We can't dump extension info for the unconfirmed
@@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
                             cb->nlh->nlmsg_seq,
                             NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
                             ct, dying, 0);
-    if (res < 0) {
-        if (!refcount_inc_not_zero(&ct->ct_general.use))
-            return 0;
-
-        ctx->last = ct;
-    }
+    if (res < 0)
+        ctx->last_id = ctnetlink_get_id(ct);

     return res;
 }
@@ -1796,10 +1782,10 @@ static int
 ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 {
     struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-    struct nf_conn *last = ctx->last;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
     const struct net *net = sock_net(skb->sk);
     struct nf_conntrack_net_ecache *ecache_net;
+    unsigned long last_id = ctx->last_id;
     struct nf_conntrack_tuple_hash *h;
     struct hlist_nulls_node *n;
 #endif
@@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
     if (ctx->done)
         return 0;

-    ctx->last = NULL;
+    ctx->last_id = 0;

 #ifdef CONFIG_NF_CONNTRACK_EVENTS
     ecache_net = nf_conn_pernet_ecache(net);
@@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
         int res;

         ct = nf_ct_tuplehash_to_ctrack(h);
-        if (last && last != ct)
+        if (last_id && last_id != ctnetlink_get_id(ct))
             continue;

         res = ctnetlink_dump_one_entry(skb, cb, ct, true);
         if (res < 0) {
             spin_unlock_bh(&ecache_net->dying_lock);
-            nf_ct_put(last);
             return skb->len;
         }

-        nf_ct_put(last);
-        last = NULL;
+        last_id = 0;
     }

     spin_unlock_bh(&ecache_net->dying_lock);
 #endif
     ctx->done = true;
-    nf_ct_put(last);

     return skb->len;
 }
@@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
     if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
         struct netlink_dump_control c = {
             .dump = ctnetlink_dump_dying,
-            .done = ctnetlink_done_list,
         };
         return netlink_dump_start(info->sk, skb, info->nlh, &c);
     }
@@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
     if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
         struct netlink_dump_control c = {
             .dump = ctnetlink_dump_unconfirmed,
-            .done = ctnetlink_done_list,
         };
         return netlink_dump_start(info->sk, skb, info->nlh, &c);
     }
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 1f14ef0436c6..708b79380f04 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
     smp_acquire__after_ctrl_dep();

     if (nf_ct_should_gc(ct)) {
+        struct ct_iter_state *st = s->private;
+
+        st->skip_elems--;
         nf_ct_kill(ct);
         goto release;
     }
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c3c73411c40c..eed434e0a970 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
     bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
 }

-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
-                                             int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+                                         int msg_type, u32 size)
 {
     struct nft_trans *trans;

-    trans = kzalloc(size, gfp);
+    trans = kzalloc(size, GFP_KERNEL);
     if (trans == NULL)
         return NULL;

@@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
     return trans;
 }

-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
-                                         int msg_type, u32 size)
-{
-    return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
-}
-
 static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
 {
     switch (trans->msg_type) {
@@ -442,8 +436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a,

 static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
                                         struct nft_trans_elem *tail,
-                                        struct nft_trans_elem *trans,
-                                        gfp_t gfp)
+                                        struct nft_trans_elem *trans)
 {
     unsigned int nelems, old_nelems = tail->nelems;
     struct nft_trans_elem *new_trans;
@@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
     /* krealloc might free tail which invalidates list pointers */
     list_del_init(&tail->nft_trans.list);

-    new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
+    new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+                         GFP_KERNEL);
     if (!new_trans) {
-        list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
+        list_add_tail(&tail->nft_trans.list,
+                      &nft_net->commit_list);
         return false;
     }

@@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
 }

 static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
-                                   struct nft_trans *trans, gfp_t gfp)
+                                   struct nft_trans *trans)
 {
     struct nft_trans *tail;

@@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
     case NFT_MSG_DELSETELEM:
         return nft_trans_collapse_set_elem(nft_net,
                                            nft_trans_container_elem(tail),
-                                           nft_trans_container_elem(trans), gfp);
+                                           nft_trans_container_elem(trans));
     }

     return false;
@@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
     }
 }

-static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans,
-                                           gfp_t gfp)
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
 {
     struct nftables_pernet *nft_net = nft_pernet(net);

     WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
                  trans->msg_type != NFT_MSG_DELSETELEM);

-    might_alloc(gfp);
-
-    if (nft_trans_try_collapse(nft_net, trans, gfp)) {
+    if (nft_trans_try_collapse(nft_net, trans)) {
         kfree(trans);
         return;
     }
@@ -7573,7 +7565,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
             }

             ue->priv = elem_priv;
-            nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+            nft_trans_commit_list_add_elem(ctx->net, trans);
             goto err_elem_free;
         }
     }
@@ -7597,7 +7589,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
     }

     nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
-    nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+    nft_trans_commit_list_add_elem(ctx->net, trans);
     return 0;

 err_set_full:
@@ -7863,7 +7855,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
     nft_setelem_data_deactivate(ctx->net, set, elem.priv);

     nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
-    nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+    nft_trans_commit_list_add_elem(ctx->net, trans);
     return 0;

 fail_ops:
@@ -7888,9 +7880,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
     if (!nft_set_elem_active(ext, iter->genmask))
         return 0;

-    trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
-                                struct_size_t(struct nft_trans_elem, elems, 1),
-                                GFP_ATOMIC);
+    trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+                            struct_size_t(struct nft_trans_elem, elems, 1));
     if (!trans)
         return -ENOMEM;

@@ -7901,7 +7892,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
     nft_trans_elem_set(trans) = set;
     nft_trans_container_elem(trans)->nelems = 1;
     nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
-    nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC);
+    nft_trans_commit_list_add_elem(ctx->net, trans);

     return 0;
 }
@@ -7918,7 +7909,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
     nft_setelem_data_deactivate(ctx->net, set, elem_priv);
     nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
-    nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+    nft_trans_commit_list_add_elem(ctx->net, trans);

     return 0;
 }
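The ctnetlink change above stops pinning a conntrack entry across dump rounds and remembers an identifier instead, so no reference (and no ->done() cleanup callback) can outlive the netlink dump. A rough standalone sketch of that resume-by-cookie pattern; all names here are illustrative stand-ins, not the kernel's types:

/* Resume an interrupted dump by id instead of by pointer. Holding a
 * pointer needs a refcount and a cleanup hook; a numeric cookie needs
 * no cleanup and cannot dangle if the entry is freed meanwhile.
 */
#include <stdbool.h>

struct entry {
    unsigned long id;
    struct entry *next;
};

struct dump_ctx {
    unsigned long last_id;    /* 0 means: start from the beginning */
};

/* Returns false when the output buffer filled up; caller re-invokes. */
static bool dump(struct dump_ctx *ctx, struct entry *head,
                 bool (*emit)(struct entry *))
{
    for (struct entry *e = head; e; e = e->next) {
        /* Skip forward until the remembered resume point. */
        if (ctx->last_id && e->id != ctx->last_id)
            continue;
        ctx->last_id = 0;

        if (!emit(e)) {
            ctx->last_id = e->id;    /* resume here next time */
            return false;
        }
    }
    return true;
}

If the remembered entry is gone by the next round, the skip loop simply runs off the end and the dump terminates, which is an acceptable outcome for these best-effort listings.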
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e598a2a252b0..811d02b4c4f7 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
     const struct nfnetlink_subsystem *ss;
     const struct nfnl_callback *nc;
     struct netlink_ext_ack extack;
+    struct nlmsghdr *onlh = nlh;
     LIST_HEAD(err_list);
     u32 status;
     int err;
@@ -386,6 +387,7 @@ replay:
     status = 0;
 replay_abort:
     skb = netlink_skb_clone(oskb, GFP_KERNEL);
+    nlh = onlh;
     if (!skb)
         return netlink_ack(oskb, nlh, -ENOMEM, NULL);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 225ff293cd50..14dd1c0698c3 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -9,7 +9,7 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/ip.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
         fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
         fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
         fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
-        fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
+        fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
         fl.u.ip4.flowi4_mark = pkt->skb->mark;
         fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
         break;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7dfc5343dae4..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
 /* add vlan header into the user buffer for if tag was removed by offloads */
 static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
 {
     int mac_off = skb_mac_header(skb) - skb->data;
     u8 *vlanh, *dst_u8 = (u8 *) d;
@@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
     [NFTA_PAYLOAD_SREG]        = { .type = NLA_U32 },
     [NFTA_PAYLOAD_DREG]        = { .type = NLA_U32 },
     [NFTA_PAYLOAD_BASE]        = { .type = NLA_U32 },
-    [NFTA_PAYLOAD_OFFSET]      = NLA_POLICY_MAX(NLA_BE32, 255),
+    [NFTA_PAYLOAD_OFFSET]      = { .type = NLA_BE32 },
     [NFTA_PAYLOAD_LEN]         = NLA_POLICY_MAX(NLA_BE32, 255),
     [NFTA_PAYLOAD_CSUM_TYPE]   = { .type = NLA_U32 },
     [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
@@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = {
 static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
 {
-    *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+    csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
     if (*sum == 0)
         *sum = CSUM_MANGLED_0;
 }
@@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
 struct nft_payload_set {
     enum nft_payload_bases    base:8;
-    u8                        offset;
+    u16                       offset;
     u8                        len;
     u8                        sreg;
     u8                        csum_type;
@@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr {
 };

 static bool
-nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len,
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
                      int *vlan_hlen)
 {
     struct nft_payload_vlan_hdr *vlanh;
@@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
                                 const struct nft_expr *expr,
                                 const struct nlattr * const tb[])
 {
+    u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
     struct nft_payload_set *priv = nft_expr_priv(expr);
-    u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
     int err;

     priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
-    priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
     priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));

+    err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+    if (err < 0)
+        return err;
+    priv->offset = offset;
+
     if (tb[NFTA_PAYLOAD_CSUM_TYPE])
         csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
     if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
@@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
     if (tb[NFTA_PAYLOAD_DREG] == NULL)
         return ERR_PTR(-EINVAL);

-    err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+    err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
     if (err < 0)
         return ERR_PTR(err);
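nft_csum_replace() above now delegates to csum_replace4() instead of open-coding the fold; both compute the same incremental one's-complement update. A small standalone model of that arithmetic (plain C, not the kernel helpers; RFC 1624 equation 3):

/* Incremental Internet-checksum update: when a 32-bit word in the
 * packet changes from "from" to "to", the stored 16-bit checksum can
 * be fixed up without rescanning the packet.
 */
#include <stdint.h>

static uint16_t csum_fold32(uint32_t sum)
{
    sum = (sum & 0xffff) + (sum >> 16);    /* fold carries twice */
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

static uint16_t checksum_update32(uint16_t check, uint32_t from, uint32_t to)
{
    /* ~check is the unfolded sum of the data; HC' = ~(~HC + ~m + m') */
    uint32_t sum = (uint16_t)~check;

    sum += (~from & 0xffff) + (~from >> 16);    /* subtract old word */
    sum += (to & 0xffff) + (to >> 16);          /* add new word */
    return csum_fold32(sum);
}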
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 266d0c637225..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -30,6 +30,7 @@ struct nft_rhash {
 struct nft_rhash_elem {
     struct nft_elem_priv    priv;
     struct rhash_head       node;
+    struct llist_node       walk_node;
     u32                     wq_gc_seq;
     struct nft_set_ext      ext;
 };
@@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key,
         goto err1;

     he = nft_elem_priv_cast(elem_priv);
+    init_llist_node(&he->walk_node);
     prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
                                             nft_rhash_params);
     if (IS_ERR(prev))
@@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
     };
     struct nft_rhash_elem *prev;

+    init_llist_node(&he->walk_node);
     prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
                                             nft_rhash_params);
     if (IS_ERR(prev))
@@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set,
     return true;
 }

-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
-                           struct nft_set_iter *iter)
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+                              struct nft_set_iter *iter)
 {
     struct nft_rhash *priv = nft_set_priv(set);
-    struct nft_rhash_elem *he;
     struct rhashtable_iter hti;
+    struct nft_rhash_elem *he;

     rhashtable_walk_enter(&priv->ht, &hti);
     rhashtable_walk_start(&hti);
@@ -295,6 +298,97 @@ cont:
     rhashtable_walk_exit(&hti);
 }

+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+                                  struct nft_set *set,
+                                  struct nft_set_iter *iter)
+{
+    struct nft_rhash *priv = nft_set_priv(set);
+    struct nft_rhash_elem *he, *tmp;
+    struct llist_node *first_node;
+    struct rhashtable_iter hti;
+    LLIST_HEAD(walk_list);
+
+    lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+    if (set->in_update_walk) {
+        /* This can happen with bogus rulesets during ruleset validation
+         * when a verdict map causes a jump back to the same map.
+         *
+         * Without this extra check the walk_next loop below will see
+         * elems on the callers walk_list and skip (not validate) them.
+         */
+        iter->err = -EMLINK;
+        return;
+    }
+
+    /* walk happens under RCU.
+     *
+     * We create a snapshot list so ->iter callback can sleep.
+     * commit_mutex is held, elements can ...
+     * .. be added in parallel from dataplane (dynset)
+     * .. be marked as dead in parallel from dataplane (dynset).
+     * .. be queued for removal in parallel (gc timeout).
+     * .. not be freed: transaction mutex is held.
+     */
+    rhashtable_walk_enter(&priv->ht, &hti);
+    rhashtable_walk_start(&hti);
+
+    while ((he = rhashtable_walk_next(&hti))) {
+        if (IS_ERR(he)) {
+            if (PTR_ERR(he) != -EAGAIN) {
+                iter->err = PTR_ERR(he);
+                break;
+            }
+
+            continue;
+        }
+
+        /* rhashtable resized during walk, skip */
+        if (llist_on_list(&he->walk_node))
+            continue;
+
+        llist_add(&he->walk_node, &walk_list);
+    }
+    rhashtable_walk_stop(&hti);
+    rhashtable_walk_exit(&hti);
+
+    first_node = __llist_del_all(&walk_list);
+    set->in_update_walk = true;
+    llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+        if (iter->err == 0) {
+            iter->err = iter->fn(ctx, set, iter, &he->priv);
+            if (iter->err == 0)
+                iter->count++;
+        }
+
+        /* all entries must be cleared again, else next ->walk iteration
+         * will skip entries.
+         */
+        init_llist_node(&he->walk_node);
+    }
+    set->in_update_walk = false;
+}
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+                           struct nft_set_iter *iter)
+{
+    switch (iter->type) {
+    case NFT_ITER_UPDATE:
+        /* only relevant for netlink dumps which use READ type */
+        WARN_ON_ONCE(iter->skip != 0);
+
+        nft_rhash_walk_update(ctx, set, iter);
+        break;
+    case NFT_ITER_READ:
+        nft_rhash_walk_ro(ctx, set, iter);
+        break;
+    default:
+        iter->err = -EINVAL;
+        WARN_ON_ONCE(1);
+        break;
+    }
+}
+
 static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
                                         struct nft_set_ext *ext)
 {
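nft_rhash_walk_update() above snapshots all elements onto a lock-free llist before invoking the callback, so the callback may sleep and an rhashtable resize mid-walk cannot produce duplicates. A skeleton of that snapshot-then-iterate pattern, using the same kernel llist API; the item type and callbacks are illustrative stand-ins:

/* Collect nodes on an llist first (llist_on_list() filters duplicates
 * if the table walk revisits an element), then run the possibly
 * sleeping callback over the frozen snapshot.
 */
#include <linux/llist.h>

struct item {
    struct llist_node walk_node;    /* init_llist_node()ed at insert */
};

static void walk_snapshot(struct item *(*next)(void),
                          int (*visit)(struct item *))
{
    struct llist_node *first;
    struct item *it, *tmp;
    LLIST_HEAD(snapshot);

    while ((it = next())) {
        if (llist_on_list(&it->walk_node))
            continue;    /* already snapshotted: skip */
        llist_add(&it->walk_node, &snapshot);
    }

    first = __llist_del_all(&snapshot);
    llist_for_each_entry_safe(it, tmp, first, walk_node) {
        visit(it);    /* may sleep */
        /* re-arm the node so the next walk does not skip it */
        init_llist_node(&it->walk_node);
    }
}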
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 793790d79d13..112fe46788b6 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
 }

 /**
- * pipapo_get() - Get matching element reference given key data
+ * pipapo_get_slow() - Get matching element reference given key data
  * @m:        storage containing the set elements
  * @data:     Key data to be matched against existing elements
  * @genmask:  If set, check that element is active in given genmask
@@ -414,12 +414,12 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
  *
  * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
  */
-static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
-                                          const u8 *data, u8 genmask,
-                                          u64 tstamp)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+                                               const u8 *data, u8 genmask,
+                                               u64 tstamp)
 {
+    unsigned long *res_map, *fill_map, *map;
     struct nft_pipapo_scratch *scratch;
-    unsigned long *res_map, *fill_map;
     const struct nft_pipapo_field *f;
     bool map_index;
     int i;
@@ -429,11 +429,13 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
     scratch = *raw_cpu_ptr(m->scratch);
     if (unlikely(!scratch))
         goto out;

+    __local_lock_nested_bh(&scratch->bh_lock);
     map_index = scratch->map_index;

-    res_map = scratch->map + (map_index ? m->bsize_max : 0);
-    fill_map = scratch->map + (map_index ? 0 : m->bsize_max);
+    map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+    res_map = map + (map_index ? m->bsize_max : 0);
+    fill_map = map + (map_index ? 0 : m->bsize_max);

     pipapo_resmap_init(m, res_map);

@@ -464,6 +466,7 @@ next_match:
                          last);
         if (b < 0) {
             scratch->map_index = map_index;
+            __local_unlock_nested_bh(&scratch->bh_lock);
             local_bh_enable();

             return NULL;
@@ -483,6 +486,7 @@ next_match:
              * *next* bitmap (not initial) for the next packet.
              */
             scratch->map_index = map_index;
+            __local_unlock_nested_bh(&scratch->bh_lock);
             local_bh_enable();

             return e;
         }
@@ -497,12 +501,47 @@ next_match:
         data += NFT_PIPAPO_GROUPS_PADDING(f);
     }

+    __local_unlock_nested_bh(&scratch->bh_lock);
 out:
     local_bh_enable();
     return NULL;
 }

 /**
+ * pipapo_get() - Get matching element reference given key data
+ * @m:        Storage containing the set elements
+ * @data:     Key data to be matched against existing elements
+ * @genmask:  If set, check that element is active in given genmask
+ * @tstamp:   Timestamp to check for expired elements
+ *
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+                                          const u8 *data, u8 genmask,
+                                          u64 tstamp)
+{
+    struct nft_pipapo_elem *e;
+
+    local_bh_disable();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+    if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+        e = pipapo_get_avx2(m, data, genmask, tstamp);
+        local_bh_enable();
+        return e;
+    }
+#endif
+    e = pipapo_get_slow(m, data, genmask, tstamp);
+    local_bh_enable();
+    return e;
+}
+
+/**
  * nft_pipapo_lookup() - Dataplane fronted for main lookup function
  * @net:    Network namespace
  * @set:    nftables API set representation
@@ -510,8 +549,7 @@ out:
  *
  * This function is called from the data path. It will search for
  * an element matching the given key in the current active copy.
- * Unlike other set types, this uses NFT_GENMASK_ANY instead of
- * nft_genmask_cur().
+ * Unlike other set types, this uses 0 instead of nft_genmask_cur().
  *
  * This is because new (future) elements are not reachable from
  * priv->match, they get added to priv->clone instead.
@@ -521,8 +559,8 @@ out:
  * inconsistent state: matching old entries get skipped but thew
  * newly matching entries are unreachable.
  *
- * GENMASK will still find the 'now old' entries which ensures consistent
- * priv->match view.
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get
+ * skipped, new-gen entries are only reachable from priv->clone.
 *
 * nft_pipapo_commit swaps ->clone and ->match shortly after the
 * genbit flip. As ->clone doesn't contain the old entries in the first
@@ -539,7 +577,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
     const struct nft_pipapo_elem *e;

     m = rcu_dereference(priv->match);
-    e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
+    e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64());

     return e ? &e->ext : NULL;
 }
@@ -1152,22 +1190,17 @@ static void pipapo_map(struct nft_pipapo_match *m,
 }

 /**
- * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address
+ * pipapo_free_scratch() - Free per-CPU map at original address
  * @m:    Matching data
  * @cpu:  CPU number
  */
 static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
 {
     struct nft_pipapo_scratch *s;
-    void *mem;

     s = *per_cpu_ptr(m->scratch, cpu);
-    if (!s)
-        return;
-
-    mem = s;
-    mem -= s->align_off;
-    kvfree(mem);
+    kvfree(s);
 }

 /**
@@ -1184,11 +1217,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,

     for_each_possible_cpu(i) {
         struct nft_pipapo_scratch *scratch;
-#ifdef NFT_PIPAPO_ALIGN
-        void *scratch_aligned;
-        u32 align_off;
-#endif
-        scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) +
+
+        scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
                                 NFT_PIPAPO_ALIGN_HEADROOM,
                                 GFP_KERNEL_ACCOUNT, cpu_to_node(i));
         if (!scratch) {
@@ -1203,23 +1233,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
         }

         pipapo_free_scratch(clone, i);
-
-#ifdef NFT_PIPAPO_ALIGN
-        /* Align &scratch->map (not the struct itself): the extra
-         * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node()
-         * above guarantee we can waste up to those bytes in order
-         * to align the map field regardless of its offset within
-         * the struct.
-         */
-        BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM);
-
-        scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map);
-        scratch_aligned -= offsetof(struct nft_pipapo_scratch, map);
-        align_off = scratch_aligned - (void *)scratch;
-
-        scratch = scratch_aligned;
-        scratch->align_off = align_off;
-#endif
+        local_lock_init(&scratch->bh_lock);
         *per_cpu_ptr(clone->scratch, i) = scratch;
     }
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 4a2ff85ce1c4..eaab422aa56a 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -124,14 +124,14 @@ struct nft_pipapo_field {

 /**
  * struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock:    PREEMPT_RT local spinlock
  * @map_index:  Current working bitmap index, toggled between field matches
- * @align_off:  Offset to get the originally allocated address
- * @map:        store partial matching results during lookup
+ * @__map:      store partial matching results during lookup
  */
 struct nft_pipapo_scratch {
+    local_lock_t bh_lock;
     u8 map_index;
-    u32 align_off;
-    unsigned long map[];
+    unsigned long __map[];
 };

 /**
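The scratch rework above drops the stored align_off and instead aligns the map with NFT_PIPAPO_LT_ALIGN() at every use, so kvfree() can be handed the original allocation pointer unchanged. A standalone userspace model of that "allocate with headroom, align at use" design; all names here are illustrative:

/* Rather than storing an offset to recover the original pointer for
 * free(), keep the allocation pointer as-is and align only when
 * touching the map.
 */
#include <stdint.h>
#include <stdlib.h>

#define ALIGN_BYTES 32    /* AVX2 wants 32-byte aligned bitmaps */

struct scratch {
    unsigned char meta;
    unsigned long map[];    /* may start misaligned */
};

static unsigned long *aligned_map(struct scratch *s)
{
    uintptr_t p = (uintptr_t)&s->map[0];

    return (unsigned long *)((p + ALIGN_BYTES - 1) &
                             ~(uintptr_t)(ALIGN_BYTES - 1));
}

/* The allocation reserves ALIGN_BYTES of headroom so aligning forward
 * never walks off the end; free() later gets the original pointer.
 */
static struct scratch *alloc_scratch(size_t map_longs)
{
    return malloc(sizeof(struct scratch) +
                  map_longs * sizeof(unsigned long) + ALIGN_BYTES);
}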
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index c0884fa68c79..7ff90325c97f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
         desc->field_count < NFT_PIPAPO_MIN_FIELDS)
         return false;

-    if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+    if (!boot_cpu_has(X86_FEATURE_AVX2))
         return false;

     est->size = pipapo_estimate_size(desc);
@@ -1133,75 +1133,59 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res)
 }

 /**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net:   Network namespace
- * @set:   nftables API set representation
- * @key:   nftables API element representation containing key data
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m:        Storage containing the set elements
+ * @data:     Key data to be matched against existing elements
+ * @genmask:  If set, check that element is active in given genmask
+ * @tstamp:   Timestamp to check for expired elements
  *
  * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
  *
  * This implementation exploits the repetitive characteristic of the algorithm
  * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
  *
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
  */
-const struct nft_set_ext *
-nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
-                       const u32 *key)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+                                        const u8 *data, u8 genmask,
+                                        u64 tstamp)
 {
-    struct nft_pipapo *priv = nft_set_priv(set);
-    const struct nft_set_ext *ext = NULL;
     struct nft_pipapo_scratch *scratch;
-    const struct nft_pipapo_match *m;
     const struct nft_pipapo_field *f;
-    const u8 *rp = (const u8 *)key;
-    unsigned long *res, *fill;
+    unsigned long *res, *fill, *map;
     bool map_index;
     int i;

-    local_bh_disable();
-
-    if (unlikely(!irq_fpu_usable())) {
-        ext = nft_pipapo_lookup(net, set, key);
+    scratch = *raw_cpu_ptr(m->scratch);
+    if (unlikely(!scratch))
+        return NULL;

-        local_bh_enable();
-        return ext;
-    }
+    __local_lock_nested_bh(&scratch->bh_lock);
+    map_index = scratch->map_index;
+    map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+    res = map + (map_index ? m->bsize_max : 0);
+    fill = map + (map_index ? 0 : m->bsize_max);

-    m = rcu_dereference(priv->match);
+    pipapo_resmap_init_avx2(m, res);

-    /* This also protects access to all data related to scratch maps.
-     *
-     * Note that we don't need a valid MXCSR state for any of the
+    /* Note that we don't need a valid MXCSR state for any of the
      * operations we use here, so pass 0 as mask and spare a LDMXCSR
      * instruction.
      */
     kernel_fpu_begin_mask(0);

-    scratch = *raw_cpu_ptr(m->scratch);
-    if (unlikely(!scratch)) {
-        kernel_fpu_end();
-        local_bh_enable();
-        return NULL;
-    }
-
-    map_index = scratch->map_index;
-
-    res = scratch->map + (map_index ? m->bsize_max : 0);
-    fill = scratch->map + (map_index ? 0 : m->bsize_max);
-
-    pipapo_resmap_init_avx2(m, res);
-
     nft_pipapo_avx2_prepare();

-next_match:
     nft_pipapo_for_each_field(f, i, m) {
         bool last = i == m->field_count - 1, first = !i;
         int ret = 0;

 #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)                          \
     (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,        \
-                                             ret, rp,             \
+                                             ret, data,           \
                                              first, last))

         if (likely(f->bb == 8)) {
@@ -1217,7 +1201,7 @@ next_match:
                 NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
             } else {
                 ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
-                                                  ret, rp,
+                                                  ret, data,
                                                   first, last);
             }
         } else {
@@ -1233,7 +1217,7 @@ next_match:
                 NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
             } else {
                 ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
-                                                  ret, rp,
+                                                  ret, data,
                                                   first, last);
             }
         }
@@ -1241,28 +1225,78 @@

 #undef NFT_SET_PIPAPO_AVX2_LOOKUP

-        if (ret < 0)
-            goto out;
+next_match:
+        if (ret < 0) {
+            scratch->map_index = map_index;
+            kernel_fpu_end();
+            __local_unlock_nested_bh(&scratch->bh_lock);
+            return NULL;
+        }

         if (last) {
-            const struct nft_set_ext *e = &f->mt[ret].e->ext;
+            struct nft_pipapo_elem *e;

-            if (unlikely(nft_set_elem_expired(e)))
+            e = f->mt[ret].e;
+            if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+                         !nft_set_elem_active(&e->ext, genmask))) {
+                ret = pipapo_refill(res, f->bsize, f->rules,
+                                    fill, f->mt, last);
                 goto next_match;
+            }

-            ext = e;
-            goto out;
+            scratch->map_index = map_index;
+            kernel_fpu_end();
+            __local_unlock_nested_bh(&scratch->bh_lock);
+            return e;
         }

+        map_index = !map_index;
         swap(res, fill);
-        rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+        data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
     }

-out:
-    if (i % 2)
-        scratch->map_index = !map_index;
     kernel_fpu_end();
+    __local_unlock_nested_bh(&scratch->bh_lock);
+    return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net:   Network namespace
+ * @set:   nftables API set representation
+ * @key:   nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+                       const u32 *key)
+{
+    struct nft_pipapo *priv = nft_set_priv(set);
+    const struct nft_pipapo_match *m;
+    const u8 *rp = (const u8 *)key;
+    const struct nft_pipapo_elem *e;
+
+    local_bh_disable();
+
+    if (unlikely(!irq_fpu_usable())) {
+        const struct nft_set_ext *ext;
+
+        ext = nft_pipapo_lookup(net, set, key);
+
+        local_bh_enable();
+        return ext;
+    }
+
+    m = rcu_dereference(priv->match);
+
+    e = pipapo_get_avx2(m, rp, 0, get_jiffies_64());
     local_bh_enable();
-    return ext;
+    return e ? &e->ext : NULL;
 }
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index dbb6aaca8a7a..c2999b63da3f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -5,8 +5,12 @@
 #include <asm/fpu/xstate.h>
 #define NFT_PIPAPO_ALIGN    (XSAVE_YMM_SIZE / BITS_PER_BYTE)

+struct nft_pipapo_match;
 bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
                               struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+                                        const u8 *data, u8 genmask,
+                                        u64 tstamp);
 #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */

 #endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index b1f04168ec93..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
     return NULL;
 }

-static void nft_rbtree_walk(const struct nft_ctx *ctx,
-                            struct nft_set *set,
-                            struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+                               struct nft_set *set,
+                               struct nft_set_iter *iter)
 {
     struct nft_rbtree *priv = nft_set_priv(set);
     struct nft_rbtree_elem *rbe;
     struct rb_node *node;

-    read_lock_bh(&priv->lock);
     for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
         rbe = rb_entry(node, struct nft_rbtree_elem, node);

@@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
             goto cont;

         iter->err = iter->fn(ctx, set, iter, &rbe->priv);
-        if (iter->err < 0) {
-            read_unlock_bh(&priv->lock);
+        if (iter->err < 0)
             return;
-        }
 cont:
         iter->count++;
     }
-    read_unlock_bh(&priv->lock);
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+                            struct nft_set *set,
+                            struct nft_set_iter *iter)
+{
+    struct nft_rbtree *priv = nft_set_priv(set);
+
+    switch (iter->type) {
+    case NFT_ITER_UPDATE:
+        lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+        nft_rbtree_do_walk(ctx, set, iter);
+        break;
+    case NFT_ITER_READ:
+        read_lock_bh(&priv->lock);
+        nft_rbtree_do_walk(ctx, set, iter);
+        read_unlock_bh(&priv->lock);
+        break;
+    default:
+        iter->err = -EINVAL;
+        WARN_ON_ONCE(1);
+        break;
+    }
+}

 static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
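The pipapo scratch area above is now guarded by a local_lock_t, which compiles away on !PREEMPT_RT (BH-disable already excludes reentry there) but becomes a real per-CPU lock on PREEMPT_RT, where BH-disabled code is preemptible. A condensed model of that access pattern, with simplified stand-in types; callers are assumed to run with BHs disabled, matching the _nested_bh() variants used in the patch:

/* Per-CPU scratch guarded by a local_lock_t, as in the pipapo change. */
#include <linux/local_lock.h>
#include <linux/percpu.h>

struct scratch_area {
    local_lock_t bh_lock;
    unsigned long map[8];    /* stand-in for the real flexible array */
};

static DEFINE_PER_CPU(struct scratch_area, scratch_area) = {
    .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Caller must have BHs disabled, hence the _nested_bh() lock variant. */
static unsigned long peek_scratch(void)
{
    unsigned long first;

    local_lock_nested_bh(&scratch_area.bh_lock);
    first = this_cpu_ptr(&scratch_area)->map[0];
    local_unlock_nested_bh(&scratch_area.bh_lock);
    return first;
}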