-rw-r--r--  include/net/netfilter/nf_conntrack.h    |  18
-rw-r--r--  include/net/netfilter/nf_flow_table.h   |   1
-rw-r--r--  include/net/netfilter/nf_tables.h       |  10
-rw-r--r--  net/bridge/br_netfilter_hooks.c         |  30
-rw-r--r--  net/netfilter/nf_conntrack_amanda.c     |   2
-rw-r--r--  net/netfilter/nf_conntrack_broadcast.c  |   2
-rw-r--r--  net/netfilter/nf_conntrack_core.c       |  13
-rw-r--r--  net/netfilter/nf_conntrack_h323_main.c  |   4
-rw-r--r--  net/netfilter/nf_conntrack_sip.c        |   4
-rw-r--r--  net/netfilter/nf_flow_table_core.c      | 187
-rw-r--r--  net/netfilter/nf_flow_table_ip.c        |  14
-rw-r--r--  net/netfilter/nf_tables_api.c           | 123
-rw-r--r--  net/netfilter/nft_chain_filter.c        |  48
-rw-r--r--  net/netfilter/nft_ct.c                  |   2
-rw-r--r--  net/netfilter/nft_flow_offload.c        |  16
-rw-r--r--  net/netfilter/nft_set_rbtree.c          |  43
16 files changed, 332 insertions(+), 185 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index cba3ccf03fcc..3f02a45773e8 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -204,8 +204,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple);
void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
- u32 extra_jiffies, bool do_acct);
+ u32 extra_jiffies, unsigned int bytes);
/* Refresh conntrack for this many jiffies and do accounting */
static inline void nf_ct_refresh_acct(struct nf_conn *ct,
@@ -213,15 +212,14 @@ static inline void nf_ct_refresh_acct(struct nf_conn *ct,
const struct sk_buff *skb,
u32 extra_jiffies)
{
- __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true);
+ __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len);
}
/* Refresh conntrack for this many jiffies */
static inline void nf_ct_refresh(struct nf_conn *ct,
- const struct sk_buff *skb,
u32 extra_jiffies)
{
- __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false);
+ __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0);
}
/* kill conntrack and do accounting */
@@ -314,16 +312,6 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct)
#define NF_CT_DAY (86400 * HZ)
-/* Set an arbitrary timeout large enough not to ever expire, this save
- * us a check for the IPS_OFFLOAD_BIT from the packet path via
- * nf_ct_is_expired().
- */
-static inline void nf_ct_offload_timeout(struct nf_conn *ct)
-{
- if (nf_ct_expires(ct) < NF_CT_DAY / 2)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY);
-}
-
struct kernel_param;
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index b63d53bb9dd6..d711642e78b5 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -163,6 +163,7 @@ struct flow_offload_tuple_rhash {
enum nf_flow_flags {
NF_FLOW_SNAT,
NF_FLOW_DNAT,
+ NF_FLOW_CLOSING,
NF_FLOW_TEARDOWN,
NF_FLOW_HW,
NF_FLOW_HW_DYING,
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 0027beca5cd5..60d5dcdb289c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -442,6 +442,9 @@ struct nft_set_ext;
* @remove: remove element from set
* @walk: iterate over all set elements
* @get: get set elements
+ * @ksize: convert userspace set size to kernel set size
+ * @usize: convert kernel set size to userspace set size
+ * @adjust_maxsize: delta to adjust maximum set size
* @commit: commit set elements
* @abort: abort set elements
* @privsize: function to return size of set private data
@@ -495,6 +498,9 @@ struct nft_set_ops {
const struct nft_set *set,
const struct nft_set_elem *elem,
unsigned int flags);
+ u32 (*ksize)(u32 size);
+ u32 (*usize)(u32 size);
+ u32 (*adjust_maxsize)(const struct nft_set *set);
void (*commit)(struct nft_set *set);
void (*abort)(const struct nft_set *set);
u64 (*privsize)(const struct nlattr * const nla[],
@@ -1195,6 +1201,8 @@ struct nft_hook {
struct list_head list;
struct nf_hook_ops ops;
struct rcu_head rcu;
+ char ifname[IFNAMSIZ];
+ u8 ifnamelen;
};
/**
@@ -1230,8 +1238,6 @@ static inline bool nft_is_base_chain(const struct nft_chain *chain)
return chain->flags & NFT_CHAIN_BASE;
}
-int __nft_release_basechain(struct nft_ctx *ctx);
-
unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
static inline bool nft_use_inc(u32 *use)
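The new ifname/ifnamelen fields change how netdev hooks are identified: by the
configured interface name rather than by a live net_device pointer (see the
strcmp()-based nft_hook_list_find() further down). A minimal userspace sketch
of that identity rule follows; the struct and helper names are illustrative,
not the kernel API:

    #include <stdio.h>
    #include <string.h>

    #define IFNAMSIZ 16

    struct hook {
        char ifname[IFNAMSIZ];   /* configured interface name */
        unsigned char ifnamelen; /* attribute length from netlink */
    };

    /* Match hooks by configured name, as the reworked
     * nft_hook_list_find() does, so a hook stays addressable even
     * when no device with that name currently exists. */
    static int hook_matches(const struct hook *a, const struct hook *b)
    {
        return strcmp(a->ifname, b->ifname) == 0;
    }

    int main(void)
    {
        struct hook a = { .ifname = "eth0", .ifnamelen = 5 };
        struct hook b = { .ifname = "eth0", .ifnamelen = 5 };

        printf("match: %d\n", hook_matches(&a, &b)); /* match: 1 */
        return 0;
    }
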
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 451e45b9a6a5..94cbe967d1c1 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -393,38 +393,10 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
reason = ip_route_input(skb, iph->daddr, iph->saddr,
ip4h_dscp(iph), dev);
if (reason) {
- struct in_device *in_dev = __in_dev_get_rcu(dev);
-
- /* If err equals -EHOSTUNREACH the error is due to a
- * martian destination or due to the fact that
- * forwarding is disabled. For most martian packets,
- * ip_route_output_key() will fail. It won't fail for 2 types of
- * martian destinations: loopback destinations and destination
- * 0.0.0.0. In both cases the packet will be dropped because the
- * destination is the loopback device and not the bridge. */
- if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev ||
- IN_DEV_FORWARD(in_dev))
- goto free_skb;
-
- rt = ip_route_output(net, iph->daddr, 0,
- ip4h_dscp(iph), 0,
- RT_SCOPE_UNIVERSE);
- if (!IS_ERR(rt)) {
- /* - Bridged-and-DNAT'ed traffic doesn't
- * require ip_forwarding. */
- if (rt->dst.dev == dev) {
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- goto bridged_dnat;
- }
- ip_rt_put(rt);
- }
-free_skb:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
} else {
if (skb_dst(skb)->dev == dev) {
-bridged_dnat:
skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index d011d2eb0848..7be4c35e4795 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- nf_ct_refresh(ct, skb, master_timeout * HZ);
+ nf_ct_refresh(ct, master_timeout * HZ);
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index cfa0fe0356de..a7552a46d6ac 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -75,7 +75,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
- nf_ct_refresh(ct, skb, timeout * HZ);
+ nf_ct_refresh(ct, timeout * HZ);
out:
return NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 456446d7af20..7f8b245e287a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1544,12 +1544,6 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- if (!nf_conntrack_max95)
- continue;
- }
-
if (expired_count > GC_SCAN_EXPIRED_MAX) {
rcu_read_unlock();
@@ -2089,9 +2083,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_in);
-/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+/* Refresh conntrack for this many jiffies and do accounting if bytes != 0 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
u32 extra_jiffies,
- bool do_acct)
+ unsigned int bytes)
{
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
@@ -2104,8 +2097,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (READ_ONCE(ct->timeout) != extra_jiffies)
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
- if (do_acct)
- nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
+ if (bytes)
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
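Taken together with the header change above, the refresh helpers now encode
the accounting decision in the byte count itself: nf_ct_refresh_acct() passes
skb->len, nf_ct_refresh() passes 0. A minimal userspace model of the new
convention; it deliberately omits the fixed-timeout and timeout-unchanged
checks the kernel performs:

    #include <stdio.h>

    struct conn {
        unsigned long timeout;
        unsigned long long acct_bytes;
        unsigned long long acct_packets;
    };

    /* Model of the reworked __nf_ct_refresh_acct(): bytes == 0 means
     * "refresh only", bytes > 0 also updates accounting counters. */
    static void refresh_acct(struct conn *ct, unsigned long now,
                             unsigned long extra_jiffies, unsigned int bytes)
    {
        ct->timeout = now + extra_jiffies;
        if (bytes) {
            ct->acct_bytes += bytes;
            ct->acct_packets++;
        }
    }

    int main(void)
    {
        struct conn ct = { 0 };

        refresh_acct(&ct, 1000, 300, 1500); /* nf_ct_refresh_acct() path */
        refresh_acct(&ct, 1300, 300, 0);    /* nf_ct_refresh() path */
        printf("bytes=%llu packets=%llu timeout=%lu\n",
               ct.acct_bytes, ct.acct_packets, ct.timeout);
        return 0;
    }
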
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5a9bce24f3c3..14f73872f647 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
if (info->timeout > 0) {
pr_debug("nf_ct_ras: set RAS connection timeout to "
"%u seconds\n", info->timeout);
- nf_ct_refresh(ct, skb, info->timeout * HZ);
+ nf_ct_refresh(ct, info->timeout * HZ);
/* Set expect timeout */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
info->sig_port[!dir] = 0;
/* Give it 30 seconds for UCF or URJ */
- nf_ct_refresh(ct, skb, 30 * HZ);
+ nf_ct_refresh(ct, 30 * HZ);
return 0;
}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index d0eac27f6ba0..ca748f8dbff1 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index df72b0376970..9d8361526f82 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -161,42 +161,86 @@ void flow_offload_route_init(struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
+
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
{
+ struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+ spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ /* Conntrack state is outdated due to offload bypass.
+ * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
+ * TCP reset validation will fail.
+ */
tcp->seen[0].td_maxwin = 0;
+ tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
tcp->seen[1].td_maxwin = 0;
+ tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ spin_unlock_bh(&ct->lock);
}
-static void flow_offload_fixup_ct(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct flow_offload *flow)
{
+ struct nf_conn *ct = flow->ct;
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
s32 timeout;
if (l4num == IPPROTO_TCP) {
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
-
- flow_offload_fixup_tcp(&ct->proto.tcp);
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
+
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
- timeout = tn->timeouts[ct->proto.tcp.state];
- timeout -= tn->offload_timeout;
} else if (l4num == IPPROTO_UDP) {
- struct nf_udp_net *tn = nf_udp_pernet(net);
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
enum udp_conntrack state =
test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
UDP_CT_REPLIED : UDP_CT_UNREPLIED;
- timeout = tn->timeouts[state];
- timeout -= tn->offload_timeout;
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
} else {
return;
}
+ if (expired)
+ timeout -= offload_timeout;
+
if (timeout < 0)
timeout = 0;
- if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
@@ -294,7 +338,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
- nf_ct_offload_timeout(flow->ct);
+ nf_ct_refresh(flow->ct, NF_CT_DAY);
if (nf_flowtable_hw_offload(flow_table)) {
__set_bit(NF_FLOW_HW, &flow->flags);
@@ -316,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
else
return;
- if (likely(!nf_flowtable_hw_offload(flow_table)))
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
return;
nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return nf_flow_timeout_delta(flow->timeout) <= 0;
-}
-
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
@@ -344,7 +384,7 @@ void flow_offload_teardown(struct flow_offload *flow)
{
clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
- flow_offload_fixup_ct(flow->ct);
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -414,15 +454,116 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
return flow_table->type->gc && flow_table->type->gc(flow);
}
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return: timeout in jiffies until the ct entry should expire.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
+{
+ u8 state = READ_ONCE(ct->proto.tcp.state);
+
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire, this saves us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
+{
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
struct flow_offload *flow, void *data)
{
+ bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
+
if (nf_flow_has_expired(flow) ||
nf_ct_is_dying(flow->ct) ||
nf_flow_custom_gc(flow_table, flow))
flow_offload_teardown(flow);
+ else if (!teardown)
+ nf_flow_table_extend_ct_timeout(flow->ct);
- if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (teardown) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
nf_flow_offload_del(flow_table, flow);
@@ -431,6 +572,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
} else {
flow_offload_del(flow_table, flow);
}
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
nf_flow_offload_stats(flow_table, flow);
}
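The gc step now extends ct timeouts from a single place, and for TCP the
extension depends on the conntrack state. A small userspace model of the
nf_flow_table_tcp_timeout() mapping, assuming HZ=100 for the sake of the
example; the enum names are stand-ins for the TCP_CONNTRACK_* constants:

    #include <stdio.h>

    #define HZ 100
    #define NF_CT_DAY (86400 * HZ)

    enum tcp_ct_state { SYN_SENT, SYN_RECV, ESTABLISHED, FIN_WAIT,
                        CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE };

    /* Model of nf_flow_table_tcp_timeout(): established flows get a
     * day, half-closed flows five minutes, everything else gets no
     * extension (0). Values are jiffies, not seconds. */
    static unsigned int tcp_offload_timeout(enum tcp_ct_state state)
    {
        switch (state) {
        case ESTABLISHED:
            return NF_CT_DAY;
        case FIN_WAIT:
        case CLOSE_WAIT:
        case LAST_ACK:
        case TIME_WAIT:
            return 5 * 60 * HZ;
        default:
            return 0;
        }
    }

    int main(void)
    {
        printf("established: %u jiffies\n", tcp_offload_timeout(ESTABLISHED));
        printf("fin_wait:    %u jiffies\n", tcp_offload_timeout(FIN_WAIT));
        return 0;
    }

The kernel then publishes the new deadline with cmpxchg() against the expiry
value it sampled, so a concurrent update from the conntrack datapath wins.
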
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 98edcaa37b38..97c6eb8847a0 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
@@ -377,8 +381,10 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
- if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) {
+ flow_offload_teardown(flow);
return 0;
+ }
iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
thoff = (iph->ihl * 4) + ctx->offset;
@@ -656,8 +662,10 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
- if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) {
+ flow_offload_teardown(flow);
return 0;
+ }
ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
thoff = sizeof(*ip6h) + ctx->offset;
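With NF_FLOW_CLOSING, a fin or rst no longer tears the flow down on the spot;
it only marks the flow so the gc worker can hand the closing state back to
conntrack, while a later syn (TCP reopen) still forces an immediate teardown.
A minimal sketch of that decision logic in plain C, with booleans standing in
for the flow flag bits:

    #include <stdbool.h>
    #include <stdio.h>

    struct flow {
        bool closing;  /* NF_FLOW_CLOSING */
        bool teardown; /* NF_FLOW_TEARDOWN */
    };

    /* Model of the reworked nf_flow_state_check(): fin/rst marks the
     * flow closing, syn on a closing flow triggers teardown so
     * conntrack can handle the TCP reopen. */
    static void state_check(struct flow *f, bool syn, bool fin, bool rst)
    {
        if (syn && f->closing) {
            f->teardown = true;
            return;
        }
        if ((fin || rst) && !f->closing)
            f->closing = true;
    }

    int main(void)
    {
        struct flow f = { 0 };

        state_check(&f, false, true, false); /* FIN seen */
        state_check(&f, true, false, false); /* SYN: reopen -> teardown */
        printf("closing=%d teardown=%d\n", f.closing, f.teardown);
        return 0;
    }
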
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 83f3face8bb3..667459256e4c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1956,15 +1956,16 @@ static int nft_dump_basechain_hook(struct sk_buff *skb,
if (!first)
first = hook;
- if (nla_put_string(skb, NFTA_DEVICE_NAME,
- hook->ops.dev->name))
+ if (nla_put(skb, NFTA_DEVICE_NAME,
+ hook->ifnamelen, hook->ifname))
goto nla_put_failure;
n++;
}
nla_nest_end(skb, nest_devs);
if (n == 1 &&
- nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+ nla_put(skb, NFTA_HOOK_DEV,
+ first->ifnamelen, first->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -2276,7 +2277,6 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
const struct nlattr *attr)
{
struct net_device *dev;
- char ifname[IFNAMSIZ];
struct nft_hook *hook;
int err;
@@ -2286,12 +2286,17 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
goto err_hook_alloc;
}
- nla_strscpy(ifname, attr, IFNAMSIZ);
+ err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
+ if (err < 0)
+ goto err_hook_dev;
+
+ hook->ifnamelen = nla_len(attr);
+
/* nf_tables_netdev_event() is called under rtnl_mutex, this is
* indirectly serializing all the other holders of the commit_mutex with
* the rtnl_mutex.
*/
- dev = __dev_get_by_name(net, ifname);
+ dev = __dev_get_by_name(net, hook->ifname);
if (!dev) {
err = -ENOENT;
goto err_hook_dev;
@@ -2312,7 +2317,7 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
- if (this->ops.dev == hook->ops.dev)
+ if (!strcmp(hook->ifname, this->ifname))
return hook;
}
@@ -4752,6 +4757,14 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb,
return 0;
}
+static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
+{
+ if (ops->usize)
+ return ops->usize(size);
+
+ return size;
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -4822,7 +4835,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (!nest)
goto nla_put_failure;
if (set->size &&
- nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE,
+ htonl(nft_set_userspace_size(set->ops, set->size))))
goto nla_put_failure;
if (set->field_count > 1 &&
@@ -5190,6 +5204,15 @@ static bool nft_set_is_same(const struct nft_set *set,
return true;
}
+static u32 nft_set_kernel_size(const struct nft_set_ops *ops,
+ const struct nft_set_desc *desc)
+{
+ if (ops->ksize)
+ return ops->ksize(desc->size);
+
+ return desc->size;
+}
+
static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
@@ -5372,6 +5395,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (err < 0)
return err;
+ if (desc.size)
+ desc.size = nft_set_kernel_size(set->ops, &desc);
+
err = 0;
if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
@@ -5394,6 +5420,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (IS_ERR(ops))
return PTR_ERR(ops);
+ if (desc.size)
+ desc.size = nft_set_kernel_size(ops, &desc);
+
udlen = 0;
if (nla[NFTA_SET_USERDATA])
udlen = nla_len(nla[NFTA_SET_USERDATA]);
@@ -7050,6 +7079,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set,
return true;
}
+static u32 nft_set_maxsize(const struct nft_set *set)
+{
+ u32 maxsize, delta;
+
+ if (!set->size)
+ return UINT_MAX;
+
+ if (set->ops->adjust_maxsize)
+ delta = set->ops->adjust_maxsize(set);
+ else
+ delta = 0;
+
+ if (check_add_overflow(set->size, set->ndeact, &maxsize))
+ return UINT_MAX;
+
+ if (check_add_overflow(maxsize, delta, &maxsize))
+ return UINT_MAX;
+
+ return maxsize;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
@@ -7422,7 +7472,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
if (!(flags & NFT_SET_ELEM_CATCHALL)) {
- unsigned int max = set->size ? set->size + set->ndeact : UINT_MAX;
+ unsigned int max = nft_set_maxsize(set);
if (!atomic_add_unless(&set->nelems, 1, max)) {
err = -ENFILE;
@@ -8850,7 +8900,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
struct list_head *hook_list,
struct nft_flowtable *flowtable)
{
- struct nft_hook *hook, *hook2, *next;
+ struct nft_hook *hook, *next;
struct nft_flowtable *ft;
int err, i = 0;
@@ -8859,12 +8909,9 @@ static int nft_register_flowtable_net_hooks(struct net *net,
if (!nft_is_active_next(net, ft))
continue;
- list_for_each_entry(hook2, &ft->hook_list, list) {
- if (hook->ops.dev == hook2->ops.dev &&
- hook->ops.pf == hook2->ops.pf) {
- err = -EEXIST;
- goto err_unregister_net_hooks;
- }
+ if (nft_hook_list_find(&ft->hook_list, hook)) {
+ err = -EEXIST;
+ goto err_unregister_net_hooks;
}
}
@@ -9278,7 +9325,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
list_for_each_entry_rcu(hook, hook_list, list,
lockdep_commit_lock_is_held(net)) {
- if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
+ if (nla_put(skb, NFTA_DEVICE_NAME,
+ hook->ifnamelen, hook->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -11693,47 +11741,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-static void __nft_release_basechain_now(struct nft_ctx *ctx)
-{
- struct nft_rule *rule, *nr;
-
- list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
- list_del(&rule->list);
- nf_tables_rule_release(ctx, rule);
- }
- nf_tables_chain_destroy(ctx->chain);
-}
-
-int __nft_release_basechain(struct nft_ctx *ctx)
-{
- struct nft_rule *rule;
-
- if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain)))
- return 0;
-
- nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
- list_for_each_entry(rule, &ctx->chain->rules, list)
- nft_use_dec(&ctx->chain->use);
-
- nft_chain_del(ctx->chain);
- nft_use_dec(&ctx->table->use);
-
- if (!maybe_get_net(ctx->net)) {
- __nft_release_basechain_now(ctx);
- return 0;
- }
-
- /* wait for ruleset dumps to complete. Owning chain is no longer in
- * lists, so new dumps can't find any of these rules anymore.
- */
- synchronize_rcu();
-
- __nft_release_basechain_now(ctx);
- put_net(ctx->net);
- return 0;
-}
-EXPORT_SYMBOL_GPL(__nft_release_basechain);
-
static void __nft_release_hook(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable;
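nft_set_maxsize() saturates instead of wrapping when the configured size,
pending deactivations and the backend-specific delta are summed. The
arithmetic can be modeled in userspace with __builtin_add_overflow(), the
compiler primitive underneath the kernel's check_add_overflow():

    #include <limits.h>
    #include <stdio.h>

    /* Model of nft_set_maxsize(): size 0 means unbounded; otherwise
     * add ndeact and delta, saturating at UINT_MAX on overflow. */
    static unsigned int set_maxsize(unsigned int size, unsigned int ndeact,
                                    unsigned int delta)
    {
        unsigned int max;

        if (!size)
            return UINT_MAX;
        if (__builtin_add_overflow(size, ndeact, &max))
            return UINT_MAX;
        if (__builtin_add_overflow(max, delta, &max))
            return UINT_MAX;
        return max;
    }

    int main(void)
    {
        printf("%u\n", set_maxsize(100, 2, 1));          /* 103 */
        printf("%u\n", set_maxsize(UINT_MAX - 1, 2, 0)); /* saturates */
        return 0;
    }
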
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 7010541fcca6..19a553550c76 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -319,37 +319,21 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
};
static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_ctx *ctx)
+ struct nft_base_chain *basechain)
{
- struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
- struct nft_hook *hook, *found = NULL;
- int n = 0;
+ struct nft_hook *hook;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev == dev)
- found = hook;
-
- n++;
- }
- if (!found)
- return;
+ if (hook->ops.dev != dev)
+ continue;
- if (n > 1) {
- if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT))
- nf_unregister_net_hook(ctx->net, &found->ops);
+ if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), &hook->ops);
- list_del_rcu(&found->list);
- kfree_rcu(found, rcu);
- return;
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ break;
}
-
- /* UNREGISTER events are also happening on netns exit.
- *
- * Although nf_tables core releases all tables/chains, only this event
- * handler provides guarantee that hook->ops.dev is still accessible,
- * so we cannot skip exiting net namespaces.
- */
- __nft_release_basechain(ctx);
}
static int nf_tables_netdev_event(struct notifier_block *this,
@@ -358,25 +342,20 @@ static int nf_tables_netdev_event(struct notifier_block *this,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_base_chain *basechain;
struct nftables_pernet *nft_net;
- struct nft_chain *chain, *nr;
+ struct nft_chain *chain;
struct nft_table *table;
- struct nft_ctx ctx = {
- .net = dev_net(dev),
- };
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- nft_net = nft_pernet(ctx.net);
+ nft_net = nft_pernet(dev_net(dev));
mutex_lock(&nft_net->commit_mutex);
list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV &&
table->family != NFPROTO_INET)
continue;
- ctx.family = table->family;
- ctx.table = table;
- list_for_each_entry_safe(chain, nr, &table->chains, list) {
+ list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_base_chain(chain))
continue;
@@ -385,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
basechain->ops.hooknum != NF_INET_INGRESS)
continue;
- ctx.chain = chain;
- nft_netdev_event(event, dev, &ctx);
+ nft_netdev_event(event, dev, basechain);
}
}
mutex_unlock(&nft_net->commit_mutex);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 67a41cd2baaf..2e59aba681a1 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -929,7 +929,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj,
*/
values = nf_ct_timeout_data(timeout);
if (values)
- nf_ct_refresh(ct, pkt->skb, values[0]);
+ nf_ct_refresh(ct, values[0]);
}
static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 3b474d235663..221d50223018 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -289,6 +289,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
return false;
}
+static void flow_offload_ct_tcp(struct nf_conn *ct)
+{
+ /* conntrack will not see all packets, disable tcp window validation. */
+ spin_lock_bh(&ct->lock);
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ spin_unlock_bh(&ct->lock);
+}
+
static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -356,11 +365,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
goto err_flow_alloc;
flow_offload_route_init(flow, &route);
-
- if (tcph) {
- ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- }
+ if (tcph)
+ flow_offload_ct_tcp(ct);
__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
ret = flow_offload_add(flowtable, flow);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index b7ea21327549..2e8ef16ff191 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -750,6 +750,46 @@ static void nft_rbtree_gc_init(const struct nft_set *set)
priv->last_gc = jiffies;
}
+/* rbtree stores ranges as singleton elements, each range is composed of two
+ * elements ...
+ */
+static u32 nft_rbtree_ksize(u32 size)
+{
+ return size * 2;
+}
+
+/* ... hide this detail from userspace. */
+static u32 nft_rbtree_usize(u32 size)
+{
+ if (!size)
+ return 0;
+
+ return size / 2;
+}
+
+static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *node;
+ const void *key;
+
+ node = rb_last(&priv->root);
+ if (!node)
+ return 0;
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (!nft_rbtree_interval_end(rbe))
+ return 0;
+
+ key = nft_set_ext_key(&rbe->ext);
+ if (memchr(key, 1, set->klen))
+ return 0;
+
+ /* this is the all-zero no-match element. */
+ return 1;
+}
+
const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
@@ -768,5 +808,8 @@ const struct nft_set_type nft_set_rbtree_type = {
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
.get = nft_rbtree_get,
+ .ksize = nft_rbtree_ksize,
+ .usize = nft_rbtree_usize,
+ .adjust_maxsize = nft_rbtree_adjust_maxsize,
},
};
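
Since the rbtree backend spends two nodes per range (a start and an end
element), the new ksize()/usize() callbacks keep the kernel and userspace
views of the set size consistent. A tiny userspace demonstration of the
round trip:

    #include <stdio.h>

    /* Model of nft_rbtree_ksize()/nft_rbtree_usize(): the kernel
     * doubles the configured size and halves it again when reporting
     * back, hiding the two-nodes-per-range detail from userspace. */
    static unsigned int rbtree_ksize(unsigned int size) { return size * 2; }

    static unsigned int rbtree_usize(unsigned int size)
    {
        return size ? size / 2 : 0;
    }

    int main(void)
    {
        unsigned int user = 1000;
        unsigned int kernel = rbtree_ksize(user);

        /* round-trips back to the size userspace configured */
        printf("user=%u kernel=%u back=%u\n",
               user, kernel, rbtree_usize(kernel));
        return 0;
    }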