diff options
| author | Paolo Abeni <pabeni@redhat.com> | 2025-05-26 18:53:40 +0200 | 
|---|---|---|
| committer | Paolo Abeni <pabeni@redhat.com> | 2025-05-26 18:53:41 +0200 | 
| commit | f5b60d6a575a7573a15e08aad129382aa39c228c (patch) | |
| tree | c1772462a9ed99597523e254c57a77ed27aa9c50 | |
| parent | fdb061195f53e5b6d12595fc32a1a9c1130f0c23 (diff) | |
| parent | 73db1b5dab6fe17baf9fe2b0d7c8dfd1d4a5b3e5 (diff) | |
Merge tag 'nf-next-25-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says:
====================
Netfilter updates for net-next
The following batch contains Netfilter updates for net-next,
specifically 26 patches: 5 patches adding/updating selftests,
4 fixes, 3 PREEMPT_RT fixes, and 14 patches to enhance nf_tables:
1) Improve selftest coverage for pipapo 4 bit group format, from
   Florian Westphal.
2) Fix incorrect dependencies when compiling a kernel without
   legacy ip{6}tables support, also from Florian.
3) Two patches to fix nft_fib vrf issues, including selftest updates
   to improve coverage, also from Florian Westphal.
4) Fix incorrect nesting in nft_tunnel's GENEVE support, from
   Fernando F. Mancera.
5) Three patches to fix PREEMPT_RT issues in the nf_dup infrastructure
   and in nft_inner (matching on inner headers), from Sebastian Andrzej Siewior.
6) Integrate conntrack information into nft trace infrastructure,
   from Florian Westphal.
7) A series of 13 patches that allow specifying wildcard netdevices in
   netdev basechains and flowtables, e.g.:
   table netdev filter {
       chain ingress {
           type filter hook ingress devices = { eth0, eth1, vlan* } priority 0; policy accept;
       }
   }
   This also allows for runtime hook registration on NETDEV_{UN}REGISTER
   events, from Phil Sutter.
netfilter pull request 25-05-23
* tag 'nf-next-25-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: (26 commits)
  selftests: netfilter: Torture nftables netdev hooks
  netfilter: nf_tables: Add notifications for hook changes
  netfilter: nf_tables: Support wildcard netdev hook specs
  netfilter: nf_tables: Sort labels in nft_netdev_hook_alloc()
  netfilter: nf_tables: Handle NETDEV_CHANGENAME events
  netfilter: nf_tables: Wrap netdev notifiers
  netfilter: nf_tables: Respect NETDEV_REGISTER events
  netfilter: nf_tables: Prepare for handling NETDEV_REGISTER events
  netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook
  netfilter: nf_tables: Pass nf_hook_ops to nft_unregister_flowtable_hook()
  netfilter: nf_tables: Introduce nft_register_flowtable_ops()
  netfilter: nf_tables: Introduce nft_hook_find_ops{,_rcu}()
  netfilter: nf_tables: Introduce functions freeing nft_hook objects
  netfilter: nf_tables: add packets conntrack state to debug trace info
  netfilter: conntrack: make nf_conntrack_id callable without a module dependency
  netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit
  netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx
  netfilter: nf_dup{4, 6}: Move duplication check to task_struct
  netfilter: nft_tunnel: fix geneve_opt dump
  selftests: netfilter: nft_fib.sh: add type and oif tests with and without VRFs
  ...
====================
Link: https://patch.msgid.link/20250523132712.458507-1-pablo@netfilter.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
31 files changed, 1504 insertions, 230 deletions
| diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 848735b3a7c0..813a19122ebb 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit {  #if IS_ENABLED(CONFIG_NET_ACT_MIRRED)  	u8 sched_mirred_nest;  #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) +	u8 nf_dup_skb_recursion; +#endif  };  #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ad..5f896fcc074d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -95,6 +95,9 @@ enum nf_hook_ops_type {  };  struct nf_hook_ops { +	struct list_head	list; +	struct rcu_head		rcu; +  	/* User fills in from here down. */  	nf_hookfn		*hook;  	struct net_device	*dev; @@ -470,6 +473,7 @@ struct nf_ct_hook {  	void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);  	void (*set_closing)(struct nf_conntrack *nfct);  	int (*confirm)(struct sk_buff *skb); +	u32 (*get_id)(const struct nf_conntrack *nfct);  };  extern const struct nf_ct_hook __rcu *nf_ct_hook; @@ -498,17 +502,6 @@ extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;  extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;  /* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - -/*   * Contains bitmask of ctnetlink event subscribers, if any.   * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.   
*/ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..52d9c52dc8f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct {  	/* delay due to memory thrashing */  	unsigned                        in_thrashing:1;  #endif +	unsigned			in_nf_duplicate:1;  #ifdef CONFIG_PREEMPT_RT  	struct netdev_xmit		net_xmit;  #endif diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..e4d8e451e935 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);  int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);  void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +struct nft_hook; +void nf_tables_chain_device_notify(const struct nft_chain *chain, +				   const struct nft_hook *hook, +				   const struct net_device *dev, int event); +  enum nft_chain_types {  	NFT_CHAIN_T_DEFAULT = 0,  	NFT_CHAIN_T_ROUTE, @@ -1199,12 +1204,17 @@ struct nft_stats {  struct nft_hook {  	struct list_head	list; -	struct nf_hook_ops	ops; +	struct list_head	ops_list;  	struct rcu_head		rcu;  	char			ifname[IFNAMSIZ];  	u8			ifnamelen;  }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, +				      const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, +					  const struct net_device *dev); +  /**   *	struct nft_base_chain - nf_tables base chain   * diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 6e202ed5e63f..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@  #ifndef _NFT_FIB_H_  #define _NFT_FIB_H_ +#include <net/l3mdev.h>  #include <net/netfilter/nf_tables.h>  struct nft_fib { @@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const 
struct nft_pktinfo *pkt)  	return nft_fib_is_loopback(pkt->skb, indev);  } +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, +						    const struct net_device *iif) +{ +	const struct net_device *dev = iif ? iif : pkt->skb->dev; + +	return l3mdev_master_ifindex_rcu(dev); +} +  int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset);  int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,  		 const struct nlattr * const tb[]); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7d6bc19a0153..518ba144544c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,6 +142,8 @@ enum nf_tables_msg_types {  	NFT_MSG_DESTROYOBJ,  	NFT_MSG_DESTROYFLOWTABLE,  	NFT_MSG_GETSETELEM_RESET, +	NFT_MSG_NEWDEV, +	NFT_MSG_DELDEV,  	NFT_MSG_MAX,  }; @@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes {   * enum nft_device_attributes - nf_tables device netlink attributes   *   * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) + * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING)   */  enum nft_devices_attributes {  	NFTA_DEVICE_UNSPEC,  	NFTA_DEVICE_NAME, +	NFTA_DEVICE_TABLE, +	NFTA_DEVICE_FLOWTABLE, +	NFTA_DEVICE_CHAIN, +	NFTA_DEVICE_SPEC,  	__NFTA_DEVICE_MAX  };  #define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1) @@ -1841,6 +1851,10 @@ enum nft_xfrm_keys {   * @NFTA_TRACE_MARK: nfmark (NLA_U32)   * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32)   * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + * @NFTA_TRACE_CT_ID: conntrack id (NLA_U32) + * @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8) + * @NFTA_TRACE_CT_STATUS: conntrack 
status (NLA_U32) + * @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32)   */  enum nft_trace_attributes {  	NFTA_TRACE_UNSPEC, @@ -1861,6 +1875,10 @@ enum nft_trace_attributes {  	NFTA_TRACE_NFPROTO,  	NFTA_TRACE_POLICY,  	NFTA_TRACE_PAD, +	NFTA_TRACE_CT_ID, +	NFTA_TRACE_CT_DIRECTION, +	NFTA_TRACE_CT_STATUS, +	NFTA_TRACE_CT_STATE,  	__NFTA_TRACE_MAX  };  #define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 6cd58cd2a6f0..50d807af2649 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,6 +25,8 @@ enum nfnetlink_groups {  #define NFNLGRP_ACCT_QUOTA		NFNLGRP_ACCT_QUOTA  	NFNLGRP_NFTRACE,  #define NFNLGRP_NFTRACE			NFNLGRP_NFTRACE +	NFNLGRP_NFT_DEV, +#define NFNLGRP_NFT_DEV			NFNLGRP_NFT_DEV  	__NFNLGRP_MAX,  };  #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27f..23c8deff8095 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv,  	 * but it is no problem since absolute verdict is issued by these.  	 */  	if (static_key_false(&xt_tee_enabled)) -		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); +		jumpstack += private->stacksize * current->in_nf_duplicate;  	e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd..ed08fb78cfa8 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,  	struct iphdr *iph;  	local_bh_disable(); -	if (this_cpu_read(nf_skb_duplicated)) +	if (current->in_nf_duplicate)  		goto out;  	/*  	 * Copy the skb, and route the copy. 
Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,  		--iph->ttl;  	if (nf_dup_ipv4_route(net, skb, gw, oif)) { -		__this_cpu_write(nf_skb_duplicated, true); +		current->in_nf_duplicate = true;  		ip_local_out(net, skb->sk, skb); -		__this_cpu_write(nf_skb_duplicated, false); +		current->in_nf_duplicate = false;  	} else {  		kfree_skb(skb);  	} diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9082ca17e845..7e7c49535e3f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,  	else  		addr = iph->saddr; -	*dst = inet_dev_addr_type(nft_net(pkt), dev, addr); +	if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { +		*dst = inet_dev_addr_type(nft_net(pkt), dev, addr); +		return; +	} + +	*dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);  }  EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,  	struct flowi4 fl4 = {  		.flowi4_scope = RT_SCOPE_UNIVERSE,  		.flowi4_iif = LOOPBACK_IFINDEX, +		.flowi4_proto = pkt->tprot,  		.flowi4_uid = sock_net_uid(nft_net(pkt), NULL), -		.flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),  	};  	const struct net_device *oif;  	const struct net_device *found; @@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,  	else  		oif = NULL; +	fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); +  	iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);  	if (!iph) {  		regs->verdict.code = NFT_BREAK; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae7..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff 
*skb,  	 * but it is no problem since absolute verdict is issued by these.  	 */  	if (static_key_false(&xt_tee_enabled)) -		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); +		jumpstack += private->stacksize * current->in_nf_duplicate;  	e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8..b903c62c00c9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,  		 const struct in6_addr *gw, int oif)  {  	local_bh_disable(); -	if (this_cpu_read(nf_skb_duplicated)) +	if (current->in_nf_duplicate)  		goto out;  	skb = pskb_copy(skb, GFP_ATOMIC);  	if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,  		--iph->hop_limit;  	}  	if (nf_dup_ipv6_route(net, skb, gw, oif)) { -		__this_cpu_write(nf_skb_duplicated, true); +		current->in_nf_duplicate = true;  		ip6_local_out(net, skb->sk, skb); -		__this_cpu_write(nf_skb_duplicated, false); +		current->in_nf_duplicate = false;  	} else {  		kfree_skb(skb);  	} diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 7fd9d7b21cd4..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,  		fl6->flowi6_mark = pkt->skb->mark;  	fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; +	fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);  	return lookup_flags;  } @@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,  	else if (priv->flags & NFTA_FIB_F_OIF)  		dev = nft_out(pkt); -	fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); -  	nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);  	if (dev && nf_ipv6_chk_addr(nft_net(pkt), 
&fl6.daddr, dev, true)) @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,  {  	const struct nft_fib *priv = nft_expr_priv(expr);  	int noff = skb_network_offset(pkt->skb); +	const struct net_device *found = NULL;  	const struct net_device *oif = NULL;  	u32 *dest = &regs->data[priv->dreg];  	struct ipv6hdr *iph, _iph; @@ -165,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,  		.flowi6_iif = LOOPBACK_IFINDEX,  		.flowi6_proto = pkt->tprot,  		.flowi6_uid = sock_net_uid(nft_net(pkt), NULL), -		.flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),  	};  	struct rt6_info *rt;  	int lookup_flags; @@ -203,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,  	if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))  		goto put_rt_err; -	if (oif && oif != rt->rt6i_idev->dev && -	    l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) -		goto put_rt_err; +	if (!oif) { +		found = rt->rt6i_idev->dev; +	} else { +		if (oif == rt->rt6i_idev->dev || +		    l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) +			found = oif; +	} -	nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); +	nft_fib_store_result(dest, priv, found);   put_rt_err:  	ip6_rt_put(rt);  } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c81..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@  const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;  EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); -  #ifdef CONFIG_JUMP_LABEL  struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];  EXPORT_SYMBOL(nf_hooks_needed); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index de8d50af9b5b..201d3c4ec623 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -505,6 +505,11 @@ u32 
nf_ct_get_id(const struct nf_conn *ct)  }  EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ +	return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} +  static void  clean_from_lists(struct nf_conn *ct)  { @@ -2710,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {  	.attach		= nf_conntrack_attach,  	.set_closing	= nf_conntrack_set_closing,  	.confirm	= __nf_conntrack_confirm, +	.get_id		= nf_conntrack_get_id,  };  void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@  #define NF_RECURSION_LIMIT	2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ +	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ +	return &current->net_xmit.nf_dup_skb_recursion; +} + +#endif  static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,  				enum nf_dev_hooks hook)  { -	if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) +	u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + +	if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)  		goto err;  	if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,  	skb->dev = dev;  	skb_clear_tstamp(skb); -	__this_cpu_inc(nf_dup_skb_recursion); +	(*nf_dup_skb_recursion)++;  	dev_queue_xmit(skb); -	__this_cpu_dec(nf_dup_skb_recursion); +	(*nf_dup_skb_recursion)--;  	return;  err:  	kfree_skb(skb); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b28f6730e26d..24c71ecb2179 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -300,40 +300,75 @@ void nf_tables_unbind_chain(const 
struct nft_ctx *ctx, struct nft_chain *chain)  static int nft_netdev_register_hooks(struct net *net,  				     struct list_head *hook_list)  { +	struct nf_hook_ops *ops;  	struct nft_hook *hook;  	int err, j;  	j = 0;  	list_for_each_entry(hook, hook_list, list) { -		err = nf_register_net_hook(net, &hook->ops); -		if (err < 0) -			goto err_register; +		list_for_each_entry(ops, &hook->ops_list, list) { +			err = nf_register_net_hook(net, ops); +			if (err < 0) +				goto err_register; -		j++; +			j++; +		}  	}  	return 0;  err_register:  	list_for_each_entry(hook, hook_list, list) { -		if (j-- <= 0) -			break; +		list_for_each_entry(ops, &hook->ops_list, list) { +			if (j-- <= 0) +				break; -		nf_unregister_net_hook(net, &hook->ops); +			nf_unregister_net_hook(net, ops); +		}  	}  	return err;  } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ +	struct nf_hook_ops *ops, *next; + +	list_for_each_entry_safe(ops, next, &hook->ops_list, list) { +		list_del(&ops->list); +		kfree(ops); +	} +} + +static void nft_netdev_hook_free(struct nft_hook *hook) +{ +	nft_netdev_hook_free_ops(hook); +	kfree(hook); +} + +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ +	struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + +	nft_netdev_hook_free(hook); +} + +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) +{ +	call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} +  static void nft_netdev_unregister_hooks(struct net *net,  					struct list_head *hook_list,  					bool release_netdev)  {  	struct nft_hook *hook, *next; +	struct nf_hook_ops *ops;  	list_for_each_entry_safe(hook, next, hook_list, list) { -		nf_unregister_net_hook(net, &hook->ops); +		list_for_each_entry(ops, &hook->ops_list, list) +			nf_unregister_net_hook(net, ops);  		if (release_netdev) {  			list_del(&hook->list); -			kfree_rcu(hook, rcu); +			nft_netdev_hook_free_rcu(hook);  		}  	}  } @@ -2253,7 +2288,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain)  	
		list_for_each_entry_safe(hook, next,  						 &basechain->hook_list, list) {  				list_del_rcu(&hook->list); -				kfree_rcu(hook, rcu); +				nft_netdev_hook_free_rcu(hook);  			}  		}  		module_put(basechain->type->owner); @@ -2274,19 +2309,20 @@ void nf_tables_chain_destroy(struct nft_chain *chain)  static struct nft_hook *nft_netdev_hook_alloc(struct net *net,  					      const struct nlattr *attr)  { +	struct nf_hook_ops *ops;  	struct net_device *dev;  	struct nft_hook *hook;  	int err;  	hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); -	if (!hook) { -		err = -ENOMEM; -		goto err_hook_alloc; -	} +	if (!hook) +		return ERR_PTR(-ENOMEM); + +	INIT_LIST_HEAD(&hook->ops_list);  	err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);  	if (err < 0) -		goto err_hook_dev; +		goto err_hook_free;  	hook->ifnamelen = nla_len(attr); @@ -2294,18 +2330,22 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,  	 * indirectly serializing all the other holders of the commit_mutex with  	 * the rtnl_mutex.  	 
*/ -	dev = __dev_get_by_name(net, hook->ifname); -	if (!dev) { -		err = -ENOENT; -		goto err_hook_dev; -	} -	hook->ops.dev = dev; +	for_each_netdev(net, dev) { +		if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) +			continue; +		ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); +		if (!ops) { +			err = -ENOMEM; +			goto err_hook_free; +		} +		ops->dev = dev; +		list_add_tail(&ops->list, &hook->ops_list); +	}  	return hook; -err_hook_dev: -	kfree(hook); -err_hook_alloc: +err_hook_free: +	nft_netdev_hook_free(hook);  	return ERR_PTR(err);  } @@ -2315,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,  	struct nft_hook *hook;  	list_for_each_entry(hook, hook_list, list) { -		if (!strcmp(hook->ifname, this->ifname)) +		if (!strncmp(hook->ifname, this->ifname, +			     min(hook->ifnamelen, this->ifnamelen)))  			return hook;  	} @@ -2345,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,  		}  		if (nft_hook_list_find(hook_list, hook)) {  			NL_SET_BAD_ATTR(extack, tmp); -			kfree(hook); +			nft_netdev_hook_free(hook);  			err = -EEXIST;  			goto err_hook;  		} @@ -2363,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,  err_hook:  	list_for_each_entry_safe(hook, next, hook_list, list) {  		list_del(&hook->list); -		kfree(hook); +		nft_netdev_hook_free(hook);  	}  	return err;  } @@ -2506,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)  	list_for_each_entry_safe(h, next, &hook->list, list) {  		list_del(&h->list); -		kfree(h); +		nft_netdev_hook_free(h);  	}  	module_put(hook->type->owner);  } @@ -2559,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,  			      struct nft_chain_hook *hook, u32 flags)  {  	struct nft_chain *chain; +	struct nf_hook_ops *ops;  	struct nft_hook *h;  	basechain->type = hook->type; @@ -2567,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,  	if 
(nft_base_chain_netdev(family, hook->num)) {  		list_splice_init(&hook->list, &basechain->hook_list); -		list_for_each_entry(h, &basechain->hook_list, list) -			nft_basechain_hook_init(&h->ops, family, hook, chain); +		list_for_each_entry(h, &basechain->hook_list, list) { +			list_for_each_entry(ops, &h->ops_list, list) +				nft_basechain_hook_init(ops, family, hook, chain); +		}  	}  	nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2787,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,  		if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {  			list_for_each_entry_safe(h, next, &hook.list, list) { -				h->ops.pf	= basechain->ops.pf; -				h->ops.hooknum	= basechain->ops.hooknum; -				h->ops.priority	= basechain->ops.priority; -				h->ops.priv	= basechain->ops.priv; -				h->ops.hook	= basechain->ops.hook; +				list_for_each_entry(ops, &h->ops_list, list) { +					ops->pf		= basechain->ops.pf; +					ops->hooknum	= basechain->ops.hooknum; +					ops->priority	= basechain->ops.priority; +					ops->priv	= basechain->ops.priv; +					ops->hook	= basechain->ops.hook; +				}  				if (nft_hook_list_find(&basechain->hook_list, h)) {  					list_del(&h->list); -					kfree(h); +					nft_netdev_hook_free(h);  				}  			}  		} else { @@ -2913,10 +2959,12 @@ err_trans:  err_hooks:  	if (nla[NFTA_CHAIN_HOOK]) {  		list_for_each_entry_safe(h, next, &hook.list, list) { -			if (unregister) -				nf_unregister_net_hook(ctx->net, &h->ops); +			if (unregister) { +				list_for_each_entry(ops, &h->ops_list, list) +					nf_unregister_net_hook(ctx->net, ops); +			}  			list_del(&h->list); -			kfree_rcu(h, rcu); +			nft_netdev_hook_free_rcu(h);  		}  		module_put(hook.type->owner);  	} @@ -8785,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,  				    struct netlink_ext_ack *extack, bool add)  {  	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; +	struct nf_hook_ops *ops;  	struct nft_hook *hook;  	
int hooknum, priority;  	int err; @@ -8839,11 +8888,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,  	}  	list_for_each_entry(hook, &flowtable_hook->list, list) { -		hook->ops.pf		= NFPROTO_NETDEV; -		hook->ops.hooknum	= flowtable_hook->num; -		hook->ops.priority	= flowtable_hook->priority; -		hook->ops.priv		= &flowtable->data; -		hook->ops.hook		= flowtable->data.type->hook; +		list_for_each_entry(ops, &hook->ops_list, list) { +			ops->pf		= NFPROTO_NETDEV; +			ops->hooknum	= flowtable_hook->num; +			ops->priority	= flowtable_hook->priority; +			ops->priv	= &flowtable->data; +			ops->hook	= flowtable->data.type->hook; +		}  	}  	return err; @@ -8885,12 +8936,12 @@ nft_flowtable_type_get(struct net *net, u8 family)  }  /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, -					  struct nft_flowtable *flowtable, -					  struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, +					 struct nft_flowtable *flowtable, +					 struct nf_hook_ops *ops)  { -	nf_unregister_net_hook(net, &hook->ops); -	flowtable->data.type->setup(&flowtable->data, hook->ops.dev, +	nf_unregister_net_hook(net, ops); +	flowtable->data.type->setup(&flowtable->data, ops->dev,  				    FLOW_BLOCK_UNBIND);  } @@ -8900,14 +8951,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,  					         bool release_netdev)  {  	struct nft_hook *hook, *next; +	struct nf_hook_ops *ops;  	list_for_each_entry_safe(hook, next, hook_list, list) { -		nf_unregister_net_hook(net, &hook->ops); -		flowtable->data.type->setup(&flowtable->data, hook->ops.dev, -					    FLOW_BLOCK_UNBIND); +		list_for_each_entry(ops, &hook->ops_list, list) +			nft_unregister_flowtable_ops(net, flowtable, ops);  		if (release_netdev) {  			list_del(&hook->list); -			kfree_rcu(hook, rcu); +			nft_netdev_hook_free_rcu(hook);  		}  	}  } @@ -8919,6 +8970,26 @@ static void nft_unregister_flowtable_net_hooks(struct net 
*net,  	__nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);  } +static int nft_register_flowtable_ops(struct net *net, +				      struct nft_flowtable *flowtable, +				      struct nf_hook_ops *ops) +{ +	int err; + +	err = flowtable->data.type->setup(&flowtable->data, +					  ops->dev, FLOW_BLOCK_BIND); +	if (err < 0) +		return err; + +	err = nf_register_net_hook(net, ops); +	if (!err) +		return 0; + +	flowtable->data.type->setup(&flowtable->data, +				    ops->dev, FLOW_BLOCK_UNBIND); +	return err; +} +  static int nft_register_flowtable_net_hooks(struct net *net,  					    struct nft_table *table,  					    struct list_head *hook_list, @@ -8926,6 +8997,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,  {  	struct nft_hook *hook, *next;  	struct nft_flowtable *ft; +	struct nf_hook_ops *ops;  	int err, i = 0;  	list_for_each_entry(hook, hook_list, list) { @@ -8939,33 +9011,27 @@ static int nft_register_flowtable_net_hooks(struct net *net,  			}  		} -		err = flowtable->data.type->setup(&flowtable->data, -						  hook->ops.dev, -						  FLOW_BLOCK_BIND); -		if (err < 0) -			goto err_unregister_net_hooks; +		list_for_each_entry(ops, &hook->ops_list, list) { +			err = nft_register_flowtable_ops(net, flowtable, ops); +			if (err < 0) +				goto err_unregister_net_hooks; -		err = nf_register_net_hook(net, &hook->ops); -		if (err < 0) { -			flowtable->data.type->setup(&flowtable->data, -						    hook->ops.dev, -						    FLOW_BLOCK_UNBIND); -			goto err_unregister_net_hooks; +			i++;  		} - -		i++;  	}  	return 0;  err_unregister_net_hooks:  	list_for_each_entry_safe(hook, next, hook_list, list) { -		if (i-- <= 0) -			break; +		list_for_each_entry(ops, &hook->ops_list, list) { +			if (i-- <= 0) +				break; -		nft_unregister_flowtable_hook(net, flowtable, hook); +			nft_unregister_flowtable_ops(net, flowtable, ops); +		}  		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +		nft_netdev_hook_free_rcu(hook);  	}  	return err; @@ 
-8977,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list)  	list_for_each_entry_safe(hook, next, hook_list, list) {  		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +		nft_netdev_hook_free_rcu(hook);  	}  } @@ -8988,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,  	const struct nlattr * const *nla = ctx->nla;  	struct nft_flowtable_hook flowtable_hook;  	struct nft_hook *hook, *next; +	struct nf_hook_ops *ops;  	struct nft_trans *trans;  	bool unregister = false;  	u32 flags; @@ -9001,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,  	list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {  		if (nft_hook_list_find(&flowtable->hook_list, hook)) {  			list_del(&hook->list); -			kfree(hook); +			nft_netdev_hook_free(hook);  		}  	} @@ -9045,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,  err_flowtable_update_hook:  	list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { -		if (unregister) -			nft_unregister_flowtable_hook(ctx->net, flowtable, hook); +		if (unregister) { +			list_for_each_entry(ops, &hook->ops_list, list) +				nft_unregister_flowtable_ops(ctx->net, +							     flowtable, ops); +		}  		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +		nft_netdev_hook_free_rcu(hook);  	}  	return err; @@ -9194,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook  	list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {  		list_del(&this->list); -		kfree(this); +		nft_netdev_hook_free(this);  	}  } @@ -9557,7 +9627,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)  	flowtable->data.type->free(&flowtable->data);  	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {  		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +		nft_netdev_hook_free_rcu(hook);  	}  	kfree(flowtable->name);  	
module_put(flowtable->data.type->owner); @@ -9590,46 +9660,190 @@ nla_put_failure:  	return -EMSGSIZE;  } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, -				struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, +				      const struct net_device *dev) +{ +	struct nf_hook_ops *ops; + +	list_for_each_entry(ops, &hook->ops_list, list) { +		if (ops->dev == dev) +			return ops; +	} +	return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, +					  const struct net_device *dev)  { +	struct nf_hook_ops *ops; + +	list_for_each_entry_rcu(ops, &hook->ops_list, list) { +		if (ops->dev == dev) +			return ops; +	} +	return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static void +nf_tables_device_notify(const struct nft_table *table, int attr, +			const char *name, const struct nft_hook *hook, +			const struct net_device *dev, int event) +{ +	struct net *net = dev_net(dev); +	struct nlmsghdr *nlh; +	struct sk_buff *skb; +	u16 flags = 0; + +	if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) +		return; + +	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!skb) +		goto err; + +	event = event == NETDEV_REGISTER ? 
NFT_MSG_NEWDEV : NFT_MSG_DELDEV; +	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); +	nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, +			   NFNETLINK_V0, nft_base_seq(net)); +	if (!nlh) +		goto err; + +	if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || +	    nla_put_string(skb, attr, name) || +	    nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || +	    nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) +		goto err; + +	nlmsg_end(skb, nlh); +	nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, +		       nlmsg_report(nlh), GFP_KERNEL); +	return; +err: +	if (skb) +		kfree_skb(skb); +	nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); +} + +void +nf_tables_chain_device_notify(const struct nft_chain *chain, +			      const struct nft_hook *hook, +			      const struct net_device *dev, int event) +{ +	nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, +				chain->name, hook, dev, event); +} + +static void +nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, +				  const struct nft_hook *hook, +				  const struct net_device *dev, int event) +{ +	nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, +				ft->name, hook, dev, event); +} + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, +			       struct nft_flowtable *flowtable, bool changename) +{ +	struct nf_hook_ops *ops;  	struct nft_hook *hook; +	bool match;  	list_for_each_entry(hook, &flowtable->hook_list, list) { -		if (hook->ops.dev != dev) -			continue; +		ops = nft_hook_find_ops(hook, dev); +		match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); -		/* flow_offload_netdev_event() cleans up entries for us. 
*/ -		nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); -		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +		switch (event) { +		case NETDEV_UNREGISTER: +			/* NOP if not found or new name still matching */ +			if (!ops || (changename && match)) +				continue; + +			/* flow_offload_netdev_event() cleans up entries for us. */ +			nft_unregister_flowtable_ops(dev_net(dev), +						     flowtable, ops); +			list_del_rcu(&ops->list); +			kfree_rcu(ops, rcu); +			break; +		case NETDEV_REGISTER: +			/* NOP if not matching or already registered */ +			if (!match || (changename && ops)) +				continue; + +			ops = kzalloc(sizeof(struct nf_hook_ops), +				      GFP_KERNEL_ACCOUNT); +			if (!ops) +				return 1; + +			ops->pf		= NFPROTO_NETDEV; +			ops->hooknum	= flowtable->hooknum; +			ops->priority	= flowtable->data.priority; +			ops->priv	= &flowtable->data; +			ops->hook	= flowtable->data.type->hook; +			ops->dev	= dev; +			if (nft_register_flowtable_ops(dev_net(dev), +						       flowtable, ops)) { +				kfree(ops); +				return 1; +			} +			list_add_tail_rcu(&ops->list, &hook->ops_list); +			break; +		} +		nf_tables_flowtable_device_notify(flowtable, hook, dev, event);  		break;  	} +	return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, +				       struct net_device *dev, +				       bool changename) +{ +	struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); +	struct nft_flowtable *flowtable; +	struct nft_table *table; + +	list_for_each_entry(table, &nft_net->tables, list) { +		list_for_each_entry(flowtable, &table->flowtables, list) { +			if (nft_flowtable_event(event, dev, +						flowtable, changename)) +				return 1; +		} +	} +	return 0;  }  static int nf_tables_flowtable_event(struct notifier_block *this,  				     unsigned long event, void *ptr)  {  	struct net_device *dev = netdev_notifier_info_to_dev(ptr); -	struct nft_flowtable *flowtable;  	struct nftables_pernet *nft_net; -	struct nft_table *table; +	int ret = 
NOTIFY_DONE;  	struct net *net; -	if (event != NETDEV_UNREGISTER) -		return 0; +	if (event != NETDEV_REGISTER && +	    event != NETDEV_UNREGISTER && +	    event != NETDEV_CHANGENAME) +		return NOTIFY_DONE;  	net = dev_net(dev);  	nft_net = nft_pernet(net);  	mutex_lock(&nft_net->commit_mutex); -	list_for_each_entry(table, &nft_net->tables, list) { -		list_for_each_entry(flowtable, &table->flowtables, list) { -			nft_flowtable_event(event, dev, flowtable); + +	if (event == NETDEV_CHANGENAME) { +		if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { +			ret = NOTIFY_BAD; +			goto out_unlock;  		} +		__nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); +	} else if (__nf_tables_flowtable_event(event, dev, false)) { +		ret = NOTIFY_BAD;  	} +out_unlock:  	mutex_unlock(&nft_net->commit_mutex); - -	return NOTIFY_DONE; +	return ret;  }  static struct notifier_block nf_tables_flowtable_notifier = { diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64675f1c7f29..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain)  bool nft_chain_offload_support(const struct nft_base_chain *basechain)  { +	struct nf_hook_ops *ops;  	struct net_device *dev;  	struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain)  		return false;  	list_for_each_entry(hook, &basechain->hook_list, list) { -		if (hook->ops.pf != NFPROTO_NETDEV || -		    hook->ops.hooknum != NF_NETDEV_INGRESS) -			return false; - -		dev = hook->ops.dev; -		if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) -			return false; +		list_for_each_entry(ops, &hook->ops_list, list) { +			if (ops->pf != NFPROTO_NETDEV || +			    ops->hooknum != NF_NETDEV_INGRESS) +				return false; + +			dev = ops->dev; +			if (!dev->netdev_ops->ndo_setup_tc && +			    
!flow_indr_dev_exists()) +				return false; +		}  	}  	return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain,  				const struct net_device *this_dev,  				enum flow_block_command cmd)  { -	struct net_device *dev; +	struct nf_hook_ops *ops;  	struct nft_hook *hook;  	int err, i = 0;  	list_for_each_entry(hook, &basechain->hook_list, list) { -		dev = hook->ops.dev; -		if (this_dev && this_dev != dev) -			continue; +		list_for_each_entry(ops, &hook->ops_list, list) { +			if (this_dev && this_dev != ops->dev) +				continue; -		err = nft_chain_offload_cmd(basechain, dev, cmd); -		if (err < 0 && cmd == FLOW_BLOCK_BIND) { -			if (!this_dev) -				goto err_flow_block; +			err = nft_chain_offload_cmd(basechain, ops->dev, cmd); +			if (err < 0 && cmd == FLOW_BLOCK_BIND) { +				if (!this_dev) +					goto err_flow_block; -			return err; +				return err; +			} +			i++;  		} -		i++;  	}  	return 0;  err_flow_block:  	list_for_each_entry(hook, &basechain->hook_list, list) { -		if (i-- <= 0) -			break; +		list_for_each_entry(ops, &hook->ops_list, list) { +			if (i-- <= 0) +				break; -		dev = hook->ops.dev; -		nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); +			nft_chain_offload_cmd(basechain, ops->dev, +					      FLOW_BLOCK_UNBIND); +		}  	}  	return err;  } @@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n  			found = NULL;  			basechain = nft_base_chain(chain);  			list_for_each_entry(hook, &basechain->hook_list, list) { -				if (hook->ops.dev != dev) +				if (!nft_hook_find_ops(hook, dev))  					continue;  				found = hook; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 580c55268f65..ae3fe87195ab 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@  #include <linux/netfilter.h>  #include <linux/netfilter/nfnetlink.h>  #include <linux/netfilter/nf_tables.h> +#include 
<net/netfilter/nf_conntrack.h>  #include <net/netfilter/nf_tables_core.h>  #include <net/netfilter/nf_tables.h> @@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,  	return 0;  } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, +				 const struct sk_buff *skb) +{ +	const struct nf_ct_hook *ct_hook; +	enum ip_conntrack_info ctinfo; +	const struct nf_conn *ct; +	u32 state; + +	ct_hook = rcu_dereference(nf_ct_hook); +	if (!ct_hook) +		return 0; + +	ct = nf_ct_get(skb, &ctinfo); +	if (!ct) { +		if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ +			return 0; + +		state = NF_CT_STATE_UNTRACKED_BIT; +	} else { +		state = NF_CT_STATE_BIT(ctinfo); +	} + +	if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) +		return -1; + +	if (ct) { +		u32 id = ct_hook->get_id(&ct->ct_general); +		u32 status = READ_ONCE(ct->status); +		u8 dir = CTINFO2DIR(ctinfo); + +		if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) +			return -1; + +		if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) +			return -1; + +		if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) +			return -1; +	} + +	return 0; +} +  static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,  				  const struct nft_pktinfo *pkt)  { @@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,  		nla_total_size(sizeof(__be32)) +	/* trace type */  		nla_total_size(0) +			/* VERDICT, nested */  			nla_total_size(sizeof(u32)) +	/* verdict code */ -		nla_total_size(sizeof(u32)) +		/* id */ +		nla_total_size(sizeof(u32)) +		/* ct id */ +		nla_total_size(sizeof(u8)) +		/* ct direction */ +		nla_total_size(sizeof(u32)) +		/* ct state */ +		nla_total_size(sizeof(u32)) +		/* ct status */ +		nla_total_size(sizeof(u32)) +		/* trace id */  		nla_total_size(NFT_TRACETYPE_LL_HSIZE) +  		nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +  		nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +339,10 @@ void nft_trace_notify(const struct 
nft_pktinfo *pkt,  		if (nf_trace_fill_pkt_info(skb, pkt))  			goto nla_put_failure; + +		if (nf_trace_fill_ct_info(skb, pkt->skb)) +			goto nla_put_failure; +  		info->packet_dumped = true;  	} diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..ac77fc21632d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {  	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,  	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,  	[NFNLGRP_NFTRACE]		= NFNL_SUBSYS_NFTABLES, +	[NFNLGRP_NFT_DEV]		= NFNL_SUBSYS_NFTABLES,  };  static struct nfnl_net *nfnl_pernet(struct net *net) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 19a553550c76..846d48ba8965 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,38 +318,68 @@ static const struct nft_chain_type nft_chain_filter_netdev = {  	},  }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, -			     struct nft_base_chain *basechain) +static int nft_netdev_event(unsigned long event, struct net_device *dev, +			    struct nft_base_chain *basechain, bool changename)  { +	struct nft_table *table = basechain->chain.table; +	struct nf_hook_ops *ops;  	struct nft_hook *hook; +	bool match;  	list_for_each_entry(hook, &basechain->hook_list, list) { -		if (hook->ops.dev != dev) -			continue; +		ops = nft_hook_find_ops(hook, dev); +		match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); + +		switch (event) { +		case NETDEV_UNREGISTER: +			/* NOP if not found or new name still matching */ +			if (!ops || (changename && match)) +				continue; + +			if (!(table->flags & NFT_TABLE_F_DORMANT)) +				nf_unregister_net_hook(dev_net(dev), ops); -		if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) -			nf_unregister_net_hook(dev_net(dev), &hook->ops); +			list_del_rcu(&ops->list); +			kfree_rcu(ops, rcu); +			break; +		case 
NETDEV_REGISTER: +			/* NOP if not matching or already registered */ +			if (!match || (changename && ops)) +				continue; -		list_del_rcu(&hook->list); -		kfree_rcu(hook, rcu); +			ops = kmemdup(&basechain->ops, +				      sizeof(struct nf_hook_ops), +				      GFP_KERNEL_ACCOUNT); +			if (!ops) +				return 1; + +			ops->dev = dev; + +			if (!(table->flags & NFT_TABLE_F_DORMANT) && +			    nf_register_net_hook(dev_net(dev), ops)) { +				kfree(ops); +				return 1; +			} +			list_add_tail_rcu(&ops->list, &hook->ops_list); +			break; +		} +		nf_tables_chain_device_notify(&basechain->chain, +					      hook, dev, event);  		break;  	} +	return 0;  } -static int nf_tables_netdev_event(struct notifier_block *this, -				  unsigned long event, void *ptr) +static int __nf_tables_netdev_event(unsigned long event, +				    struct net_device *dev, +				    bool changename)  { -	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct nft_base_chain *basechain;  	struct nftables_pernet *nft_net;  	struct nft_chain *chain;  	struct nft_table *table; -	if (event != NETDEV_UNREGISTER) -		return NOTIFY_DONE; -  	nft_net = nft_pernet(dev_net(dev)); -	mutex_lock(&nft_net->commit_mutex);  	list_for_each_entry(table, &nft_net->tables, list) {  		if (table->family != NFPROTO_NETDEV &&  		    table->family != NFPROTO_INET) @@ -364,12 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this,  			    basechain->ops.hooknum != NF_INET_INGRESS)  				continue; -			nft_netdev_event(event, dev, basechain); +			if (nft_netdev_event(event, dev, basechain, changename)) +				return 1;  		}  	} -	mutex_unlock(&nft_net->commit_mutex); +	return 0; +} + +static int nf_tables_netdev_event(struct notifier_block *this, +				  unsigned long event, void *ptr) +{ +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct nftables_pernet *nft_net; +	int ret = NOTIFY_DONE; -	return NOTIFY_DONE; +	if (event != NETDEV_REGISTER && +	    event != NETDEV_UNREGISTER && +	    
event != NETDEV_CHANGENAME) +		return NOTIFY_DONE; + +	nft_net = nft_pernet(dev_net(dev)); +	mutex_lock(&nft_net->commit_mutex); + +	if (event == NETDEV_CHANGENAME) { +		if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { +			ret = NOTIFY_BAD; +			goto out_unlock; +		} +		__nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); +	} else if (__nf_tables_netdev_event(event, dev, false)) { +		ret = NOTIFY_BAD; +	} +out_unlock: +	mutex_unlock(&nft_net->commit_mutex); +	return ret;  }  static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 221d50223018..225ff293cd50 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev,  	bool found = false;  	list_for_each_entry_rcu(hook, &ft->hook_list, list) { -		if (hook->ops.dev != dev) +		if (!nft_hook_find_ops_rcu(hook, dev))  			continue;  		found = true; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 817ab978d24a..c4569d4b9228 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@  #include <linux/ip.h>  #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { +	struct nft_inner_tun_ctx ctx; +	local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { +	.bh_lock = INIT_LOCAL_LOCK(bh_lock), +};  /* Same layout as nft_expr but it embeds the private expression data area. 
struct __nft_expr { @@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,  	struct nft_inner_tun_ctx *this_cpu_tun_ctx;  	local_bh_disable(); -	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); +	local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); +	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);  	if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { +		local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);  		local_bh_enable();  		return false;  	}  	*tun_ctx = *this_cpu_tun_ctx; +	local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);  	local_bh_enable();  	return true; @@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,  	struct nft_inner_tun_ctx *this_cpu_tun_ctx;  	local_bh_disable(); -	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); +	local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); +	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);  	if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)  		*this_cpu_tun_ctx = *tun_ctx; +	local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);  	local_bh_enable();  } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 0c63d1367cf7..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -621,10 +621,10 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,  		struct geneve_opt *opt;  		int offset = 0; -		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); -		if (!inner) -			goto failure;  		while (opts->len > offset) { +			inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); +			if (!inner) +				goto failure;  			opt = (struct geneve_opt *)(opts->u.data + offset);  			if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,  					 opt->opt_class) || @@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,  				    opt->length * 4, opt->opt_data))  				goto inner_failure;  			offset += sizeof(*opt) + opt->length * 4; +			nla_nest_end(skb, inner);  		} -		
nla_nest_end(skb, inner);  	}  	nla_nest_end(skb, nest);  	return 0; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)  	return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));  } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  static unsigned int  tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {  		.targetsize = sizeof(struct xt_tcpoptstrip_target_info),  		.me         = THIS_MODULE,  	}, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)  	{  		.name       = "TCPOPTSTRIP",  		.family     = NFPROTO_IPV6, diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 65b965ca40ea..59b9d04400ca 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {  		.targetsize     = sizeof(struct xt_mark_tginfo2),  		.me             = THIS_MODULE,  	}, -#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)  	{  		.name           = "MARK",  		.revision       = 2, diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 3bdcbbdba925..e9b2f553588d 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -24,6 +24,7 @@ TEST_PROGS += nft_concat_range.sh  TEST_PROGS += nft_conntrack_helper.sh  TEST_PROGS += nft_fib.sh  TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_interface_stress.sh  TEST_PROGS += nft_meta.sh  TEST_PROGS += nft_nat.sh  TEST_PROGS += nft_nat_zones.sh diff --git 
a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index 025b58f2ae91..207b79932d91 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -32,7 +32,6 @@ source lib.sh  IP0=172.30.30.1  IP1=172.30.30.2 -DUMMYNET=10.9.9  PFXL=30  ret=0 @@ -52,8 +51,6 @@ trap cleanup EXIT  setup_ns ns0 ns1 -ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.forwarding=1 -  if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then  	echo "SKIP: Could not add veth device"  	exit $ksft_skip @@ -64,18 +61,13 @@ if ! ip -net "$ns0" li add tvrf type vrf table 9876; then  	exit $ksft_skip  fi -ip -net "$ns0" link add dummy0 type dummy -  ip -net "$ns0" li set veth0 master tvrf -ip -net "$ns0" li set dummy0 master tvrf  ip -net "$ns0" li set tvrf up  ip -net "$ns0" li set veth0 up -ip -net "$ns0" li set dummy0 up  ip -net "$ns1" li set veth0 up  ip -net "$ns0" addr add $IP0/$PFXL dev veth0  ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip -net "$ns0" addr add $DUMMYNET.1/$PFXL dev dummy0  listener_ready()  { @@ -216,35 +208,9 @@ EOF  	fi  } -test_fib() -{ -ip netns exec "$ns0" nft -f - <<EOF -flush ruleset -table ip t { -	counter fibcount { } - -	chain prerouting { -		type filter hook prerouting priority 0; -		meta iifname veth0 ip daddr $DUMMYNET.2 fib daddr oif dummy0 counter name fibcount notrack -	} -} -EOF -	ip -net "$ns1" route add 10.9.9.0/24 via "$IP0" dev veth0 -	ip netns exec "$ns1" ping -q -w 1 -c 1 "$DUMMYNET".2 > /dev/null - -	if ip netns exec "$ns0" nft list counter t fibcount | grep -q "packets 1"; then -		echo "PASS: fib lookup returned exepected output interface" -	else -		echo "FAIL: fib lookup did not return exepected output interface" -		ret=1 -		return -	fi -} -  test_ct_zone_in  test_masquerade_vrf "default"  test_masquerade_vrf "pfifo"  test_masquerade_veth -test_fib  exit $ret diff --git 
a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 1f5979c1510c..efea93cf23d4 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -15,10 +15,12 @@ source lib.sh  # Available test groups:  # - reported_issues: check for issues that were reported in the past  # - correctness: check that packets match given entries, and only those +# - correctness_large: same but with additional non-matching entries  # - concurrency: attempt races between insertion, deletion and lookup  # - timeout: check that packets match entries until they expire  # - performance: estimate matching rate, compare with rbtree and hash baselines -TESTS="reported_issues correctness concurrency timeout" +TESTS="reported_issues correctness correctness_large concurrency timeout" +  [ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}"  # Set types, defined by TYPE_ variables below @@ -1257,9 +1259,7 @@ send_nomatch() {  # - add ranged element, check that packets match it  # - check that packets outside range don't match it  # - remove some elements, check that packets don't match anymore -test_correctness() { -	setup veth send_"${proto}" set || return ${ksft_skip} - +test_correctness_main() {  	range_size=1  	for i in $(seq "${start}" $((start + count))); do  		end=$((start + range_size)) @@ -1293,6 +1293,163 @@ test_correctness() {  	done  } +test_correctness() { +	setup veth send_"${proto}" set || return ${ksft_skip} + +	test_correctness_main +} + +# Repeat the correctness tests, but add extra non-matching entries. +# This exercises the more compact '4 bit group' representation that +# gets picked when the default 8-bit representation exceeds +# NFT_PIPAPO_LT_SIZE_HIGH bytes of memory. +# See usage of NFT_PIPAPO_LT_SIZE_HIGH in pipapo_lt_bits_adjust(). 
+# +# The format() helper is way too slow when generating lots of +# entries so it's not used here. +test_correctness_large() { +	setup veth send_"${proto}" set || return ${ksft_skip} +	# number of dummy (filler) entries to add. +	local dcount=16385 + +	( +	echo -n "add element inet filter test { " + +	case "$type_spec" in +	"ether_addr . ipv4_addr") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_mac $((1000000 + i)) +			printf ". 172.%i.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) +		done +		;; +	"inet_proto . ipv6_addr") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "%i . " $((RANDOM%256)) +			format_addr6 $((1000000 + i)) +		done +		;; +	"inet_service . inet_proto") +		# smaller key sizes, need more entries to hit the +		# 4-bit threshold. +		dcount=65536 +		for i in $(seq 1 $dcount); do +			local proto=$((RANDOM%256)) + +			# Test uses UDP to match, as it also fails when matching +			# an entry that doesn't exist, so skip 'udp' entries +			# to not trigger a wrong failure. +			[ $proto -eq 17 ] && proto=18 +			[ $i -gt 1 ] && echo ", " +			printf "%i . %i " $(((i%65534) + 1)) $((proto)) +		done +		;; +	"inet_service . ipv4_addr") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "%i . 172.%i.%i.%i " $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((i%256)) +		done +		;; +	"ipv4_addr . ether_addr") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) +			format_mac $((1000000 + i)) +		done +		;; +	"ipv4_addr . inet_service") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . %i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) +		done +		;; +	"ipv4_addr . inet_service . ether_addr . inet_proto . 
ipv4_addr") +		dcount=65536 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . %i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) +			format_mac $((1000000 + i)) +			printf ". %i . 192.168.%i.%i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) +		done +		;; +	"ipv4_addr . inet_service . inet_proto") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . %i . %i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) +		done +		;; +	"ipv4_addr . inet_service . inet_proto . ipv4_addr") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . %i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256)) +		done +		;; +	"ipv4_addr . inet_service . ipv4_addr") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			printf "172.%i.%i.%i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) +		done +		;; +	"ipv6_addr . ether_addr") +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_addr6 $((i + 1000000)) +			echo -n " . " +			format_mac $((1000000 + i)) +		done +		;; +	"ipv6_addr . inet_service") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_addr6 $((i + 1000000)) +			echo -n " .  $(((RANDOM%65534) + 1))" +		done +		;; +	"ipv6_addr . inet_service . ether_addr") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_addr6 $((i + 1000000)) +			echo -n " .  $(((RANDOM%65534) + 1)) . " +			format_mac $((i + 1000000)) +		done +		;; +	"ipv6_addr . inet_service . ether_addr . inet_proto") +		dcount=65536 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_addr6 $((i + 1000000)) +			echo -n " .  $(((RANDOM%65534) + 1)) . 
" +			format_mac $((i + 1000000)) +			echo -n " .  $((RANDOM%256))" +		done +		;; +	"ipv6_addr . inet_service . ipv6_addr . inet_service") +		dcount=32768 +		for i in $(seq 1 $dcount); do +			[ $i -gt 1 ] && echo ", " +			format_addr6 $((i + 1000000)) +			echo -n " .  $(((RANDOM%65534) + 1)) . " +			format_addr6 $((i + 2123456)) +			echo -n " .  $((RANDOM%256))" +		done +		;; +	*) +		"Unhandled $type_spec" +		return 1 +	esac +	echo -n "}" + +	) | nft -f - || return 1 + +	test_correctness_main +} +  # Concurrency test template:  # - add all the elements  # - start a thread for each physical thread that: diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh index 82780b39277c..9929a9ffef65 100755 --- a/tools/testing/selftests/net/netfilter/nft_fib.sh +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -3,6 +3,10 @@  # This tests the fib expression.  #  # Kselftest framework requirement - SKIP code is 4. +# +#  10.0.1.99     10.0.1.1           10.0.2.1         10.0.2.99 +# dead:1::99    dead:1::1          dead:2::1        dead:2::99 +# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2  source lib.sh @@ -72,6 +76,89 @@ table inet filter {  EOF  } +load_type_ruleset() { +	local netns=$1 + +	for family in ip ip6;do +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table $family filter { +	chain type_match_in { +		fib daddr type local counter comment "daddr configured on other iface" +		fib daddr . iif type local counter comment "daddr configured on iif" +		fib daddr type unicast counter comment "daddr not local" +		fib daddr . iif type unicast counter comment "daddr not configured on iif" +	} + +	chain type_match_out { +		fib daddr type unicast counter +		fib daddr . oif type unicast counter +		fib daddr type local counter +		fib daddr . 
oif type local counter +	} + +	chain prerouting { +		type filter hook prerouting priority 0; +		icmp type echo-request counter jump type_match_in +		icmpv6 type echo-request counter jump type_match_in +	} + +	chain input { +		type filter hook input priority 0; +		icmp type echo-request counter jump type_match_in +		icmpv6 type echo-request counter jump type_match_in +	} + +	chain forward { +		type filter hook forward priority 0; +		icmp type echo-request counter jump type_match_in +		icmpv6 type echo-request counter jump type_match_in +	} + +	chain output { +		type filter hook output priority 0; +		icmp type echo-request counter jump type_match_out +		icmpv6 type echo-request counter jump type_match_out +	} + +	chain postrouting { +		type filter hook postrouting priority 0; +		icmp type echo-request counter jump type_match_out +		icmpv6 type echo-request counter jump type_match_out +	} +} +EOF +done +} + +reload_type_ruleset() { +	ip netns exec "$1" nft flush table ip filter +	ip netns exec "$1" nft flush table ip6 filter +	load_type_ruleset "$1" +} + +check_fib_type_counter_family() { +	local family="$1" +	local want="$2" +	local ns="$3" +	local chain="$4" +	local what="$5" +	local errmsg="$6" + +	if ! 
ip netns exec "$ns" nft list chain "$family" filter "$chain" | grep "$what" | grep -q "packets $want";then +		echo "Netns $ns $family fib type counter doesn't match expected packet count of $want for $what $errmsg" 1>&2 +		ip netns exec "$ns" nft list chain "$family" filter "$chain" +		ret=1 +		return 1 +	fi + +	return 0 +} + +check_fib_type_counter() { +	check_fib_type_counter_family "ip" "$@" || return 1 +	check_fib_type_counter_family "ip6" "$@" || return 1 +} +  load_ruleset_count() {  	local netns=$1 @@ -90,6 +177,7 @@ check_drops() {  	if dmesg | grep -q ' nft_rpfilter: ';then  		dmesg | grep ' nft_rpfilter: '  		echo "FAIL: rpfilter did drop packets" +		ret=1  		return 1  	fi @@ -164,17 +252,496 @@ test_ping() {    return 0  } +test_ping_unreachable() { +  local daddr4=$1 +  local daddr6=$2 + +  if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr4" > /dev/null; then +	echo "FAIL: ${ns1} could reach $daddr4" 1>&2 +	return 1 +  fi + +  if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr6" > /dev/null; then +	echo "FAIL: ${ns1} could reach $daddr6" 1>&2 +	return 1 +  fi + +  return 0 +} + +test_fib_type() { +	local notice="$1" +	local errmsg="addr-on-if" +	local lret=0 + +	if ! load_type_ruleset "$nsrouter";then +		echo "SKIP: Could not load fib type ruleset" +		[ $ret -eq 0 ] && ret=$ksft_skip +		return +	fi + +	# makes router receive packet for addresses configured on incoming +	# interface. +	test_ping 10.0.1.1 dead:1::1 || return 1 + +	# expectation: triggers all 'local' in prerouting/input. +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1 +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type local" "$errmsg" || lret=1 + +	reload_type_ruleset "$nsrouter" +	# makes router receive packet for address configured on a different (but local) +	# interface. +	test_ping 10.0.2.1 dead:2::1 || return 1 + +	# expectation: triggers 'unicast' in prerouting/input for daddr . iif and local for 'daddr'. 
+	errmsg="addr-on-host" +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1 +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1 + +	reload_type_ruleset "$nsrouter" +	test_ping 10.0.2.99 dead:2::99 || return 1 +	errmsg="addr-on-otherhost" +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type unicast" "$errmsg" || lret=1 +	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1 + +	if [ $lret -eq 0 ];then +		echo "PASS: fib expression address types match ($notice)" +	else +		echo "FAIL: fib expression address types match ($notice)" +		ret=1 +	fi +} + +test_fib_vrf_dev_add_dummy() +{ +	if ! ip -net "$nsrouter" link add dummy0 type dummy ;then +		echo "SKIP: VRF tests: dummy device type not supported" +		return 1 +	fi + +	if ! ip -net "$nsrouter" link add tvrf type vrf table 9876;then +		echo "SKIP: VRF tests: vrf device type not supported" +		return 1 +	fi + +	ip -net "$nsrouter" link set dummy0 master tvrf +	ip -net "$nsrouter" link set dummy0 up +	ip -net "$nsrouter" link set tvrf up +} + +load_ruleset_vrf() +{ +# Due to the many different possible combinations using named counters +# or one-rule-per-expected-result is complex. +# +# Instead, add dynamic sets for the fib modes +# (fib address type, fib output interface lookup .. ), +# and then add the obtained fib results to them. +# +# The test is successful if the sets contain the expected results +# and no unexpected extra entries existed. +ip netns exec "$nsrouter" nft -f - <<EOF +flush ruleset +table inet t { +	set fibif4 { +		typeof meta iif . ip daddr . fib daddr oif +		flags dynamic +		counter +	} + +	set fibif4iif { +		typeof meta iif . ip daddr . fib daddr . iif oif +		flags dynamic +		counter +	} + +	set fibif6 { +		typeof meta iif . ip6 daddr . fib daddr oif +		flags dynamic +		counter +	} + +	set fibif6iif { +		typeof meta iif . ip6 daddr . 
fib daddr . iif oif
+		flags dynamic
+		counter
+	}
+
+	set fibtype4 {
+		typeof meta iif . ip daddr . fib daddr type
+		flags dynamic
+		counter
+	}
+
+	set fibtype4iif {
+		typeof meta iif . ip daddr . fib daddr . iif type
+		flags dynamic
+		counter
+	}
+
+	set fibtype6 {
+		typeof meta iif . ip6 daddr . fib daddr type
+		flags dynamic
+		counter
+	}
+
+	set fibtype6iif {
+		typeof meta iif . ip6 daddr . fib daddr . iif type
+		flags dynamic
+		counter
+	}
+
+	chain fib_test {
+		meta nfproto ipv4 jump {
+			add @fibif4 { meta iif . ip daddr . fib daddr oif }
+			add @fibif4iif { meta iif . ip daddr . fib daddr . iif oif }
+			add @fibtype4 { meta iif . ip daddr . fib daddr type }
+			add @fibtype4iif { meta iif . ip daddr . fib daddr . iif type }
+
+			add @fibif4 { meta iif . ip saddr . fib saddr oif }
+			add @fibif4iif { meta iif . ip saddr . fib saddr . iif oif }
+		}
+
+		meta nfproto ipv6 jump {
+			add @fibif6    { meta iif . ip6 daddr . fib daddr oif }
+			add @fibif6iif { meta iif . ip6 daddr . fib daddr . iif oif }
+			add @fibtype6    { meta iif . ip6 daddr . fib daddr type }
+			add @fibtype6iif { meta iif . ip6 daddr . fib daddr . iif type }
+
+			add @fibif6 { meta iif . ip6 saddr . fib saddr oif }
+			add @fibif6iif { meta iif . ip6 saddr . fib saddr . iif oif }
+		}
+	}
+
+	chain prerouting {
+		type filter hook prerouting priority 0;
+		icmp type echo-request counter jump fib_test
+
+		# neighbour discovery to be ignored.
+		icmpv6 type echo-request counter jump fib_test
+	}
+}
+EOF
+
+if [ $? -ne 0 ] ;then
+	echo "SKIP: Could not load ruleset for fib vrf test"
+	[ $ret -eq 0 ] && ret=$ksft_skip
+	return 1
+fi
+}
+
+# check_type SETNAME IIFNAME ADDR RESULT [COUNT]
+#
+# Look up the element "IIFNAME . ADDR . RESULT" in dynamic set SETNAME of
+# table 'inet t' in $nsrouter and require its packet counter to be COUNT
+# (default 1).  On success the element is deleted again.  On any failure
+# the set is dumped, the global 'ret' is set to 1 and 1 is returned.
+check_type()
+{
+	local setname="$1"
+	local iifname="$2"
+	local addr="$3"
+	local type="$4"
+	local count="$5"
+
+	[ -z "$count" ] && count=1
+
+	# Anchor on the "bytes" keyword that follows the packet count so that
+	# e.g. an expected count of 1 does not also accept "counter packets 10".
+	if ! ip netns exec "$nsrouter" nft get element inet t "$setname" { "$iifname" . "$addr" . "$type" } |grep -q "counter packets $count bytes";then
+		echo "FAIL: did not find $iifname . $addr . $type in $setname"
+		ip netns exec "$nsrouter" nft list set inet t "$setname"
+		ret=1
+		return 1
+	fi
+
+	# delete the entry, this allows to check if anything unexpected appeared
+	# at the end of the test run: all dynamic sets should be empty by then.
+	if ! ip netns exec "$nsrouter" nft delete element inet t "$setname" { "$iifname" . "$addr" . "$type" } ; then
+		echo "FAIL: can't delete $iifname . $addr . $type in $setname"
+		ip netns exec "$nsrouter" nft list set inet t "$setname"
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+# check_local SETNAME IIFNAME ADDR: check_type with result "local", count 1.
+# Arguments are forwarded quoted so each one stays a single word.
+check_local()
+{
+	check_type "$@" "local" 1
+}
+
+# check_unicast SETNAME IIFNAME ADDR: check_type with result "unicast", count 1.
+check_unicast()
+{
+	check_type "$@" "unicast" 1
+}
+
+# check_rpf SETNAME IIFNAME ADDR RESULT [COUNT]: plain alias for check_type.
+check_rpf()
+{
+	check_type "$@"
+}
+
+# Fail (ret=1, return 1) if any of the dynamic fib sets still lists elements.
+check_fib_vrf_sets_empty()
+{
+	local setname=""
+	local lret=0
+
+	# A non-empty set means that we have seen unexpected packets OR
+	# that a fib lookup provided unexpected results.
+	for setname in "fibif4" "fibif4iif" "fibif6" "fibif6iif" \
+		       "fibtype4" "fibtype4iif" "fibtype6" "fibtype6iif";do
+		if ip netns exec "$nsrouter" nft list set inet t "$setname" | grep -q elements;then
+			echo "FAIL: $setname not empty"
+			ip netns exec "$nsrouter" nft list set inet t "$setname"
+			ret=1
+			lret=1
+		fi
+	done
+
+	return $lret
+}
+
+check_fib_vrf_type()
+{
+	local msg="$1"
+
+	local addr
+	# the incoming interface is always veth0.  As it's not linked to a VRF,
+	# the 'tvrf' device should NOT show up anywhere.
+	local ifname="veth0"
+	local lret=0
+
+	# local_veth0, local_veth1
+	for addr in "10.0.1.1" "10.0.2.1"; do
+		check_local fibtype4  "$ifname" "$addr" || lret=1
+		check_type  fibif4    "$ifname" "$addr" "0" || lret=1
+	done
+	for addr in "dead:1::1" "dead:2::1";do
+		check_local fibtype6  "$ifname" "$addr" || lret=1
+		check_type  fibif6    "$ifname" "$addr" "0" || lret=1
+	done
+
+	# when restricted to the incoming interface, 10.0.1.1 should
+	# be 'local', but 10.0.2.1 unicast.
+	check_local fibtype4iif   "$ifname" "10.0.1.1" || lret=1 +	check_unicast fibtype4iif "$ifname" "10.0.2.1" || lret=1 + +	# same for the ipv6 addresses. +	check_local fibtype6iif   "$ifname" "dead:1::1" || lret=1 +	check_unicast fibtype6iif "$ifname" "dead:2::1" || lret=1 + +	# None of these addresses should find a valid route when restricting +	# to the incoming interface (we ask for daddr - 10.0.1.1/2.1 are +	# reachable via 'lo'. +	for addr in "10.0.1.1" "10.0.2.1" "10.9.9.1" "10.9.9.2";do +		check_type fibif4iif "$ifname" "$addr" "0" || lret=1 +	done + +	# expect default route (veth1), dummy0 is part of VRF but iif isn't. +	for addr in "10.9.9.1" "10.9.9.2";do +		check_unicast fibtype4    "$ifname" "$addr" || lret=1 +		check_unicast fibtype4iif "$ifname" "$addr" || lret=1 +		check_type fibif4 "$ifname" "$addr" "veth1" || lret=1 +	done +	for addr in "dead:9::1" "dead:9::2";do +		check_unicast fibtype6    "$ifname" "$addr" || lret=1 +		check_unicast fibtype6iif "$ifname" "$addr" || lret=1 +		check_type fibif6 "$ifname" "$addr" "veth1" || lret=1 +	done + +	# same for the IPv6 equivalent addresses. 
+	for addr in "dead:1::1" "dead:2::1" "dead:9::1" "dead:9::2";do +		check_type  fibif6iif "$ifname" "$addr" "0" || lret=1 +	done + +	check_unicast fibtype4    "$ifname" "10.0.2.99" || lret=1 +	check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1 +	check_unicast fibtype6    "$ifname" "dead:2::99" || lret=1 +	check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1 + +	check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1 +	check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1 +	check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1 +	check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1 + +	check_rpf  fibif4    "$ifname" "10.0.1.99" "veth0" 5 || lret=1 +	check_rpf  fibif4iif "$ifname" "10.0.1.99" "veth0" 5 || lret=1 +	check_rpf  fibif6    "$ifname" "dead:1::99" "veth0" 5 || lret=1 +	check_rpf  fibif6iif "$ifname" "dead:1::99" "veth0" 5 || lret=1 + +	check_fib_vrf_sets_empty || lret=1 + +	if [ $lret -eq 0 ];then +		echo "PASS: $msg" +	else +		echo "FAIL: $msg" +		ret=1 +	fi +} + +check_fib_veth_vrf_type() +{ +	local msg="$1" + +	local addr +	local ifname +	local setname +	local lret=0 + +	# as veth0 is now part of tvrf interface, packets will be seen +	# twice, once with iif veth0, then with iif tvrf. + +	for ifname in "veth0" "tvrf"; do +		for addr in "10.0.1.1" "10.9.9.1"; do +			check_local fibtype4  "$ifname" "$addr" || lret=1 +			# addr local, but nft_fib doesn't return routes with RTN_LOCAL. +			check_type  fibif4    "$ifname" "$addr" 0 || lret=1 +			check_type  fibif4iif "$ifname" "$addr" 0 || lret=1 +		done + +		for addr in "dead:1::1" "dead:9::1"; do +			check_local fibtype6 "$ifname" "$addr" || lret=1 +			# same, address is local but no route is returned for lo. 
+			check_type  fibif6    "$ifname" "$addr" 0 || lret=1 +			check_type  fibif6iif "$ifname" "$addr" 0 || lret=1 +		done + +		for t in fibtype4 fibtype4iif; do +			check_unicast "$t" "$ifname" 10.9.9.2 || lret=1 +		done +		for t in fibtype6 fibtype6iif; do +			check_unicast "$t" "$ifname" dead:9::2 || lret=1 +		done + +		check_unicast fibtype4iif "$ifname" "10.9.9.1" || lret=1 +		check_unicast fibtype6iif "$ifname" "dead:9::1" || lret=1 + +		check_unicast fibtype4    "$ifname" "10.0.2.99" || lret=1 +		check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1 + +		check_unicast fibtype6    "$ifname" "dead:2::99" || lret=1 +		check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1 + +		check_type fibif4    "$ifname"  "10.0.2.99" "veth1" || lret=1 +		check_type fibif6    "$ifname" "dead:2::99" "veth1" || lret=1 +		check_type fibif4    "$ifname"   "10.9.9.2" "dummy0" || lret=1 +		check_type fibif6    "$ifname"  "dead:9::2" "dummy0" || lret=1 + +		# restricted to iif -- MUST NOT provide result, its != $ifname. +		check_type fibif4iif "$ifname"  "10.0.2.99" 0 || lret=1 +		check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1 + +		check_rpf  fibif4 "$ifname" "10.0.1.99" "veth0" 4 || lret=1 +		check_rpf  fibif6 "$ifname" "dead:1::99" "veth0" 4 || lret=1 +		check_rpf  fibif4iif "$ifname" "10.0.1.99" "$ifname" 4 || lret=1 +		check_rpf  fibif6iif "$ifname" "dead:1::99" "$ifname" 4 || lret=1 +	done + +	check_local fibtype4iif "veth0" "10.0.1.1" || lret=1 +	check_local fibtype6iif "veth0" "dead:1::1" || lret=1 + +	check_unicast fibtype4iif "tvrf" "10.0.1.1" || lret=1 +	check_unicast fibtype6iif "tvrf" "dead:1::1" || lret=1 + +	# 10.9.9.2 should not provide a result for iif veth, but +	# should when iif is tvrf. +	# This is because its reachable via dummy0 which is part of +	# tvrf.  iif veth0 MUST conceal the dummy0 result (i.e. return oif 0). 
+	check_type fibif4iif "veth0" "10.9.9.2" 0 || lret=1 +	check_type fibif6iif "veth0"  "dead:9::2" 0 || lret=1 + +	check_type fibif4iif "tvrf" "10.9.9.2" "tvrf" || lret=1 +	check_type fibif6iif "tvrf" "dead:9::2" "tvrf" || lret=1 + +	check_fib_vrf_sets_empty || lret=1 + +	if [ $lret -eq 0 ];then +		echo "PASS: $msg" +	else +		echo "FAIL: $msg" +		ret=1 +	fi +} + +# Extends nsrouter config by adding dummy0+vrf. +# +#  10.0.1.99     10.0.1.1           10.0.2.1         10.0.2.99 +# dead:1::99    dead:1::1          dead:2::1        dead:2::99 +# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2 +#                         [dummy0] +#                         10.9.9.1 +#                        dead:9::1 +#                          [tvrf] +test_fib_vrf() +{ +	local cntname="" + +	if ! test_fib_vrf_dev_add_dummy; then +		[ $ret -eq 0 ] && ret=$ksft_skip +		return +	fi + +	ip -net "$nsrouter" addr add "10.9.9.1"/24 dev dummy0 +	ip -net "$nsrouter" addr add "dead:9::1"/64 dev dummy0 nodad + +	ip -net "$nsrouter" route add default via 10.0.2.99 +	ip -net "$nsrouter" route add default via dead:2::99 + +	load_ruleset_vrf || return + +	# no echo reply for these addresses: The dummy interface is part of tvrf, +	# but veth0 (incoming interface) isn't linked to it. +	test_ping_unreachable "10.9.9.1" "dead:9::1" & +	test_ping_unreachable "10.9.9.2" "dead:9::2" & + +	# expect replies from these. +	test_ping "10.0.1.1" "dead:1::1" +	test_ping "10.0.2.1" "dead:2::1" +	test_ping "10.0.2.99" "dead:2::99" + +	wait + +	check_fib_vrf_type "fib expression address types match (iif not in vrf)" + +	# second round: this time, make veth0 (rx interface) part of the vrf. +	# 10.9.9.1 / dead:9::1 become reachable from ns1, while ns2 +	# becomes unreachable. 
+	ip -net "$nsrouter" link set veth0 master tvrf +	ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +	# this reload should not be needed, but in case +	# there is some error (missing or unexpected entry) this will prevent them +	# from leaking into round 2. +	load_ruleset_vrf || return + +	test_ping "10.0.1.1" "dead:1::1" +	test_ping "10.9.9.1" "dead:9::1" + +	# ns2 should no longer be reachable (veth1 not in vrf) +	test_ping_unreachable "10.0.2.99" "dead:2::99" & + +	# vrf via dummy0, but host doesn't exist +	test_ping_unreachable "10.9.9.2" "dead:9::2" & + +	wait + +	check_fib_veth_vrf_type "fib expression address types match (iif in vrf)" +} +  ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null  ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null  ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null  test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 +check_drops  test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 +check_drops + +[ $ret -eq 0 ] && echo "PASS: fib expression did not cause unwanted packet drops" + +load_input_ruleset "$ns1" + +test_ping 127.0.0.1 ::1 +check_drops + +test_ping 10.0.1.99 dead:1::99 +check_drops -echo "PASS: fib expression did not cause unwanted packet drops" +[ $ret -eq 0 ] && echo "PASS: fib expression did not discard loopback packets"  load_input_ruleset "$ns1" @@ -234,7 +801,7 @@ ip -net "$nsrouter" addr del dead:2::1/64 dev veth0  # ... pbr ruleset for the router, check iif+oif.  if ! load_pbr_ruleset "$nsrouter";then  	echo "SKIP: Could not load fib forward ruleset" -	exit $ksft_skip +	[ "$ret" -eq 0 ] && ret=$ksft_skip  fi  ip -net "$nsrouter" rule add from all table 128 @@ -245,11 +812,36 @@ ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1  # drop main ipv4 table  ip -net "$nsrouter" -4 rule delete table main -if ! 
test_ping 10.0.2.99 dead:2::99;then -	ip -net "$nsrouter" nft list ruleset -	echo "FAIL: fib mismatch in pbr setup" -	exit 1 +if test_ping 10.0.2.99 dead:2::99;then +	echo "PASS: fib expression forward check with policy based routing" +else +	echo "FAIL: fib expression forward check with policy based routing" +	ret=1  fi -echo "PASS: fib expression forward check with policy based routing" -exit 0 +test_fib_type "policy routing" +ip netns exec "$nsrouter" nft delete table ip filter +ip netns exec "$nsrouter" nft delete table ip6 filter + +# Un-do policy routing changes +ip -net "$nsrouter" rule del from all table 128 +ip -net "$nsrouter" rule del from all iif veth0 table 129 + +ip -net "$nsrouter" route del table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route del table 129 to 10.0.2.0/24 dev veth1 + +ip -net "$ns1" -4 route del default +ip -net "$ns1" -6 route del default + +ip -net "$ns1" -4 route add default via 10.0.1.1 +ip -net "$ns1" -6 route add default via dead:1::1 + +ip -net "$nsrouter" -4 rule add from all table main priority 32766 + +test_fib_type "default table" +ip netns exec "$nsrouter" nft delete table ip filter +ip netns exec "$nsrouter" nft delete table ip6 filter + +test_fib_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_interface_stress.sh b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh new file mode 100755 index 000000000000..11d82d11495e --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh @@ -0,0 +1,151 @@ +#!/bin/bash -e +# +# SPDX-License-Identifier: GPL-2.0 +# +# Torture nftables' netdevice notifier callbacks and related code by frequent +# renaming of interfaces which netdev-family chains and flowtables hook into. + +source lib.sh + +checktool "nft --version" "run test without nft tool" +checktool "iperf3 --version" "run test without iperf3 tool" + +# how many seconds to torture the kernel? 
+# default to 80% of max run time but don't exceed 48s +TEST_RUNTIME=$((${kselftest_timeout:-60} * 8 / 10)) +[[ $TEST_RUNTIME -gt 48 ]] && TEST_RUNTIME=48 + +trap "cleanup_all_ns" EXIT + +setup_ns nsc nsr nss + +ip -net $nsc link add cr0 type veth peer name rc0 netns $nsr +ip -net $nsc addr add 10.0.0.1/24 dev cr0 +ip -net $nsc link set cr0 up +ip -net $nsc route add default via 10.0.0.2 + +ip -net $nss link add sr0 type veth peer name rs0 netns $nsr +ip -net $nss addr add 10.1.0.1/24 dev sr0 +ip -net $nss link set sr0 up +ip -net $nss route add default via 10.1.0.2 + +ip -net $nsr addr add 10.0.0.2/24 dev rc0 +ip -net $nsr link set rc0 up +ip -net $nsr addr add 10.1.0.2/24 dev rs0 +ip -net $nsr link set rs0 up +ip netns exec $nsr sysctl -q net.ipv4.ip_forward=1 +ip netns exec $nsr sysctl -q net.ipv4.conf.all.forwarding=1 + +{ +	echo "table netdev t {" +	for ((i = 0; i < 10; i++)); do +		cat <<-EOF +		chain chain_rc$i { +			type filter hook ingress device rc$i priority 0 +			counter +		} +		chain chain_rs$i { +			type filter hook ingress device rs$i priority 0 +			counter +		} +		EOF +	done +	echo "}" +	echo "table ip t {" +	for ((i = 0; i < 10; i++)); do +		cat <<-EOF +		flowtable ft_${i} { +			hook ingress priority 0 +			devices = { rc$i, rs$i } +		} +		EOF +	done +	echo "chain c {" +	echo "type filter hook forward priority 0" +	for ((i = 0; i < 10; i++)); do +		echo -n "iifname rc$i oifname rs$i " +		echo    "ip protocol tcp counter flow add @ft_${i}" +	done +	echo "counter" +	echo "}" +	echo "}" +} | ip netns exec $nsr nft -f - || { +	echo "SKIP: Could not load nft ruleset" +	exit $ksft_skip +} + +for ((o=0, n=1; ; o=n, n++, n %= 10)); do +	ip -net $nsr link set rc$o name rc$n +	ip -net $nsr link set rs$o name rs$n +done & +rename_loop_pid=$! + +while true; do ip netns exec $nsr nft list ruleset >/dev/null 2>&1; done & +nft_list_pid=$! + +ip netns exec $nsr nft monitor >/dev/null & +nft_monitor_pid=$! 
+
+ip netns exec $nss iperf3 --server --daemon -1
+summary_expr='s,^\[SUM\] .* \([0-9\.]\+\) Kbits/sec .* receiver,\1,p'
+rate=$(ip netns exec $nsc iperf3 \
+	--format k -c 10.1.0.1 --time $TEST_RUNTIME \
+	--length 56 --parallel 10 -i 0 | sed -n "$summary_expr")
+
+kill $nft_list_pid
+kill $nft_monitor_pid
+kill $rename_loop_pid
+wait
+
+# This script runs under 'bash -e': a failing nft as a plain command would
+# abort the script before a following '$?' test is reached.  Testing the
+# command directly in the 'if' keeps the SKIP branch reachable.
+if ! ip netns exec $nsr nft -f - <<EOF
+table ip t {
+	flowtable ft_wild {
+		hook ingress priority 0
+		devices = { wild* }
+	}
+}
+EOF
+then
+	echo "SKIP wildcard tests: not supported by host's nft?"
+else
+	for ((i = 0; i < 100; i++)); do
+		ip -net $nsr link add wild$i type dummy &
+	done
+	wait
+	for ((i = 80; i < 100; i++)); do
+		ip -net $nsr link del wild$i &
+	done
+	for ((i = 0; i < 80; i++)); do
+		ip -net $nsr link del wild$i &
+	done
+	wait
+	for ((i = 0; i < 100; i += 10)); do
+		(
+		for ((j = 0; j < 10; j++)); do
+			ip -net $nsr link add wild$((i + j)) type dummy
+		done
+		for ((j = 0; j < 10; j++)); do
+			ip -net $nsr link del wild$((i + j))
+		done
+		) &
+	done
+	wait
+fi
+
+[[ $(</proc/sys/kernel/tainted) -eq 0 ]] || {
+	echo "FAIL: Kernel is tainted!"
+	exit $ksft_fail
+}
+
+[[ $rate -gt 0 ]] || {
+	echo "FAIL: Zero throughput in iperf3"
+	exit $ksft_fail
+}
+
+[[ -f /sys/kernel/debug/kmemleak && \
+   -n $(</sys/kernel/debug/kmemleak) ]] && {
+	echo "FAIL: non-empty kmemleak report"
+	exit $ksft_fail
+}
+
+exit $ksft_pass
|
