author		Patrick McHardy <kaber@trash.net>	2014-03-28 10:19:47 +0000
committer	Pablo Neira Ayuso <pablo@gnumonks.org>	2014-04-02 21:32:57 +0200
commit		c50b960ccc5981627628302701e93e6aceccdb1c
tree		3621dae1299222f46a2096694cb8c936abc02f5e /net/netfilter/nft_hash.c
parent		fe92ca45a170cb8d09c163db23d46634110b3c2f
netfilter: nf_tables: implement proper set selection
The current set selection simply chooses the first set type that provides the requested features, which always results in the rbtree being chosen by virtue of being the first set in the list.

What we actually want to do is choose the implementation that can provide the requested features and is optimal from either a performance or memory perspective, depending on the characteristics of the elements and the preferences specified by the user.

The elements are not known when creating a set. Even if we would provide them for anonymous (literal) sets, we'd still have standalone sets where the elements are not known in advance. We therefore need an abstract description of the data characteristics.

The kernel already knows the size of the key; this patch starts by introducing a nested set description which so far contains only the maximum amount of elements. Based on this, the set implementations are changed to provide an estimate of the required amount of memory and the lookup complexity class.

The set ops have a new callback ->estimate() that is invoked during set selection. It receives a structure containing the attributes known to the kernel and is supposed to populate a struct nft_set_estimate with the complexity class and, in case the size is known, the complete amount of memory required, or the amount of memory required per element otherwise.

Based on the policy specified by the user (performance/memory, defaulting to performance), the kernel will then select the best suited implementation.

Even if a set implementation would allow adding more than the specified maximum amount of elements, the maximum is enforced, since an implementation selected on the basis of that maximum might not be able to hold more.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
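For reference, the selection interface this message describes looks roughly as follows. This is a sketch reconstructed from the description and the diff below; the field layout is approximate, not a verbatim copy of include/net/netfilter/nf_tables.h:

/* Abstract description of the data characteristics, supplied by the
 * user at set creation time; so far it only carries the maximum
 * number of elements (0 if unknown). */
struct nft_set_desc {
	unsigned int		size;
};

/* Lookup complexity classes reported by the implementations. */
enum nft_set_class {
	NFT_SET_CLASS_O_1,	/* constant, e.g. hash lookup */
	NFT_SET_CLASS_O_LOG_N,	/* logarithmic, e.g. rbtree lookup */
	NFT_SET_CLASS_O_N,	/* linear */
};

/* Populated by ->estimate(): the complete memory required if
 * desc->size is known, the memory required per element otherwise. */
struct nft_set_estimate {
	unsigned int		size;
	enum nft_set_class	class;
};

/* New nft_set_ops callback, invoked during set selection; returns
 * false if the implementation cannot produce an estimate for the
 * given description. Based on the user's policy (performance by
 * default, or memory), the kernel then picks the implementation
 * with the best complexity class or the smallest estimated size. */
bool (*estimate)(const struct nft_set_desc *desc, u32 features,
		 struct nft_set_estimate *est);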
Diffstat (limited to 'net/netfilter/nft_hash.c')
-rw-r--r--	net/netfilter/nft_hash.c	45
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3b1ad876d6b0..01884b315c4a 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/jhash.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
@@ -19,7 +20,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
-#define NFT_HASH_MIN_SIZE 4
+#define NFT_HASH_MIN_SIZE 4UL
struct nft_hash {
struct nft_hash_table __rcu *tbl;
@@ -82,6 +83,11 @@ static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
kfree(tbl);
}
+static unsigned int nft_hash_tbl_size(unsigned int nelem)
+{
+ return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE);
+}
+
static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
{
struct nft_hash_table *tbl;
@@ -335,17 +341,23 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
}
static int nft_hash_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
const struct nlattr * const tb[])
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_table *tbl;
+ unsigned int size;
if (unlikely(!nft_hash_rnd_initted)) {
get_random_bytes(&nft_hash_rnd, 4);
nft_hash_rnd_initted = true;
}
- tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
+ size = NFT_HASH_MIN_SIZE;
+ if (desc->size)
+ size = nft_hash_tbl_size(desc->size);
+
+ tbl = nft_hash_tbl_alloc(size);
if (tbl == NULL)
return -ENOMEM;
RCU_INIT_POINTER(priv->tbl, tbl);
@@ -369,8 +381,37 @@ static void nft_hash_destroy(const struct nft_set *set)
kfree(tbl);
}
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int esize;
+
+ esize = sizeof(struct nft_hash_elem);
+ if (features & NFT_SET_MAP)
+ esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]);
+
+ if (desc->size) {
+ est->size = sizeof(struct nft_hash) +
+ nft_hash_tbl_size(desc->size) *
+ sizeof(struct nft_hash_elem *) +
+ desc->size * esize;
+ } else {
+ /* Resizing happens when the load drops below 30% or goes
+ * above 75%. The average of 52.5% load (approximated by 50%)
+ * is used for the size estimation of the hash buckets,
+ * meaning we calculate two buckets per element.
+ */
+ est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+ }
+
+ est->class = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
static struct nft_set_ops nft_hash_ops __read_mostly = {
.privsize = nft_hash_privsize,
+ .estimate = nft_hash_estimate,
.init = nft_hash_init,
.destroy = nft_hash_destroy,
.get = nft_hash_get,
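As a worked example of the sizing logic above, here is a minimal standalone sketch of the arithmetic in nft_hash_tbl_size() and the two branches of nft_hash_estimate(). roundup_p2() is a userspace stand-in for the kernel's roundup_pow_of_two(), and the element size is an illustrative placeholder rather than the real sizeof(struct nft_hash_elem):

#include <stdio.h>

#define NFT_HASH_MIN_SIZE 4UL

/* Userspace stand-in for the kernel's roundup_pow_of_two(). */
static unsigned long roundup_p2(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* Mirrors nft_hash_tbl_size(): size the table for a load factor of
 * at most 75%, i.e. nelem * 4 / 3 buckets rounded up to a power of
 * two, but never fewer than NFT_HASH_MIN_SIZE. */
static unsigned long tbl_size(unsigned long nelem)
{
	unsigned long size = roundup_p2(nelem * 4 / 3);

	return size > NFT_HASH_MIN_SIZE ? size : NFT_HASH_MIN_SIZE;
}

int main(void)
{
	unsigned long esize = 32;		/* placeholder element size */
	unsigned long psize = sizeof(void *);	/* bucket pointer size */
	unsigned long nelem = 1000;

	/* 1000 * 4 / 3 = 1333, rounded up to 2048 buckets. */
	printf("buckets: %lu\n", tbl_size(nelem));

	/* Known maximum: bucket pointers plus elements, as in the
	 * desc->size branch (sizeof(struct nft_hash) omitted here). */
	printf("known size: %lu bytes\n",
	       tbl_size(nelem) * psize + nelem * esize);

	/* Unknown maximum: per-element cost at the ~50% average load
	 * the comment above assumes, i.e. two bucket pointers per
	 * element. */
	printf("per element: %lu bytes\n", esize + 2 * psize);
	return 0;
}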