summaryrefslogtreecommitdiff
path: root/net/openvswitch/flow_table.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 20:02:57 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 20:02:57 -0800
commit386403a115f95997c2715691226e11a7b5cffcfd (patch)
treea685df70bd3d5b295683713818ddf0752c3d75b6 /net/openvswitch/flow_table.c
parent642356cb5f4a8c82b5ca5ebac288c327d10df236 (diff)
parent622dc5ad8052f4f0c6b7a12787696a5caa3c6a58 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller: "Another merge window, another pull full of stuff: 1) Support alternative names for network devices, from Jiri Pirko. 2) Introduce per-netns netdev notifiers, also from Jiri Pirko. 3) Support MSG_PEEK in vsock/virtio, from Matias Ezequiel Vara Larsen. 4) Allow compiling out the TLS TOE code, from Jakub Kicinski. 5) Add several new tracepoints to the kTLS code, also from Jakub. 6) Support set channels ethtool callback in ena driver, from Sameeh Jubran. 7) New SCTP events SCTP_ADDR_ADDED, SCTP_ADDR_REMOVED, SCTP_ADDR_MADE_PRIM, and SCTP_SEND_FAILED_EVENT. From Xin Long. 8) Add XDP support to mvneta driver, from Lorenzo Bianconi. 9) Lots of netfilter hw offload fixes, cleanups and enhancements, from Pablo Neira Ayuso. 10) PTP support for aquantia chips, from Egor Pomozov. 11) Add UDP segmentation offload support to igb, ixgbe, and i40e. From Josh Hunt. 12) Add smart nagle to tipc, from Jon Maloy. 13) Support L2 field rewrite by TC offloads in bnxt_en, from Venkat Duvvuru. 14) Add a flow mask cache to OVS, from Tonghao Zhang. 15) Add XDP support to ice driver, from Maciej Fijalkowski. 16) Add AF_XDP support to ice driver, from Krzysztof Kazimierczak. 17) Support UDP GSO offload in atlantic driver, from Igor Russkikh. 18) Support it in stmmac driver too, from Jose Abreu. 19) Support TIPC encryption and auth, from Tuong Lien. 20) Introduce BPF trampolines, from Alexei Starovoitov. 21) Make page_pool API more numa friendly, from Saeed Mahameed. 22) Introduce route hints to ipv4 and ipv6, from Paolo Abeni. 
23) Add UDP segmentation offload to cxgb4, Rahul Lakkireddy" * git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1857 commits) libbpf: Fix usage of u32 in userspace code mm: Implement no-MMU variant of vmalloc_user_node_flags slip: Fix use-after-free Read in slip_open net: dsa: sja1105: fix sja1105_parse_rgmii_delays() macvlan: schedule bc_work even if error enetc: add support Credit Based Shaper(CBS) for hardware offload net: phy: add helpers phy_(un)lock_mdio_bus mdio_bus: don't use managed reset-controller ax88179_178a: add ethtool_op_get_ts_info() mlxsw: spectrum_router: Fix use of uninitialized adjacency index mlxsw: spectrum_router: After underlay moves, demote conflicting tunnels bpf: Simplify __bpf_arch_text_poke poke type handling bpf: Introduce BPF_TRACE_x helper for the tracing tests bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT bpf, testing: Add various tail call test cases bpf, x86: Emit patchable direct jump as tail call bpf: Constant map key tracking for prog array pokes bpf: Add poke dependency tracking for prog array maps bpf: Add initial poke descriptor table for jit images bpf: Move owner type, jited info into array auxiliary data ...
Diffstat (limited to 'net/openvswitch/flow_table.c')
-rw-r--r--net/openvswitch/flow_table.c381
1 files changed, 304 insertions, 77 deletions
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index cf3582c5ed70..5904e93e5765 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -34,8 +34,13 @@
#include <net/ndisc.h>
#define TBL_MIN_BUCKETS 1024
+#define MASK_ARRAY_SIZE_MIN 16
#define REHASH_INTERVAL (10 * 60 * HZ)
+#define MC_HASH_SHIFT 8
+#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT)
+#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
+
static struct kmem_cache *flow_cache;
struct kmem_cache *flow_stats_cache __read_mostly;
@@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size)
return ti;
}
+/* Allocate a zero-filled mask array with room for 'size' mask pointers.
+ * 'size' is raised to MASK_ARRAY_SIZE_MIN when it is smaller than that.
+ *
+ * Returns the new, empty array (count == 0, max == effective size), or NULL
+ * if the allocation fails.
+ */
+static struct mask_array *tbl_mask_array_alloc(int size)
+{
+	struct mask_array *ma;
+	size_t bytes;
+
+	size = max(MASK_ARRAY_SIZE_MIN, size);
+
+	/* Header plus one pointer slot per mask. */
+	bytes = sizeof(*ma) + sizeof(struct sw_flow_mask *) * size;
+	ma = kzalloc(bytes, GFP_KERNEL);
+	if (ma) {
+		ma->count = 0;
+		ma->max = size;
+	}
+
+	return ma;
+}
+
+/* Replace tbl's mask array with a newly allocated one of 'size' slots
+ * (tbl_mask_array_alloc() raises 'size' to at least MASK_ARRAY_SIZE_MIN).
+ *
+ * Every non-NULL mask pointer of the old array is copied, in order, to the
+ * front of the new array, so the new array has no holes. The caller must
+ * pass a 'size' large enough to hold all masks currently stored.
+ *
+ * Returns 0 on success, or -ENOMEM if the new array cannot be allocated; in
+ * that case tbl->mask_array is left untouched.
+ */
+static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
+{
+	struct mask_array *old;
+	struct mask_array *new;
+
+	new = tbl_mask_array_alloc(size);
+	if (!new)
+		return -ENOMEM;
+
+	old = ovsl_dereference(tbl->mask_array);
+	if (old) {
+		int i;
+
+		/* Compact the populated slots into the new array. */
+		for (i = 0; i < old->max; i++) {
+			if (ovsl_dereference(old->masks[i]))
+				new->masks[new->count++] = old->masks[i];
+		}
+	}
+
+	/* Publish the new array before queueing the old one for freeing. */
+	rcu_assign_pointer(tbl->mask_array, new);
+
+	/* 'old' may be NULL (see the check above); only queue a free when
+	 * there really is a previous array.
+	 */
+	if (old)
+		kfree_rcu(old, rcu);
+
+	return 0;
+}
+
+/* Append 'new' to tbl's mask array. When the array is already full it is
+ * first replaced by one that is MASK_ARRAY_SIZE_MIN slots larger.
+ *
+ * Returns 0 on success, or the error from tbl_mask_array_realloc() if the
+ * array could not be grown (nothing is added in that case).
+ *
+ * NOTE(review): the ovsl_dereference() accesses suggest the caller holds the
+ * OVS lock - confirm against callers.
+ */
+static int tbl_mask_array_add_mask(struct flow_table *tbl,
+ struct sw_flow_mask *new)
+{
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int err, ma_count = READ_ONCE(ma->count);
+
+ if (ma_count >= ma->max) {
+ err = tbl_mask_array_realloc(tbl, ma->max +
+ MASK_ARRAY_SIZE_MIN);
+ if (err)
+ return err;
+
+ /* realloc installed a different array; reload the pointer. */
+ ma = ovsl_dereference(tbl->mask_array);
+ }
+
+ /* The slot at index 'count' is expected to be empty. */
+ BUG_ON(ovsl_dereference(ma->masks[ma_count]));
+
+ /* Store the mask pointer first, then bump the count. */
+ rcu_assign_pointer(ma->masks[ma_count], new);
+ WRITE_ONCE(ma->count, ma_count +1);
+
+ return 0;
+}
+
+/* Remove 'mask' from tbl's mask array and hand it to kfree_rcu().
+ *
+ * The vacated slot is refilled with the last populated entry, so the used
+ * slots stay contiguous at the front of the array. Hits BUG() if 'mask' is
+ * not found among the first 'count' entries. Afterwards the array may be
+ * replaced by a smaller one.
+ */
+static void tbl_mask_array_del_mask(struct flow_table *tbl,
+ struct sw_flow_mask *mask)
+{
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int i, ma_count = READ_ONCE(ma->count);
+
+ /* Locate the slot that currently holds 'mask'. */
+ for (i = 0; i < ma_count; i++) {
+ if (mask == ovsl_dereference(ma->masks[i]))
+ goto found;
+ }
+
+ BUG();
+ return;
+
+found:
+ WRITE_ONCE(ma->count, ma_count -1);
+
+ /* Move the last used entry into the freed slot, then clear the tail. */
+ rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]);
+ RCU_INIT_POINTER(ma->masks[ma_count -1], NULL);
+
+ kfree_rcu(mask, rcu);
+
+ /* Shrink the mask array if necessary. 'ma_count' is still the count
+ * from before the removal. The return value is ignored:
+ * tbl_mask_array_realloc() fails before replacing tbl->mask_array, so
+ * a failed shrink simply keeps the current array.
+ */
+ if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
+ ma_count <= (ma->max / 3))
+ tbl_mask_array_realloc(tbl, ma->max / 2);
+}
+
+/* Drop one reference on 'mask'; once no references remain, take it out of
+ * tbl's mask array via tbl_mask_array_del_mask(). A NULL 'mask' is a no-op.
+ *
+ * The ovs-lock is required to protect the mask refcount and the mask list.
+ */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+	if (!mask)
+		return;
+
+	ASSERT_OVSL();
+	BUG_ON(!mask->ref_count);
+
+	if (--mask->ref_count == 0)
+		tbl_mask_array_del_mask(tbl, mask);
+}
+
int ovs_flow_tbl_init(struct flow_table *table)
{
struct table_instance *ti, *ufid_ti;
+ struct mask_array *ma;
- ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
+ MC_HASH_ENTRIES,
+ __alignof__(struct mask_cache_entry));
+ if (!table->mask_cache)
+ return -ENOMEM;
+ ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
+ if (!ma)
+ goto free_mask_cache;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ti)
- return -ENOMEM;
+ goto free_mask_array;
ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ufid_ti)
@@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
rcu_assign_pointer(table->ti, ti);
rcu_assign_pointer(table->ufid_ti, ufid_ti);
- INIT_LIST_HEAD(&table->mask_list);
+ rcu_assign_pointer(table->mask_array, ma);
table->last_rehash = jiffies;
table->count = 0;
table->ufid_count = 0;
@@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table)
free_ti:
__table_instance_destroy(ti);
+free_mask_array:
+ kfree(ma);
+free_mask_cache:
+ free_percpu(table->mask_cache);
return -ENOMEM;
}
@@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
__table_instance_destroy(ti);
}
-static void table_instance_destroy(struct table_instance *ti,
+/* Unlink 'flow' from the flow hash table 'ti' and, when the flow is
+ * identified by a UFID, from 'ufid_ti' as well, then drop the flow's
+ * reference on its mask.
+ *
+ * When 'count' is true the table's flow counters (count, and ufid_count for
+ * UFID flows) are decremented too; pass false to leave them untouched.
+ * The flow itself is not freed here.
+ */
+static void table_instance_flow_free(struct flow_table *table,
+				     struct table_instance *ti,
+				     struct table_instance *ufid_ti,
+				     struct sw_flow *flow,
+				     bool count)
+{
+	bool has_ufid;
+
+	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
+
+	has_ufid = ovs_identifier_is_ufid(&flow->id);
+	if (has_ufid)
+		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
+
+	if (count) {
+		table->count--;
+		if (has_ufid)
+			table->ufid_count--;
+	}
+
+	flow_mask_remove(table, flow->mask);
+}
+
+static void table_instance_destroy(struct flow_table *table,
+ struct table_instance *ti,
struct table_instance *ufid_ti,
bool deferred)
{
@@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti,
struct sw_flow *flow;
struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
- int ver = ti->node_ver;
- int ufid_ver = ufid_ti->node_ver;
- hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
- hlist_del_rcu(&flow->flow_table.node[ver]);
- if (ovs_identifier_is_ufid(&flow->id))
- hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
+ hlist_for_each_entry_safe(flow, n, head,
+ flow_table.node[ti->node_ver]) {
+
+ table_instance_flow_free(table, ti, ufid_ti,
+ flow, false);
ovs_flow_free(flow, deferred);
}
}
@@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
struct table_instance *ti = rcu_dereference_raw(table->ti);
struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
- table_instance_destroy(ti, ufid_ti, false);
+ free_percpu(table->mask_cache);
+ kfree_rcu(rcu_dereference_raw(table->mask_array), rcu);
+ table_instance_destroy(table, ti, ufid_ti, false);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
flow_table->count = 0;
flow_table->ufid_count = 0;
- table_instance_destroy(old_ti, old_ufid_ti, true);
+ table_instance_destroy(flow_table, old_ti, old_ufid_ti, true);
return 0;
err_free_ti:
@@ -370,13 +520,10 @@ err_free_ti:
static u32 flow_hash(const struct sw_flow_key *key,
const struct sw_flow_key_range *range)
{
- int key_start = range->start;
- int key_end = range->end;
- const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
- int hash_u32s = (key_end - key_start) >> 2;
+ const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);
/* Make sure number of hash bytes are multiple of u32. */
- BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+ int hash_u32s = range_n_bytes(range) >> 2;
return jhash2(hash_key, hash_u32s, 0);
}
@@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
const struct sw_flow_key *unmasked,
- const struct sw_flow_mask *mask)
+ const struct sw_flow_mask *mask,
+ u32 *n_mask_hit)
{
struct sw_flow *flow;
struct hlist_head *head;
@@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
ovs_flow_mask_key(&masked_key, unmasked, false, mask);
hash = flow_hash(&masked_key, &mask->range);
head = find_bucket(ti, hash);
+ (*n_mask_hit)++;
+
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
@@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
return NULL;
}
-struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
- const struct sw_flow_key *key,
- u32 *n_mask_hit)
+/* Flow lookup does full lookup on flow table. It starts with
+ * mask from index passed in *index.
+ */
+static struct sw_flow *flow_lookup(struct flow_table *tbl,
+ struct table_instance *ti,
+ struct mask_array *ma,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit,
+ u32 *index)
{
- struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow *flow;
struct sw_flow_mask *mask;
+ int i;
+
+ if (likely(*index < ma->max)) {
+ mask = rcu_dereference_ovsl(ma->masks[*index]);
+ if (mask) {
+ flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+ if (flow)
+ return flow;
+ }
+ }
+
+ for (i = 0; i < ma->max; i++) {
+
+ if (i == *index)
+ continue;
+
+ mask = rcu_dereference_ovsl(ma->masks[i]);
+ if (unlikely(!mask))
+ break;
+
+ flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+ if (flow) { /* Found */
+ *index = i;
+ return flow;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * mask_cache maps a flow to its probable mask. The cache is not tightly
+ * coupled to the mask list: updates to the mask list can leave
+ * inconsistent entries in the mask cache.
+ * This is a per-CPU cache and is divided into MC_HASH_SEGS segments.
+ * In case of a hash collision the entry is hashed into the next segment.
+ */
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 skb_hash,
+ u32 *n_mask_hit)
+{
+ struct mask_array *ma = rcu_dereference(tbl->mask_array);
+ struct table_instance *ti = rcu_dereference(tbl->ti);
+ struct mask_cache_entry *entries, *ce;
struct sw_flow *flow;
+ u32 hash;
+ int seg;
*n_mask_hit = 0;
- list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
- (*n_mask_hit)++;
- flow = masked_flow_lookup(ti, key, mask);
- if (flow) /* Found */
+ if (unlikely(!skb_hash)) {
+ u32 mask_index = 0;
+
+ return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);
+ }
+
+ /* Pre and post recirculation flows usually have the same skb_hash
+ * value. To avoid hash collisions, rehash the 'skb_hash' with
+ * 'recirc_id'. */
+ if (key->recirc_id)
+ skb_hash = jhash_1word(skb_hash, key->recirc_id);
+
+ ce = NULL;
+ hash = skb_hash;
+ entries = this_cpu_ptr(tbl->mask_cache);
+
+ /* Find the cache entry 'ce' to operate on. */
+ for (seg = 0; seg < MC_HASH_SEGS; seg++) {
+ int index = hash & (MC_HASH_ENTRIES - 1);
+ struct mask_cache_entry *e;
+
+ e = &entries[index];
+ if (e->skb_hash == skb_hash) {
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
+ &e->mask_index);
+ if (!flow)
+ e->skb_hash = 0;
return flow;
+ }
+
+ if (!ce || e->skb_hash < ce->skb_hash)
+ ce = e; /* A better replacement cache candidate. */
+
+ hash >>= MC_HASH_SHIFT;
}
- return NULL;
+
+ /* Cache miss, do full lookup. */
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
+ if (flow)
+ ce->skb_hash = skb_hash;
+
+ return flow;
}
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
const struct sw_flow_key *key)
{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
+ u32 index = 0;
- return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+ return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index);
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
const struct sw_flow_match *match)
{
- struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
- struct sw_flow_mask *mask;
- struct sw_flow *flow;
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int i;
/* Always called under ovs-mutex. */
- list_for_each_entry(mask, &tbl->mask_list, list) {
- flow = masked_flow_lookup(ti, match->key, mask);
+ for (i = 0; i < ma->max; i++) {
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ u32 __always_unused n_mask_hit;
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ mask = ovsl_dereference(ma->masks[i]);
+ if (!mask)
+ continue;
+
+ flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
if (flow && ovs_identifier_is_key(&flow->id) &&
- ovs_flow_cmp_unmasked_key(flow, match))
+ ovs_flow_cmp_unmasked_key(flow, match)) {
return flow;
+ }
}
+
return NULL;
}
@@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
- struct sw_flow_mask *mask;
- int num = 0;
-
- list_for_each_entry(mask, &table->mask_list, list)
- num++;
-
- return num;
+ struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+ return READ_ONCE(ma->count);
}
static struct table_instance *table_instance_expand(struct table_instance *ti,
@@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti,
return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
}
-/* Remove 'mask' from the mask list, if it is not needed any more. */
-static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
-{
- if (mask) {
- /* ovs-lock is required to protect mask-refcount and
- * mask list.
- */
- ASSERT_OVSL();
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- kfree_rcu(mask, rcu);
- }
- }
-}
-
/* Must be called with OVS mutex held. */
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
@@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
- hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
- table->count--;
- if (ovs_identifier_is_ufid(&flow->id)) {
- hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
- table->ufid_count--;
- }
-
- /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
- * accessible as long as the RCU read lock is held.
- */
- flow_mask_remove(table, flow->mask);
+ table_instance_flow_free(table, ti, ufid_ti, flow, true);
}
static struct sw_flow_mask *mask_alloc(void)
@@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a,
static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
const struct sw_flow_mask *mask)
{
- struct list_head *ml;
+ struct mask_array *ma;
+ int i;
+
+ ma = ovsl_dereference(tbl->mask_array);
+ for (i = 0; i < ma->max; i++) {
+ struct sw_flow_mask *t;
+ t = ovsl_dereference(ma->masks[i]);
- list_for_each(ml, &tbl->mask_list) {
- struct sw_flow_mask *m;
- m = container_of(ml, struct sw_flow_mask, list);
- if (mask_equal(mask, m))
- return m;
+ if (t && mask_equal(mask, t))
+ return t;
}
return NULL;
@@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
const struct sw_flow_mask *new)
{
struct sw_flow_mask *mask;
+
mask = flow_mask_find(tbl, new);
if (!mask) {
/* Allocate a new mask if none exsits. */
@@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
return -ENOMEM;
mask->key = new->key;
mask->range = new->range;
- list_add_rcu(&mask->list, &tbl->mask_list);
+
+ /* Add mask to mask-list. */
+ if (tbl_mask_array_add_mask(tbl, mask)) {
+ kfree(mask);
+ return -ENOMEM;
+ }
} else {
BUG_ON(!mask->ref_count);
mask->ref_count++;