path: root/drivers/net/ethernet/intel/i40e/i40e_xsk.h
author    Magnus Karlsson <magnus.karlsson@intel.com>   2020-11-16 12:12:47 +0100
committer Daniel Borkmann <daniel@iogearbox.net>        2020-11-17 22:07:40 +0100
commit    3106c580fb7cf26691c1ce3aba2223f3ae56d846 (patch)
tree      fd8b06d91a34b79b6f9b456ed2d6509033f39e1b /drivers/net/ethernet/intel/i40e/i40e_xsk.h
parent    9349eb3a9d2ae0151510dd98b6640dfaeebee9cc (diff)
i40e: Use batched xsk Tx interfaces to increase performance
Use the new batched xsk interfaces for the Tx path in the i40e driver to
improve performance. On my machine, this yields a throughput increase of 4%
for the l2fwd sample app in xdpsock. Looking at the Tx part alone, this patch
set increases throughput by over 20%.

Note that I had to explicitly unroll the inner loop with a pragma to reach
this performance level. The pragma is honored by both clang and gcc and
should be ignored by versions that do not support it. Using the
-funroll-loops compiler command-line switch on the source file instead
resulted in loop unrolling at a higher level, which led to a performance
decrease rather than an increase.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/1605525167-14450-6-git-send-email-magnus.karlsson@gmail.com
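For orientation, here is a rough sketch of what such a batched Tx path could
look like. It is not the literal patch code: the helper names
(i40e_xmit_pkt_batch, i40e_xmit_pkt, i40e_update_tx_tail), the xsk_descs ring
field, and the exact signature of xsk_tx_peek_release_desc_batch() are
assumptions made for illustration. The idea is to pull a batch of descriptors
from the AF_XDP Tx ring in one call, fill the hardware ring in chunks of
PKTS_PER_BATCH with the unrolled inner loop, and write the tail register once
at the end.

/* Hedged sketch only; helper names and struct fields are assumptions. */

/* Program PKTS_PER_BATCH Tx descriptors; the unrolled loop lets the
 * per-descriptor DMA-address lookups overlap.
 */
static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring,
				struct xdp_desc *desc)
{
	u32 i;

	loop_unrolled_for (i = 0; i < PKTS_PER_BATCH; i++) {
		/* translate desc[i].addr, write one HW Tx descriptor */
	}
}

static bool i40e_xmit_zc_sketch(struct i40e_ring *xdp_ring, unsigned int budget)
{
	struct xdp_desc *descs = xdp_ring->xsk_descs;	/* assumed field */
	u32 nb_pkts, nb_batched, i;

	/* Grab up to 'budget' descriptors from the AF_XDP Tx ring in a
	 * single call (batched interface from the parent commit; the
	 * signature here is assumed).
	 */
	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs,
						 budget);
	if (!nb_pkts)
		return true;

	/* Whole chunks of PKTS_PER_BATCH go through the unrolled loop... */
	nb_batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
	for (i = 0; i < nb_batched; i += PKTS_PER_BATCH)
		i40e_xmit_pkt_batch(xdp_ring, &descs[i]);

	/* ...and any remainder is sent one packet at a time. */
	for (; i < nb_pkts; i++)
		i40e_xmit_pkt(xdp_ring, &descs[i]);	/* hypothetical */

	/* One tail-register write covers the whole batch. */
	i40e_update_tx_tail(xdp_ring);			/* hypothetical */

	return nb_pkts < budget;
}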
Diffstat (limited to 'drivers/net/ethernet/intel/i40e/i40e_xsk.h')
-rw-r--r--  drivers/net/ethernet/intel/i40e/i40e_xsk.h | 16
1 file changed, 16 insertions, 0 deletions
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
index 7adfd8539247..ea88f4597a07 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
@@ -4,6 +4,22 @@
#ifndef _I40E_XSK_H_
#define _I40E_XSK_H_
+/* This value should match the pragma in the loop_unrolled_for
+ * macro. Why 4? It is strictly empirical. It seems to be a good
+ * compromise between the advantage of having simultaneous outstanding
+ * reads to the DMA array that can hide each other's latency and the
+ * disadvantage of having a larger code path.
+ */
+#define PKTS_PER_BATCH 4
+
+#ifdef __clang__
+#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
+#elif __GNUC__ >= 8
+#define loop_unrolled_for _Pragma("GCC unroll 4") for
+#else
+#define loop_unrolled_for for
+#endif
+
struct i40e_vsi;
struct xsk_buff_pool;
struct zero_copy_allocator;
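
As a standalone illustration (outside the kernel, and not part of this
patch), the same macro pattern can be exercised in plain user-space C: gcc 8
or later and clang honor their respective unroll pragmas, while other
compilers silently fall back to an ordinary for loop. The file name and the
demo loop below are made up for the example.

/* build with: gcc -O2 -o unroll_demo unroll_demo.c   (or clang -O2 ...) */
#include <stdio.h>

#define PKTS_PER_BATCH 4

#ifdef __clang__
#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
#elif __GNUC__ >= 8
#define loop_unrolled_for _Pragma("GCC unroll 4") for
#else
#define loop_unrolled_for for
#endif

int main(void)
{
	int sum = 0, i;

	/* The unroll count in the pragma must stay in sync with
	 * PKTS_PER_BATCH, exactly as the comment in i40e_xsk.h notes.
	 */
	loop_unrolled_for (i = 0; i < PKTS_PER_BATCH; i++)
		sum += i;

	printf("sum = %d\n", sum);
	return 0;
}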