author     Dragos Tatulea <dtatulea@nvidia.com>      2023-02-21 21:05:07 +0200
committer  Saeed Mahameed <saeedm@nvidia.com>        2023-03-28 13:43:59 -0700
commit     cd640b050368d5be6bccf1edb51b1e4c553555e6 (patch)
tree       4911c91ea21916632daf68dd26846b8ed72771b3 /drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
parent     4ba2b4988c98ce9b56b77a1610c3a7b70ee30b57 (diff)
net/mlx5e: RX, Break the wqe bulk refill in smaller chunks
To avoid overflowing the page pool's cache, don't release the
whole bulk, which is usually larger than the cache refill size.
Instead, group release+alloc into cache refill units that
allow releasing to the cache and then allocating from the cache.

A refill_unit variable is added as an iteration unit over the
wqe_bulk when doing release+alloc.
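
For illustration, a minimal standalone sketch of this release+alloc
chunking pattern is shown below. It is not the driver code (the real
implementation, mlx5e_refill_rx_wqes(), is in the diff further down);
the release_range()/alloc_range() stubs and the hard-coded 64-entry
chunk (mirroring page_pool's PP_ALLOC_CACHE_REFILL) are illustrative
assumptions only:

/* Standalone sketch: refill a bulk of RX descriptors in chunks no larger
 * than the page pool cache refill size, so page releases stay in the cache
 * and the allocations that follow can be served from it.
 */
#include <stdio.h>

#define CACHE_REFILL 64		/* stands in for page_pool's PP_ALLOC_CACHE_REFILL */

/* Stubs standing in for the driver's free/alloc helpers. */
static void release_range(int start, int count)
{
	printf("release [%d, %d)\n", start, start + count);
}

static int alloc_range(int start, int count)
{
	printf("alloc   [%d, %d)\n", start, start + count);
	return count;		/* pretend every allocation succeeds */
}

/* Refill 'bulk' entries starting at 'head', one cache-sized chunk at a time. */
static int refill_in_chunks(int head, int bulk)
{
	int remaining = bulk;
	int done = 0;

	do {
		int chunk = remaining < CACHE_REFILL ? remaining : CACHE_REFILL;
		int allocated;

		release_range(head + done, chunk);
		allocated = alloc_range(head + done, chunk);
		done += allocated;
		if (allocated != chunk)
			break;	/* allocation failure: stop and report progress */

		remaining -= chunk;
	} while (remaining);

	return done;
}

int main(void)
{
	/* A 200-entry bulk is processed as 64 + 64 + 64 + 8. */
	printf("filled %d entries\n", refill_in_chunks(0, 200));
	return 0;
}

Because each chunk's releases fit within the page pool cache, the
allocations that immediately follow in the same chunk can be served from
that cache; the jump in rx_pp_recycle_cached in the stats below reflects
exactly this.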
For a single ring, single core, default MTU (1500) TCP stream
test, the number of pages allocated directly from the cache
(rx_pp_recycle_cached) increases from 0% to 52%:
+---------------------------------------------+
| Page Pool stats (/sec)  |  Before |   After |
+-------------------------+---------+---------+
|rx_pp_alloc_fast         | 2145422 | 2193802 |
|rx_pp_alloc_slow         |       2 |       0 |
|rx_pp_alloc_empty        |       2 |       0 |
|rx_pp_alloc_refill       |   34059 |   16634 |
|rx_pp_alloc_waive        |       0 |       0 |
|rx_pp_recycle_cached     |       0 | 1145818 |
|rx_pp_recycle_cache_full |       0 |       0 |
|rx_pp_recycle_ring       | 2179361 | 1064616 |
|rx_pp_recycle_ring_full  |     121 |       0 |
+---------------------------------------------+
With this patch, the performance for legacy rq for the above test is
back to baseline.
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en_rx.c')
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 28
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 9c5270eb9dc6..df5dbef9e5ec 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -449,6 +449,31 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 	return i;
 }
 
+static int mlx5e_refill_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+	int remaining = wqe_bulk;
+	int i = 0;
+
+	/* The WQE bulk is split into smaller bulks that are sized
+	 * according to the page pool cache refill size to avoid overflowing
+	 * the page pool cache due to too many page releases at once.
+	 */
+	do {
+		int refill = min_t(u16, rq->wqe.info.refill_unit, remaining);
+		int alloc_count;
+
+		mlx5e_free_rx_wqes(rq, ix + i, refill);
+		alloc_count = mlx5e_alloc_rx_wqes(rq, ix + i, refill);
+		i += alloc_count;
+		if (unlikely(alloc_count != refill))
+			break;
+
+		remaining -= refill;
+	} while (remaining);
+
+	return i;
+}
+
 static inline void
 mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
 		   struct page *page, u32 frag_offset, u32 len,
@@ -837,8 +862,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	wqe_bulk -= (head + wqe_bulk) & rq->wqe.info.wqe_index_mask;
 
 	if (!rq->xsk_pool) {
-		mlx5e_free_rx_wqes(rq, head, wqe_bulk);
-		count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+		count = mlx5e_refill_rx_wqes(rq, head, wqe_bulk);
 	} else if (likely(!rq->xsk_pool->dma_need_sync)) {
 		mlx5e_xsk_free_rx_wqes(rq, head, wqe_bulk);
 		count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);