diff options
author | Daniel Borkmann <daniel@iogearbox.net> | 2023-02-13 19:13:13 +0100 |
---|---|---|
committer | Daniel Borkmann <daniel@iogearbox.net> | 2023-02-13 19:13:18 +0100 |
commit | 39c536acc3cf89b49c22f9e7f3f6578d3332944e (patch) | |
tree | a4711ebfbe3eda0cdc10b3c91d0d2c13762132e4 | |
parent | 0b0757244754ea1d0721195c824770f5576e119e (diff) | |
parent | ad07f29b9c9a29eba04b19116c6db51387a638d7 (diff) |
Merge branch 'xdp-ice-mbuf'
Alexander Lobakin says:
====================
The set grew from the poor performance of %BPF_F_TEST_XDP_LIVE_FRAMES
when the ice-backed device is a sender. Initially there were around
3.3 Mpps / thread, while I have 5.5 on skb-based pktgen ...
After fixing 0005 (0004 is a prereq for it) first (strange thing nobody
noticed that earlier), I started catching random OOMs. This is how 0002
(and partially 0001) appeared.
0003 is a suggestion from Maciej to not waste time on refactoring dead
lines. 0006 is a "cherry on top" to get away with the final 6.7 Mpps.
4.5 of 6 are fixes, but only the first three are tagged, since it then
starts being tricky. I may backport them manually later on.
TL;DR for the series is that shortcuts are good, but only as long as
they don't make the driver miss important things. %XDP_TX is purely
driver-local, however .ndo_xdp_xmit() is not, and sometimes assumptions
can be unsafe there.
With that series and also one core code patch[0], "live frames" and
xdp-trafficgen are now safe'n'fast on ice (probably more to come).
[0] https://lore.kernel.org/all/20230209172827.874728-1-alexandr.lobakin@intel.com
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r-- | drivers/net/ethernet/intel/ice/ice_txrx.c | 67 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ice/ice_txrx.h | 37 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 88 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 4 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ice/ice_xsk.c | 12 |
5 files changed, 136 insertions, 72 deletions
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 466113c86e6f..aaf313a95368 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY | ICE_TX_DESC_CMD_RE; - tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT; + tx_buf->type = ICE_TX_BUF_DUMMY; tx_buf->raw_buf = raw_packet; tx_desc->cmd_type_offset_bsz = @@ -112,31 +112,29 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, static void ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) { - if (tx_buf->skb) { - if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) { - devm_kfree(ring->dev, tx_buf->raw_buf); - } else if (ice_ring_is_xdp(ring)) { - if (ring->xsk_pool) - xsk_buff_free(tx_buf->xdp); - else - page_frag_free(tx_buf->raw_buf); - } else { - dev_kfree_skb_any(tx_buf->skb); - } - if (dma_unmap_len(tx_buf, len)) - dma_unmap_single(ring->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - } else if (dma_unmap_len(tx_buf, len)) { + if (dma_unmap_len(tx_buf, len)) dma_unmap_page(ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + + switch (tx_buf->type) { + case ICE_TX_BUF_DUMMY: + devm_kfree(ring->dev, tx_buf->raw_buf); + break; + case ICE_TX_BUF_SKB: + dev_kfree_skb_any(tx_buf->skb); + break; + case ICE_TX_BUF_XDP_TX: + page_frag_free(tx_buf->raw_buf); + break; + case ICE_TX_BUF_XDP_XMIT: + xdp_return_frame(tx_buf->xdpf); + break; } tx_buf->next_to_watch = NULL; - tx_buf->skb = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; dma_unmap_len_set(tx_buf, len, 0); /* tx_buf must be completely set up in the transmit path */ } @@ -269,7 +267,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget) DMA_TO_DEVICE); /* clear tx_buf data */ - tx_buf->skb = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; dma_unmap_len_set(tx_buf, len, 0); /* unmap remaining buffers */ @@ -580,7 +578,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, case XDP_TX: if (static_branch_unlikely(&ice_xdp_locking_key)) spin_lock(&xdp_ring->tx_lock); - ret = __ice_xmit_xdp_ring(xdp, xdp_ring); + ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false); if (static_branch_unlikely(&ice_xdp_locking_key)) spin_unlock(&xdp_ring->tx_lock); if (ret == ICE_XDP_CONSUMED) @@ -608,6 +606,25 @@ exit: } /** + * ice_xmit_xdp_ring - submit frame to XDP ring for transmission + * @xdpf: XDP frame that will be converted to XDP buff + * @xdp_ring: XDP ring for transmission + */ +static int ice_xmit_xdp_ring(const struct xdp_frame *xdpf, + struct ice_tx_ring *xdp_ring) +{ + struct xdp_buff xdp; + + xdp.data_hard_start = (void *)xdpf; + xdp.data = xdpf->data; + xdp.data_end = xdp.data + xdpf->len; + xdp.frame_sz = xdpf->frame_sz; + xdp.flags = xdpf->flags; + + return __ice_xmit_xdp_ring(&xdp, xdp_ring, true); +} + +/** * ice_xdp_xmit - submit packets to XDP ring for transmission * @dev: netdev * @n: number of XDP frames to be transmitted @@ -652,7 +669,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use]; for (i = 0; i < n; i++) { - struct xdp_frame *xdpf = frames[i]; + const struct xdp_frame *xdpf = frames[i]; int err; err = ice_xmit_xdp_ring(xdpf, xdp_ring); @@ -1712,6 +1729,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first, DMA_TO_DEVICE); tx_buf = &tx_ring->tx_buf[i]; + tx_buf->type = ICE_TX_BUF_FRAG; } /* record SW timestamp if HW timestamp is not available */ @@ -2355,6 +2373,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring) /* record the location of the first descriptor for this packet */ first = &tx_ring->tx_buf[tx_ring->next_to_use]; first->skb = skb; + first->type = ICE_TX_BUF_SKB; first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); first->gso_segs = 1; first->tx_flags = 0; @@ -2527,11 +2546,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring) dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); - if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) + if (tx_buf->type == ICE_TX_BUF_DUMMY) devm_kfree(tx_ring->dev, tx_buf->raw_buf); /* clear next_to_watch to prevent false hangs */ - tx_buf->raw_buf = NULL; + tx_buf->type = ICE_TX_BUF_EMPTY; tx_buf->tx_flags = 0; tx_buf->next_to_watch = NULL; dma_unmap_len_set(tx_buf, len, 0); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index efa3d378f19e..fff0efe28373 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -121,10 +121,7 @@ static inline int ice_skb_pad(void) #define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_HW_VLAN BIT(1) #define ICE_TX_FLAGS_SW_VLAN BIT(2) -/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be - * freed instead of returned like skb packets. - */ -#define ICE_TX_FLAGS_DUMMY_PKT BIT(3) +/* Free, was ICE_TX_FLAGS_DUMMY_PKT */ #define ICE_TX_FLAGS_TSYN BIT(4) #define ICE_TX_FLAGS_IPV4 BIT(5) #define ICE_TX_FLAGS_IPV6 BIT(6) @@ -149,22 +146,44 @@ static inline int ice_skb_pad(void) #define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS) +/** + * enum ice_tx_buf_type - type of &ice_tx_buf to act on Tx completion + * @ICE_TX_BUF_EMPTY: unused OR XSk frame, no action required + * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree() + * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA + * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats + * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats + * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats + * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats + */ +enum ice_tx_buf_type { + ICE_TX_BUF_EMPTY = 0U, + ICE_TX_BUF_DUMMY, + ICE_TX_BUF_FRAG, + ICE_TX_BUF_SKB, + ICE_TX_BUF_XDP_TX, + ICE_TX_BUF_XDP_XMIT, + ICE_TX_BUF_XSK_TX, +}; + struct ice_tx_buf { union { struct ice_tx_desc *next_to_watch; u32 rs_idx; }; union { - struct sk_buff *skb; - void *raw_buf; /* used for XDP */ - struct xdp_buff *xdp; /* used for XDP_TX ZC */ + void *raw_buf; /* used for XDP_TX and FDir rules */ + struct sk_buff *skb; /* used for .ndo_start_xmit() */ + struct xdp_frame *xdpf; /* used for .ndo_xdp_xmit() */ + struct xdp_buff *xdp; /* used for XDP_TX ZC */ }; unsigned int bytecount; union { unsigned int gso_segs; - unsigned int nr_frags; /* used for mbuf XDP */ + unsigned int nr_frags; /* used for mbuf XDP */ }; - u32 tx_flags; + u32 type:16; /* &ice_tx_buf_type */ + u32 tx_flags:16; DEFINE_DMA_UNMAP_LEN(len); DEFINE_DMA_UNMAP_ADDR(dma); }; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 9bbed3f14e42..7bc5aa340c7d 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -222,18 +222,28 @@ ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag) /** * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer - * @xdp_ring: XDP Tx ring + * @dev: device for DMA mapping * @tx_buf: Tx buffer to clean + * @bq: XDP bulk flush struct */ static void -ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) +ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf, + struct xdp_frame_bulk *bq) { - dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); dma_unmap_len_set(tx_buf, len, 0); - xdp_ring->xdp_tx_active--; - page_frag_free(tx_buf->raw_buf); - tx_buf->raw_buf = NULL; + + switch (tx_buf->type) { + case ICE_TX_BUF_XDP_TX: + page_frag_free(tx_buf->raw_buf); + break; + case ICE_TX_BUF_XDP_XMIT: + xdp_return_frame_bulk(tx_buf->xdpf, bq); + break; + } + + tx_buf->type = ICE_TX_BUF_EMPTY; } /** @@ -243,11 +253,13 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) { int total_bytes = 0, total_pkts = 0; + struct device *dev = xdp_ring->dev; u32 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; u32 cnt = xdp_ring->count; + struct xdp_frame_bulk bq; + u32 frags, xdp_tx = 0; u32 ready_frames = 0; - u32 frags; u32 idx; u32 ret; @@ -261,12 +273,16 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) ready_frames = idx + cnt - ntc + 1; } - if (!ready_frames) + if (unlikely(!ready_frames)) return 0; ret = ready_frames; + xdp_frame_bulk_init(&bq); + rcu_read_lock(); /* xdp_return_frame_bulk() */ + while (ready_frames) { struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; + struct ice_tx_buf *head = tx_buf; /* bytecount holds size of head + frags */ total_bytes += tx_buf->bytecount; @@ -274,11 +290,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) total_pkts++; /* count head + frags */ ready_frames -= frags + 1; + xdp_tx++; - if (xdp_ring->xsk_pool) - xsk_buff_free(tx_buf->xdp); - else - ice_clean_xdp_tx_buf(xdp_ring, tx_buf); ntc++; if (ntc == cnt) ntc = 0; @@ -286,15 +299,21 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) for (int i = 0; i < frags; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; - ice_clean_xdp_tx_buf(xdp_ring, tx_buf); + ice_clean_xdp_tx_buf(dev, tx_buf, &bq); ntc++; if (ntc == cnt) ntc = 0; } + + ice_clean_xdp_tx_buf(dev, head, &bq); } + xdp_flush_frame_bulk(&bq); + rcu_read_unlock(); + tx_desc->cmd_type_offset_bsz = 0; xdp_ring->next_to_clean = ntc; + xdp_ring->xdp_tx_active -= xdp_tx; ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes); return ret; @@ -304,8 +323,10 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring) * __ice_xmit_xdp_ring - submit frame to XDP ring for transmission * @xdp: XDP buffer to be placed onto Tx descriptors * @xdp_ring: XDP ring for transmission + * @frame: whether this comes from .ndo_xdp_xmit() */ -int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) +int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, + bool frame) { struct skb_shared_info *sinfo = NULL; u32 size = xdp->data_end - xdp->data; @@ -321,17 +342,17 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) u32 frag = 0; free_space = ICE_DESC_UNUSED(xdp_ring); - - if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring)) + if (free_space < ICE_RING_QUARTER(xdp_ring)) free_space += ice_clean_xdp_irq(xdp_ring); + if (unlikely(!free_space)) + goto busy; + if (unlikely(xdp_buff_has_frags(xdp))) { sinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = sinfo->nr_frags; - if (free_space < nr_frags + 1) { - xdp_ring->ring_stats->tx_stats.tx_busy++; - return ICE_XDP_CONSUMED; - } + if (free_space < nr_frags + 1) + goto busy; } tx_desc = ICE_TX_DESC(xdp_ring, ntu); @@ -349,9 +370,15 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); + if (frame) { + tx_buf->type = ICE_TX_BUF_FRAG; + } else { + tx_buf->type = ICE_TX_BUF_XDP_TX; + tx_buf->raw_buf = data; + } + tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0); - tx_buf->raw_buf = data; ntu++; if (ntu == cnt) @@ -372,6 +399,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring) tx_head->bytecount = xdp_get_buff_len(xdp); tx_head->nr_frags = nr_frags; + if (frame) { + tx_head->type = ICE_TX_BUF_XDP_XMIT; + tx_head->xdpf = xdp->data_hard_start; + } + /* update last descriptor from a frame with EOP */ tx_desc->cmd_type_offset_bsz |= cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S); @@ -395,19 +427,11 @@ dma_unmap: ntu--; } return ICE_XDP_CONSUMED; -} -/** - * ice_xmit_xdp_ring - submit frame to XDP ring for transmission - * @xdpf: XDP frame that will be converted to XDP buff - * @xdp_ring: XDP ring for transmission - */ -int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring) -{ - struct xdp_buff xdp; +busy: + xdp_ring->ring_stats->tx_stats.tx_busy++; - xdp_convert_frame_to_buff(xdpf, &xdp); - return __ice_xmit_xdp_ring(&xdp, xdp_ring); + return ICE_XDP_CONSUMED; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index ea977f283c22..115969ecdf7b 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -142,8 +142,8 @@ static inline u32 ice_set_rs_bit(const struct ice_tx_ring *xdp_ring) void ice_finalize_xdp_rx(struct ice_tx_ring *xdp_ring, unsigned int xdp_res, u32 first_idx); int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring); -int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring); -int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring); +int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, + bool frame); void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val); void ice_process_skb_fields(struct ice_rx_ring *rx_ring, diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a25a68c69f22..917c75e530ca 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) for (i = 0; i < xsk_frames; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; - if (tx_buf->xdp) { + if (tx_buf->type == ICE_TX_BUF_XSK_TX) { + tx_buf->type = ICE_TX_BUF_EMPTY; xsk_buff_free(tx_buf->xdp); xdp_ring->xdp_tx_active--; } else { @@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, tx_buf = &xdp_ring->tx_buf[ntu]; tx_buf->xdp = xdp; + tx_buf->type = ICE_TX_BUF_XSK_TX; tx_desc = ICE_TX_DESC(xdp_ring, ntu); tx_desc->buf_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, @@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring) while (ntc != ntu) { struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc]; - if (tx_buf->xdp) + if (tx_buf->type == ICE_TX_BUF_XSK_TX) { + tx_buf->type = ICE_TX_BUF_EMPTY; xsk_buff_free(tx_buf->xdp); - else + } else { xsk_frames++; - - tx_buf->raw_buf = NULL; + } ntc++; if (ntc >= xdp_ring->count) |