summaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/intel/igc/igc_main.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
commit1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95 (patch)
tree286dda5e84757594218e684b94b01b3a3cac15a2 /drivers/net/ethernet/intel/igc/igc_main.c
parente61f33273ca755b3e2ebee4520a76097199dc7a8 (diff)
parent023b1e9d265ca0662111a9df23d22b4632717a8a (diff)
Merge tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Continue Netlink conversions to per-namespace RTNL lock (IPv4 routing, routing rules, routing next hops, ARP ioctls) - Continue extending the use of netdev instance locks. As a driver opt-in protect queue operations and (in due course) ethtool operations with the instance lock and not RTNL lock. - Support collecting TCP timestamps (data submitted, sent, acked) in BPF, allowing for transparent (to the application) and lower overhead tracking of TCP RPC performance. - Tweak existing networking Rx zero-copy infra to support zero-copy Rx via io_uring. - Optimize MPTCP performance in single subflow mode by 29%. - Enable GRO on packets which went thru XDP CPU redirect (were queued for processing on a different CPU). Improving TCP stream performance up to 2x. - Improve performance of contended connect() by 200% by searching for an available 4-tuple under RCU rather than a spin lock. Bring an additional 229% improvement by tweaking hash distribution. - Avoid unconditionally touching sk_tsflags on RX, improving performance under UDP flood by as much as 10%. - Avoid skb_clone() dance in ping_rcv() to improve performance under ping flood. - Avoid FIB lookup in netfilter if socket is available, 20% perf win. - Rework network device creation (in-kernel) API to more clearly identify network namespaces and their roles. There are up to 4 namespace roles but we used to have just 2 netns pointer arguments, interpreted differently based on context. - Use sysfs_break_active_protection() instead of trylock to avoid deadlocks between unregistering objects and sysfs access. - Add a new sysctl and sockopt for capping max retransmit timeout in TCP. - Support masking port and DSCP in routing rule matches. - Support dumping IPv4 multicast addresses with RTM_GETMULTICAST. - Support specifying at what time packet should be sent on AF_XDP sockets. - Expose TCP ULP diagnostic info (for TLS and MPTCP) to non-admin users. - Add Netlink YAML spec for WiFi (nl80211) and conntrack. - Introduce EXPORT_IPV6_MOD() and EXPORT_IPV6_MOD_GPL() for symbols which only need to be exported when IPv6 support is built as a module. - Age FDB entries based on Rx not Tx traffic in VxLAN, similar to normal bridging. - Allow users to specify source port range for GENEVE tunnels. - netconsole: allow attaching kernel release, CPU ID and task name to messages as metadata Driver API: - Continue rework / fixing of Energy Efficient Ethernet (EEE) across the SW layers. Delegate the responsibilities to phylink where possible. Improve its handling in phylib. - Support symmetric OR-XOR RSS hashing algorithm. - Support tracking and preserving IRQ affinity by NAPI itself. - Support loopback mode speed selection for interface selftests. Device drivers: - Remove the IBM LCS driver for s390 - Remove the sb1000 cable modem driver - Add support for SFP module access over SMBus - Add MCTP transport driver for MCTP-over-USB - Enable XDP metadata support in multiple drivers - Ethernet high-speed NICs: - Broadcom (bnxt): - add PCIe TLP Processing Hints (TPH) support for new AMD platforms - support dumping RoCE queue state for debug - opt into instance locking - Intel (100G, ice, idpf): - ice: rework MSI-X IRQ management and distribution - ice: support for E830 devices - iavf: add support for Rx timestamping - iavf: opt into instance locking - nVidia/Mellanox: - mlx4: use page pool memory allocator for Rx - mlx5: support for one PTP device per hardware clock - mlx5: support for 200Gbps per-lane link modes - mlx5: move IPSec policy check after decryption - AMD/Solarflare: - support FW flashing via devlink - Cisco (enic): - use page pool memory allocator for Rx - enable 32, 64 byte CQEs - get max rx/tx ring size from the device - Meta (fbnic): - support flow steering and RSS configuration - report queue stats - support TCP segmentation - support IRQ coalescing - support ring size configuration - Marvell/Cavium: - support AF_XDP - Wangxun: - support for PTP clock and timestamping - Huawei (hibmcge): - checksum offload - add more statistics - Ethernet virtual: - VirtIO net: - aggressively suppress Tx completions, improve perf by 96% with 1 CPU and 55% with 2 CPUs - expose NAPI to IRQ mapping and persist NAPI settings - Google (gve): - support XDP in DQO RDA Queue Format - opt into instance locking - Microsoft vNIC: - support BIG TCP - Ethernet NICs consumer, and embedded: - Synopsys (stmmac): - cleanup Tx and Tx clock setting and other link-focused cleanups - enable SGMII and 2500BASEX mode switching for Intel platforms - support Sophgo SG2044 - Broadcom switches (b53): - support for BCM53101 - TI: - iep: add perout configuration support - icssg: support XDP - Cadence (macb): - implement BQL - Xilinx (axinet): - support dynamic IRQ moderation and changing coalescing at runtime - implement BQL - report standard stats - MediaTek: - support phylink managed EEE - Intel: - igc: don't restart the interface on every XDP program change - RealTek (r8169): - support reading registers of internal PHYs directly - increase max jumbo packet size on RTL8125/RTL8126 - Airoha: - support for RISC-V NPU packet processing unit - enable scatter-gather and support MTU up to 9kB - Tehuti (tn40xx): - support cards with TN4010 MAC and an Aquantia AQR105 PHY - Ethernet PHYs: - support for TJA1102S, TJA1121 - dp83tg720: add randomized polling intervals for link detection - dp83822: support changing the transmit amplitude voltage - support for LEDs on 88q2xxx - CAN: - canxl: support Remote Request Substitution bit access - flexcan: add S32G2/S32G3 SoC - WiFi: - remove cooked monitor support - strict mode for better AP testing - basic EPCS support - OMI RX bandwidth reduction support - batman-adv: add support for jumbo frames - WiFi drivers: - RealTek (rtw88): - support RTL8814AE and RTL8814AU - RealTek (rtw89): - switch using wiphy_lock and wiphy_work - add BB context to manipulate two PHY as preparation of MLO - improve BT-coexistence mechanism to play A2DP smoothly - Intel (iwlwifi): - add new iwlmld sub-driver for latest HW/FW combinations - MediaTek (mt76): - preparation for mt7996 Multi-Link Operation (MLO) support - Qualcomm/Atheros (ath12k): - continued work on MLO - Silabs (wfx): - Wake-on-WLAN support - Bluetooth: - add support for skb TX SND/COMPLETION timestamping - hci_core: enable buffer flow control for SCO/eSCO - coredump: log devcd dumps into the monitor - Bluetooth drivers: - intel: add support to configure TX power - nxp: handle bootloader error during cmd5 and cmd7" * tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1681 commits) unix: fix up for "apparmor: add fine grained af_unix mediation" mctp: Fix incorrect tx flow invalidation condition in mctp-i2c net: usb: asix: ax88772: Increase phy_name size net: phy: Introduce PHY_ID_SIZE — minimum size for PHY ID string net: libwx: fix Tx L4 checksum net: libwx: fix Tx descriptor content for some tunnel packets atm: Fix NULL pointer dereference net: tn40xx: add pci-id of the aqr105-based Tehuti TN4010 cards net: tn40xx: prepare tn40xx driver to find phy of the TN9510 card net: tn40xx: create swnode for mdio and aqr105 phy and add to mdiobus net: phy: aquantia: add essential functions to aqr105 driver net: phy: aquantia: search for firmware-name in fwnode net: phy: aquantia: add probe function to aqr105 for firmware loading net: phy: Add swnode support to mdiobus_scan gve: add XDP DROP and PASS support for DQ gve: update XDP allocation path support RX buffer posting gve: merge packet buffer size fields gve: update GQ RX to use buf_size gve: introduce config-based allocation for XDP gve: remove xdp_xsk_done and xdp_xsk_wakeup statistics ...
Diffstat (limited to 'drivers/net/ethernet/intel/igc/igc_main.c')
-rw-r--r--drivers/net/ethernet/intel/igc/igc_main.c146
1 files changed, 111 insertions, 35 deletions
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 733820a0c350..491d942cefca 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1092,7 +1092,8 @@ static int igc_init_empty_frame(struct igc_ring *ring,
dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
if (dma_mapping_error(ring->dev, dma)) {
- netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
+ net_err_ratelimited("%s: DMA mapping error for empty frame\n",
+ netdev_name(ring->netdev));
return -ENOMEM;
}
@@ -1108,20 +1109,12 @@ static int igc_init_empty_frame(struct igc_ring *ring,
return 0;
}
-static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
- struct sk_buff *skb,
- struct igc_tx_buffer *first)
+static void igc_init_tx_empty_descriptor(struct igc_ring *ring,
+ struct sk_buff *skb,
+ struct igc_tx_buffer *first)
{
union igc_adv_tx_desc *desc;
u32 cmd_type, olinfo_status;
- int err;
-
- if (!igc_desc_unused(ring))
- return -EBUSY;
-
- err = igc_init_empty_frame(ring, first, skb);
- if (err)
- return err;
cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
@@ -1140,8 +1133,6 @@ static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
ring->next_to_use++;
if (ring->next_to_use == ring->count)
ring->next_to_use = 0;
-
- return 0;
}
#define IGC_EMPTY_FRAME_SIZE 60
@@ -1567,6 +1558,40 @@ static bool igc_request_tx_tstamp(struct igc_adapter *adapter, struct sk_buff *s
return false;
}
+static int igc_insert_empty_frame(struct igc_ring *tx_ring)
+{
+ struct igc_tx_buffer *empty_info;
+ struct sk_buff *empty_skb;
+ void *data;
+ int ret;
+
+ empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+ empty_skb = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
+ if (unlikely(!empty_skb)) {
+ net_err_ratelimited("%s: skb alloc error for empty frame\n",
+ netdev_name(tx_ring->netdev));
+ return -ENOMEM;
+ }
+
+ data = skb_put(empty_skb, IGC_EMPTY_FRAME_SIZE);
+ memset(data, 0, IGC_EMPTY_FRAME_SIZE);
+
+ /* Prepare DMA mapping and Tx buffer information */
+ ret = igc_init_empty_frame(tx_ring, empty_info, empty_skb);
+ if (unlikely(ret)) {
+ dev_kfree_skb_any(empty_skb);
+ return ret;
+ }
+
+ /* Prepare advanced context descriptor for empty packet */
+ igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
+
+ /* Prepare advanced data descriptor for empty packet */
+ igc_init_tx_empty_descriptor(tx_ring, empty_skb, empty_info);
+
+ return 0;
+}
+
static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
struct igc_ring *tx_ring)
{
@@ -1586,6 +1611,7 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
* + 1 desc for skb_headlen/IGC_MAX_DATA_PER_TXD,
* + 2 desc gap to keep tail from touching head,
* + 1 desc for context descriptor,
+ * + 2 desc for inserting an empty packet for launch time,
* otherwise try next time
*/
for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
@@ -1605,24 +1631,16 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
launch_time = igc_tx_launchtime(tx_ring, txtime, &first_flag, &insert_empty);
if (insert_empty) {
- struct igc_tx_buffer *empty_info;
- struct sk_buff *empty;
- void *data;
-
- empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
- empty = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
- if (!empty)
- goto done;
-
- data = skb_put(empty, IGC_EMPTY_FRAME_SIZE);
- memset(data, 0, IGC_EMPTY_FRAME_SIZE);
-
- igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
-
- if (igc_init_tx_empty_descriptor(tx_ring,
- empty,
- empty_info) < 0)
- dev_kfree_skb_any(empty);
+ /* Reset the launch time if the required empty frame fails to
+ * be inserted. However, this packet is not dropped, so it
+ * "dirties" the current Qbv cycle. This ensures that the
+ * upcoming packet, which is scheduled in the next Qbv cycle,
+ * does not require an empty frame. This way, the launch time
+ * continues to function correctly despite the current failure
+ * to insert the empty frame.
+ */
+ if (igc_insert_empty_frame(tx_ring))
+ launch_time = 0;
}
done:
@@ -1650,7 +1668,8 @@ done:
if (igc_request_tx_tstamp(adapter, skb, &tstamp_flags)) {
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
tx_flags |= IGC_TX_FLAGS_TSTAMP | tstamp_flags;
- if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_USE_CYCLES)
+ if (skb->sk &&
+ READ_ONCE(skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC)
tx_flags |= IGC_TX_FLAGS_TSTAMP_TIMER_1;
} else {
adapter->tx_hwtstamp_skipped++;
@@ -2953,9 +2972,48 @@ static u64 igc_xsk_fill_timestamp(void *_priv)
return *(u64 *)_priv;
}
+static void igc_xsk_request_launch_time(u64 launch_time, void *_priv)
+{
+ struct igc_metadata_request *meta_req = _priv;
+ struct igc_ring *tx_ring = meta_req->tx_ring;
+ __le32 launch_time_offset;
+ bool insert_empty = false;
+ bool first_flag = false;
+ u16 used_desc = 0;
+
+ if (!tx_ring->launchtime_enable)
+ return;
+
+ launch_time_offset = igc_tx_launchtime(tx_ring,
+ ns_to_ktime(launch_time),
+ &first_flag, &insert_empty);
+ if (insert_empty) {
+ /* Disregard the launch time request if the required empty frame
+ * fails to be inserted.
+ */
+ if (igc_insert_empty_frame(tx_ring))
+ return;
+
+ meta_req->tx_buffer =
+ &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+ /* Inserting an empty packet requires two descriptors:
+ * one data descriptor and one context descriptor.
+ */
+ used_desc += 2;
+ }
+
+ /* Use one context descriptor to specify launch time and first flag. */
+ igc_tx_ctxtdesc(tx_ring, launch_time_offset, first_flag, 0, 0, 0);
+ used_desc += 1;
+
+ /* Update the number of used descriptors in this request */
+ meta_req->used_desc += used_desc;
+}
+
const struct xsk_tx_metadata_ops igc_xsk_tx_metadata_ops = {
.tmo_request_timestamp = igc_xsk_request_timestamp,
.tmo_fill_timestamp = igc_xsk_fill_timestamp,
+ .tmo_request_launch_time = igc_xsk_request_launch_time,
};
static void igc_xdp_xmit_zc(struct igc_ring *ring)
@@ -2978,7 +3036,13 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
ntu = ring->next_to_use;
budget = igc_desc_unused(ring);
- while (xsk_tx_peek_desc(pool, &xdp_desc) && budget--) {
+ /* Packets with launch time require one data descriptor and one context
+ * descriptor. When the launch time falls into the next Qbv cycle, we
+ * may need to insert an empty packet, which requires two more
+ * descriptors. Therefore, to be safe, we always ensure we have at least
+ * 4 descriptors available.
+ */
+ while (xsk_tx_peek_desc(pool, &xdp_desc) && budget >= 4) {
struct igc_metadata_request meta_req;
struct xsk_tx_metadata *meta = NULL;
struct igc_tx_buffer *bi;
@@ -2999,9 +3063,19 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
meta_req.tx_ring = ring;
meta_req.tx_buffer = bi;
meta_req.meta = meta;
+ meta_req.used_desc = 0;
xsk_tx_metadata_request(meta, &igc_xsk_tx_metadata_ops,
&meta_req);
+ /* xsk_tx_metadata_request() may have updated next_to_use */
+ ntu = ring->next_to_use;
+
+ /* xsk_tx_metadata_request() may have updated Tx buffer info */
+ bi = meta_req.tx_buffer;
+
+ /* xsk_tx_metadata_request() may use a few descriptors */
+ budget -= meta_req.used_desc;
+
tx_desc = IGC_TX_DESC(ring, ntu);
tx_desc->read.cmd_type_len = cpu_to_le32(meta_req.cmd_type);
tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
@@ -3019,9 +3093,11 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
ntu++;
if (ntu == ring->count)
ntu = 0;
+
+ ring->next_to_use = ntu;
+ budget--;
}
- ring->next_to_use = ntu;
if (tx_desc) {
igc_flush_tx_descriptors(ring);
xsk_tx_release(pool);