Diffstat (limited to 'drivers/net/ethernet/google/gve')
 drivers/net/ethernet/google/gve/Makefile              |   4
 drivers/net/ethernet/google/gve/gve.h                 | 107
 drivers/net/ethernet/google/gve/gve_adminq.c          | 105
 drivers/net/ethernet/google/gve/gve_adminq.h          |  30
 drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c |  30
 drivers/net/ethernet/google/gve/gve_desc_dqo.h        |   6
 drivers/net/ethernet/google/gve/gve_dqo.h             |   4
 drivers/net/ethernet/google/gve/gve_ethtool.c         | 135
 drivers/net/ethernet/google/gve/gve_main.c            | 533
 drivers/net/ethernet/google/gve/gve_ptp.c             | 166
 drivers/net/ethernet/google/gve/gve_rx.c              |  14
 drivers/net/ethernet/google/gve/gve_rx_dqo.c          | 291
 drivers/net/ethernet/google/gve/gve_tx.c              |   6
 drivers/net/ethernet/google/gve/gve_tx_dqo.c          | 401
 drivers/net/ethernet/google/gve/gve_utils.c           |   2
 15 files changed, 1490 insertions, 344 deletions
diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile
index 4520f1c07a63..e0ec227a50f7 100644
--- a/drivers/net/ethernet/google/gve/Makefile
+++ b/drivers/net/ethernet/google/gve/Makefile
@@ -1,5 +1,7 @@
# Makefile for the Google virtual Ethernet (gve) driver
obj-$(CONFIG_GVE) += gve.o
-gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o \
+gve-y := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o \
gve_buffer_mgmt_dqo.o
+
+gve-$(CONFIG_PTP_1588_CLOCK) += gve_ptp.o
diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index 2fab38c8ee78..970d5ca8cdde 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -11,7 +11,9 @@
#include <linux/dmapool.h>
#include <linux/ethtool_netlink.h>
#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
#include <linux/pci.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/u64_stats_sync.h>
#include <net/page_pool/helpers.h>
#include <net/xdp.h>
@@ -57,8 +59,6 @@
#define GVE_DEFAULT_RX_BUFFER_SIZE 2048
-#define GVE_MAX_RX_BUFFER_SIZE 4096
-
#define GVE_XDP_RX_BUFFER_SIZE_DQO 4096
#define GVE_DEFAULT_RX_BUFFER_OFFSET 2048
@@ -98,6 +98,8 @@
*/
#define GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD 96
+#define GVE_DQO_RX_HWTSTAMP_VALID 0x1
+
/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */
struct gve_rx_desc_queue {
struct gve_rx_desc *desc_ring; /* the descriptor ring */
@@ -188,6 +190,9 @@ struct gve_rx_buf_state_dqo {
/* The page posted to HW. */
struct gve_rx_slot_page_info page_info;
+ /* XSK buffer */
+ struct xdp_buff *xsk_buff;
+
/* The DMA address corresponding to `page_info`. */
dma_addr_t addr;
@@ -200,6 +205,13 @@ struct gve_rx_buf_state_dqo {
s16 next;
};
+/* Wrapper for XDP Rx metadata */
+struct gve_xdp_buff {
+ struct xdp_buff xdp;
+ struct gve_priv *gve;
+ const struct gve_rx_compl_desc_dqo *compl_desc;
+};
+
/* `head` and `tail` are indices into an array, or -1 if empty. */
struct gve_index_list {
s16 head;
@@ -329,7 +341,6 @@ struct gve_rx_ring {
/* XDP stuff */
struct xdp_rxq_info xdp_rxq;
- struct xdp_rxq_info xsk_rxq;
struct xsk_buff_pool *xsk_pool;
struct page_frag_cache page_cache; /* Page cache to allocate XDP frames */
};
@@ -398,10 +409,24 @@ enum gve_packet_state {
GVE_PACKET_STATE_PENDING_REINJECT_COMPL,
/* No valid completion received within the specified timeout. */
GVE_PACKET_STATE_TIMED_OUT_COMPL,
+ /* XSK pending packet has received a packet/reinjection completion, or
+ * has timed out. At this point, the pending packet can be counted by
+ * xsk_tx_complete and freed.
+ */
+ GVE_PACKET_STATE_XSK_COMPLETE,
+};
+
+enum gve_tx_pending_packet_dqo_type {
+ GVE_TX_PENDING_PACKET_DQO_SKB,
+ GVE_TX_PENDING_PACKET_DQO_XDP_FRAME,
+ GVE_TX_PENDING_PACKET_DQO_XSK,
};
struct gve_tx_pending_packet_dqo {
- struct sk_buff *skb; /* skb for this packet */
+ union {
+ struct sk_buff *skb;
+ struct xdp_frame *xdpf;
+ };
/* 0th element corresponds to the linear portion of `skb`, should be
* unmapped with `dma_unmap_single`.
@@ -431,7 +456,10 @@ struct gve_tx_pending_packet_dqo {
/* Identifies the current state of the packet as defined in
* `enum gve_packet_state`.
*/
- u8 state;
+ u8 state : 3;
+
+ /* gve_tx_pending_packet_dqo_type */
+ u8 type : 2;
/* If packet is an outstanding miss completion, then the packet is
* freed if the corresponding re-injection completion is not received
@@ -453,6 +481,9 @@ struct gve_tx_ring {
/* DQO fields. */
struct {
+ /* Spinlock for XDP tx traffic */
+ spinlock_t xdp_lock;
+
/* Linked list of gve_tx_pending_packet_dqo. Index into
* pending_packets, or -1 if empty.
*
@@ -497,6 +528,8 @@ struct gve_tx_ring {
/* Cached value of `dqo_compl.free_tx_qpl_buf_cnt` */
u32 free_tx_qpl_buf_cnt;
};
+
+ atomic_t xsk_reorder_queue_tail;
} dqo_tx;
};
@@ -530,6 +563,9 @@ struct gve_tx_ring {
/* Last TX ring index fetched by HW */
atomic_t hw_tx_head;
+ u16 xsk_reorder_queue_head;
+ u16 xsk_reorder_queue_tail;
+
/* List to track pending packets which received a miss
* completion but not a corresponding reinjection.
*/
@@ -583,6 +619,8 @@ struct gve_tx_ring {
struct gve_tx_pending_packet_dqo *pending_packets;
s16 num_pending_packets;
+ u16 *xsk_reorder_queue;
+
u32 complq_mask; /* complq size is complq_mask + 1 */
/* QPL fields */
@@ -750,6 +788,12 @@ struct gve_rss_config {
u32 *hash_lut;
};
+struct gve_ptp {
+ struct ptp_clock_info info;
+ struct ptp_clock *clock;
+ struct gve_priv *priv;
+};
+
struct gve_priv {
struct net_device *dev;
struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */
@@ -781,7 +825,9 @@ struct gve_priv {
struct gve_tx_queue_config tx_cfg;
struct gve_rx_queue_config rx_cfg;
- u32 num_ntfy_blks; /* spilt between TX and RX so must be even */
+ unsigned long *xsk_pools; /* bitmap of RX queues with XSK pools */
+ u32 num_ntfy_blks; /* split between TX and RX so must be even */
+ int numa_node;
struct gve_registers __iomem *reg_bar0; /* see gve_register.h */
__be32 __iomem *db_bar2; /* "array" of doorbells */
@@ -813,6 +859,7 @@ struct gve_priv {
u32 adminq_set_driver_parameter_cnt;
u32 adminq_report_stats_cnt;
u32 adminq_report_link_speed_cnt;
+ u32 adminq_report_nic_timestamp_cnt;
u32 adminq_get_ptype_map_cnt;
u32 adminq_verify_driver_compatibility_cnt;
u32 adminq_query_flow_rules_cnt;
@@ -870,6 +917,14 @@ struct gve_priv {
u16 rss_lut_size;
bool cache_rss_config;
struct gve_rss_config rss_config;
+
+ /* True if the device supports reading the nic clock */
+ bool nic_timestamp_supported;
+ struct gve_ptp *ptp;
+ struct kernel_hwtstamp_config ts_config;
+ struct gve_nic_ts_report *nic_ts_report;
+ dma_addr_t nic_ts_report_bus;
+ u64 last_sync_nic_counter; /* Clock counter from last NIC TS report */
};
enum gve_service_task_flags_bit {
@@ -1119,6 +1174,12 @@ static inline bool gve_is_gqi(struct gve_priv *priv)
priv->queue_format == GVE_GQI_QPL_FORMAT;
}
+static inline bool gve_is_dqo(struct gve_priv *priv)
+{
+ return priv->queue_format == GVE_DQO_RDA_FORMAT ||
+ priv->queue_format == GVE_DQO_QPL_FORMAT;
+}
+
static inline u32 gve_num_tx_queues(struct gve_priv *priv)
{
return priv->tx_cfg.num_queues + priv->tx_cfg.num_xdp_queues;
@@ -1138,6 +1199,7 @@ static inline bool gve_supports_xdp_xmit(struct gve_priv *priv)
{
switch (priv->queue_format) {
case GVE_GQI_QPL_FORMAT:
+ case GVE_DQO_RDA_FORMAT:
return true;
default:
return false;
@@ -1161,11 +1223,15 @@ void gve_free_queue_page_list(struct gve_priv *priv,
u32 id);
/* tx handling */
netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev);
-int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
- u32 flags);
+int gve_xdp_xmit_gqi(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags);
+int gve_xdp_xmit_dqo(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags);
int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx,
void *data, int len, void *frame_p);
void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid);
+int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
+ struct xdp_frame *xdpf);
bool gve_tx_poll(struct gve_notify_block *block, int budget);
bool gve_xdp_poll(struct gve_notify_block *block, int budget);
int gve_xsk_tx_poll(struct gve_notify_block *block, int budget);
@@ -1194,9 +1260,12 @@ void gve_rx_free_rings_gqi(struct gve_priv *priv,
struct gve_rx_alloc_rings_cfg *cfg);
void gve_rx_start_ring_gqi(struct gve_priv *priv, int idx);
void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx);
-u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hplit);
bool gve_header_split_supported(const struct gve_priv *priv);
-int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split);
+int gve_set_rx_buf_len_config(struct gve_priv *priv, u32 rx_buf_len,
+ struct netlink_ext_ack *extack,
+ struct gve_rx_alloc_rings_cfg *rx_alloc_cfg);
+int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split,
+ struct gve_rx_alloc_rings_cfg *rx_alloc_cfg);
/* rx buffer handling */
int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs);
void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs,
@@ -1249,6 +1318,24 @@ int gve_del_flow_rule(struct gve_priv *priv, struct ethtool_rxnfc *cmd);
int gve_flow_rules_reset(struct gve_priv *priv);
/* RSS config */
int gve_init_rss_config(struct gve_priv *priv, u16 num_queues);
+/* PTP and timestamping */
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
+int gve_clock_nic_ts_read(struct gve_priv *priv);
+int gve_init_clock(struct gve_priv *priv);
+void gve_teardown_clock(struct gve_priv *priv);
+#else /* CONFIG_PTP_1588_CLOCK */
+static inline int gve_clock_nic_ts_read(struct gve_priv *priv)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int gve_init_clock(struct gve_priv *priv)
+{
+ return 0;
+}
+
+static inline void gve_teardown_clock(struct gve_priv *priv) { }
+#endif /* CONFIG_PTP_1588_CLOCK */
/* report stats handling */
void gve_handle_report_stats(struct gve_priv *priv);
/* exported by ethtool.c */
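
Note on the new struct gve_xdp_buff above: it wraps the core's struct xdp_buff as its first member so that XDP metadata callbacks, which only receive the xdp_buff/xdp_md pointer, can get back to the owning gve_priv and the Rx completion descriptor. A minimal sketch of that recovery follows; it uses the usual first-member-cast idiom and is illustrative only (the driver's real callback, gve_xdp_rx_timestamp(), is only declared in the hunks shown here), and the NULL guard and return value are assumptions.

/* Sketch: recover the driver wrapper inside an .xmo_rx_timestamp-style
 * callback.  Because 'xdp' is the first member of struct gve_xdp_buff,
 * the xdp_md pointer handed in by the core can be cast straight back.
 */
static int example_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
{
	const struct gve_xdp_buff *ctx = (const void *)_ctx;

	if (!ctx->compl_desc)		/* hypothetical guard */
		return -ENODATA;

	/* Real code would widen the 32-bit descriptor timestamp; see the
	 * widening sketch after the gve_desc_dqo.h hunk further down.
	 */
	*timestamp = le32_to_cpu(ctx->compl_desc->ts);
	return 0;
}
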
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index 3e8fc33cc11f..b72cc0fa2ba2 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -46,6 +46,7 @@ void gve_parse_device_option(struct gve_priv *priv,
struct gve_device_option_buffer_sizes **dev_op_buffer_sizes,
struct gve_device_option_flow_steering **dev_op_flow_steering,
struct gve_device_option_rss_config **dev_op_rss_config,
+ struct gve_device_option_nic_timestamp **dev_op_nic_timestamp,
struct gve_device_option_modify_ring **dev_op_modify_ring)
{
u32 req_feat_mask = be32_to_cpu(option->required_features_mask);
@@ -225,6 +226,23 @@ void gve_parse_device_option(struct gve_priv *priv,
"RSS config");
*dev_op_rss_config = (void *)(option + 1);
break;
+ case GVE_DEV_OPT_ID_NIC_TIMESTAMP:
+ if (option_length < sizeof(**dev_op_nic_timestamp) ||
+ req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP) {
+ dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT,
+ "Nic Timestamp",
+ (int)sizeof(**dev_op_nic_timestamp),
+ GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP,
+ option_length, req_feat_mask);
+ break;
+ }
+
+ if (option_length > sizeof(**dev_op_nic_timestamp))
+ dev_warn(&priv->pdev->dev,
+ GVE_DEVICE_OPTION_TOO_BIG_FMT,
+ "Nic Timestamp");
+ *dev_op_nic_timestamp = (void *)(option + 1);
+ break;
default:
/* If we don't recognize the option just continue
* without doing anything.
@@ -246,6 +264,7 @@ gve_process_device_options(struct gve_priv *priv,
struct gve_device_option_buffer_sizes **dev_op_buffer_sizes,
struct gve_device_option_flow_steering **dev_op_flow_steering,
struct gve_device_option_rss_config **dev_op_rss_config,
+ struct gve_device_option_nic_timestamp **dev_op_nic_timestamp,
struct gve_device_option_modify_ring **dev_op_modify_ring)
{
const int num_options = be16_to_cpu(descriptor->num_device_options);
@@ -269,6 +288,7 @@ gve_process_device_options(struct gve_priv *priv,
dev_op_dqo_rda, dev_op_jumbo_frames,
dev_op_dqo_qpl, dev_op_buffer_sizes,
dev_op_flow_steering, dev_op_rss_config,
+ dev_op_nic_timestamp,
dev_op_modify_ring);
dev_opt = next_opt;
}
@@ -306,6 +326,7 @@ int gve_adminq_alloc(struct device *dev, struct gve_priv *priv)
priv->adminq_set_driver_parameter_cnt = 0;
priv->adminq_report_stats_cnt = 0;
priv->adminq_report_link_speed_cnt = 0;
+ priv->adminq_report_nic_timestamp_cnt = 0;
priv->adminq_get_ptype_map_cnt = 0;
priv->adminq_query_flow_rules_cnt = 0;
priv->adminq_cfg_flow_rule_cnt = 0;
@@ -442,6 +463,8 @@ static int gve_adminq_kick_and_wait(struct gve_priv *priv)
int tail, head;
int i;
+ lockdep_assert_held(&priv->adminq_lock);
+
tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
head = priv->adminq_prod_cnt;
@@ -467,9 +490,6 @@ static int gve_adminq_kick_and_wait(struct gve_priv *priv)
return 0;
}
-/* This function is not threadsafe - the caller is responsible for any
- * necessary locks.
- */
static int gve_adminq_issue_cmd(struct gve_priv *priv,
union gve_adminq_command *cmd_orig)
{
@@ -477,6 +497,8 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
u32 opcode;
u32 tail;
+ lockdep_assert_held(&priv->adminq_lock);
+
tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
// Check if next command will overflow the buffer.
@@ -544,6 +566,9 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
case GVE_ADMINQ_REPORT_LINK_SPEED:
priv->adminq_report_link_speed_cnt++;
break;
+ case GVE_ADMINQ_REPORT_NIC_TIMESTAMP:
+ priv->adminq_report_nic_timestamp_cnt++;
+ break;
case GVE_ADMINQ_GET_PTYPE_MAP:
priv->adminq_get_ptype_map_cnt++;
break;
@@ -564,6 +589,7 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
break;
default:
dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode);
+ return -EINVAL;
}
return 0;
@@ -625,7 +651,7 @@ static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
/* The device specifies that the management vector can either be the first irq
* or the last irq. ntfy_blk_msix_base_idx indicates the first irq assigned to
- * the ntfy blks. It if is 0 then the management vector is last, if it is 1 then
+ * the ntfy blks. If it is 0 then the management vector is last, if it is 1 then
* the management vector is first.
*
* gve arranges the msix vectors so that the management vector is last.
@@ -709,13 +735,19 @@ int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_que
int err;
int i;
+ mutex_lock(&priv->adminq_lock);
+
for (i = start_id; i < start_id + num_queues; i++) {
err = gve_adminq_create_tx_queue(priv, i);
if (err)
- return err;
+ goto out;
}
- return gve_adminq_kick_and_wait(priv);
+ err = gve_adminq_kick_and_wait(priv);
+
+out:
+ mutex_unlock(&priv->adminq_lock);
+ return err;
}
static void gve_adminq_get_create_rx_queue_cmd(struct gve_priv *priv,
@@ -788,13 +820,19 @@ int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
int err;
int i;
+ mutex_lock(&priv->adminq_lock);
+
for (i = 0; i < num_queues; i++) {
err = gve_adminq_create_rx_queue(priv, i);
if (err)
- return err;
+ goto out;
}
- return gve_adminq_kick_and_wait(priv);
+ err = gve_adminq_kick_and_wait(priv);
+
+out:
+ mutex_unlock(&priv->adminq_lock);
+ return err;
}
static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -820,13 +858,19 @@ int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_qu
int err;
int i;
+ mutex_lock(&priv->adminq_lock);
+
for (i = start_id; i < start_id + num_queues; i++) {
err = gve_adminq_destroy_tx_queue(priv, i);
if (err)
- return err;
+ goto out;
}
- return gve_adminq_kick_and_wait(priv);
+ err = gve_adminq_kick_and_wait(priv);
+
+out:
+ mutex_unlock(&priv->adminq_lock);
+ return err;
}
static void gve_adminq_make_destroy_rx_queue_cmd(union gve_adminq_command *cmd,
@@ -861,13 +905,19 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
int err;
int i;
+ mutex_lock(&priv->adminq_lock);
+
for (i = 0; i < num_queues; i++) {
err = gve_adminq_destroy_rx_queue(priv, i);
if (err)
- return err;
+ goto out;
}
- return gve_adminq_kick_and_wait(priv);
+ err = gve_adminq_kick_and_wait(priv);
+
+out:
+ mutex_unlock(&priv->adminq_lock);
+ return err;
}
static void gve_set_default_desc_cnt(struct gve_priv *priv,
@@ -904,6 +954,8 @@ static void gve_enable_supported_features(struct gve_priv *priv,
*dev_op_flow_steering,
const struct gve_device_option_rss_config
*dev_op_rss_config,
+ const struct gve_device_option_nic_timestamp
+ *dev_op_nic_timestamp,
const struct gve_device_option_modify_ring
*dev_op_modify_ring)
{
@@ -935,6 +987,10 @@ static void gve_enable_supported_features(struct gve_priv *priv,
dev_info(&priv->pdev->dev,
"BUFFER SIZES device option enabled with max_rx_buffer_size of %u, header_buf_size of %u.\n",
priv->max_rx_buffer_size, priv->header_buf_size);
+ if (gve_is_dqo(priv) &&
+ priv->max_rx_buffer_size > GVE_DEFAULT_RX_BUFFER_SIZE)
+ priv->rx_cfg.packet_buffer_size =
+ priv->max_rx_buffer_size;
}
/* Read and store ring size ranges given by device */
@@ -980,10 +1036,15 @@ static void gve_enable_supported_features(struct gve_priv *priv,
"RSS device option enabled with key size of %u, lut size of %u.\n",
priv->rss_key_size, priv->rss_lut_size);
}
+
+ if (dev_op_nic_timestamp &&
+ (supported_features_mask & GVE_SUP_NIC_TIMESTAMP_MASK))
+ priv->nic_timestamp_supported = true;
}
int gve_adminq_describe_device(struct gve_priv *priv)
{
+ struct gve_device_option_nic_timestamp *dev_op_nic_timestamp = NULL;
struct gve_device_option_flow_steering *dev_op_flow_steering = NULL;
struct gve_device_option_buffer_sizes *dev_op_buffer_sizes = NULL;
struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
@@ -1024,6 +1085,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
&dev_op_buffer_sizes,
&dev_op_flow_steering,
&dev_op_rss_config,
+ &dev_op_nic_timestamp,
&dev_op_modify_ring);
if (err)
goto free_device_descriptor;
@@ -1088,7 +1150,8 @@ int gve_adminq_describe_device(struct gve_priv *priv)
gve_enable_supported_features(priv, supported_features_mask,
dev_op_jumbo_frames, dev_op_dqo_qpl,
dev_op_buffer_sizes, dev_op_flow_steering,
- dev_op_rss_config, dev_op_modify_ring);
+ dev_op_rss_config, dev_op_nic_timestamp,
+ dev_op_modify_ring);
free_device_descriptor:
dma_pool_free(priv->adminq_pool, descriptor, descriptor_bus);
@@ -1200,6 +1263,22 @@ int gve_adminq_report_link_speed(struct gve_priv *priv)
return err;
}
+int gve_adminq_report_nic_ts(struct gve_priv *priv,
+ dma_addr_t nic_ts_report_addr)
+{
+ union gve_adminq_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_NIC_TIMESTAMP);
+ cmd.report_nic_ts = (struct gve_adminq_report_nic_ts) {
+ .nic_ts_report_len =
+ cpu_to_be64(sizeof(struct gve_nic_ts_report)),
+ .nic_ts_report_addr = cpu_to_be64(nic_ts_report_addr),
+ };
+
+ return gve_adminq_execute_cmd(priv, &cmd);
+}
+
int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv,
struct gve_ptype_lut *ptype_lut)
{
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index 228217458275..22a74b6aa17e 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -27,6 +27,7 @@ enum gve_adminq_opcodes {
GVE_ADMINQ_GET_PTYPE_MAP = 0xE,
GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY = 0xF,
GVE_ADMINQ_QUERY_FLOW_RULES = 0x10,
+ GVE_ADMINQ_REPORT_NIC_TIMESTAMP = 0x11,
GVE_ADMINQ_QUERY_RSS = 0x12,
/* For commands that are larger than 56 bytes */
@@ -174,6 +175,12 @@ struct gve_device_option_rss_config {
static_assert(sizeof(struct gve_device_option_rss_config) == 8);
+struct gve_device_option_nic_timestamp {
+ __be32 supported_features_mask;
+};
+
+static_assert(sizeof(struct gve_device_option_nic_timestamp) == 4);
+
/* Terminology:
*
* RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA
@@ -192,6 +199,7 @@ enum gve_dev_opt_id {
GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
GVE_DEV_OPT_ID_BUFFER_SIZES = 0xa,
GVE_DEV_OPT_ID_FLOW_STEERING = 0xb,
+ GVE_DEV_OPT_ID_NIC_TIMESTAMP = 0xd,
GVE_DEV_OPT_ID_RSS_CONFIG = 0xe,
};
@@ -206,6 +214,7 @@ enum gve_dev_opt_req_feat_mask {
GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_FLOW_STEERING = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_RSS_CONFIG = 0x0,
+ GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP = 0x0,
};
enum gve_sup_feature_mask {
@@ -214,6 +223,7 @@ enum gve_sup_feature_mask {
GVE_SUP_BUFFER_SIZES_MASK = 1 << 4,
GVE_SUP_FLOW_STEERING_MASK = 1 << 5,
GVE_SUP_RSS_CONFIG_MASK = 1 << 7,
+ GVE_SUP_NIC_TIMESTAMP_MASK = 1 << 8,
};
#define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0
@@ -392,6 +402,21 @@ struct gve_adminq_report_link_speed {
static_assert(sizeof(struct gve_adminq_report_link_speed) == 8);
+struct gve_adminq_report_nic_ts {
+ __be64 nic_ts_report_len;
+ __be64 nic_ts_report_addr;
+};
+
+static_assert(sizeof(struct gve_adminq_report_nic_ts) == 16);
+
+struct gve_nic_ts_report {
+ __be64 nic_timestamp; /* NIC clock in nanoseconds */
+ __be64 reserved1;
+ __be64 reserved2;
+ __be64 reserved3;
+ __be64 reserved4;
+};
+
struct stats {
__be32 stat_name;
__be32 queue_id;
@@ -451,7 +476,7 @@ struct gve_ptype_entry {
};
struct gve_ptype_map {
- struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */
+ struct gve_ptype_entry ptypes[GVE_NUM_PTYPES]; /* PTYPES are always 10 bits. */
};
struct gve_adminq_get_ptype_map {
@@ -585,6 +610,7 @@ union gve_adminq_command {
struct gve_adminq_query_flow_rules query_flow_rules;
struct gve_adminq_configure_rss configure_rss;
struct gve_adminq_query_rss query_rss;
+ struct gve_adminq_report_nic_ts report_nic_ts;
struct gve_adminq_extended_command extended_command;
};
};
@@ -624,6 +650,8 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv);
int gve_adminq_query_flow_rules(struct gve_priv *priv, u16 query_opcode, u32 starting_loc);
int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *rxfh);
int gve_adminq_query_rss_config(struct gve_priv *priv, struct ethtool_rxfh_param *rxfh);
+int gve_adminq_report_nic_ts(struct gve_priv *priv,
+ dma_addr_t nic_ts_report_addr);
struct gve_ptype_lut;
int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv,
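
GVE_ADMINQ_REPORT_NIC_TIMESTAMP asks the device to write its current clock, in nanoseconds, into a driver-supplied struct gve_nic_ts_report at the given DMA address. A hedged sketch of the driver-side consumer is below; apart from the structures and gve_adminq_report_nic_ts() introduced above, the helper name and the plain cached use of last_sync_nic_counter are assumptions (the real consumer presumably lives in the new gve_ptp.c).

/* Sketch: issue the report command and cache the returned counter.
 * Assumes priv->nic_ts_report / nic_ts_report_bus were allocated with
 * dma_alloc_coherent() elsewhere.
 */
static int example_sync_nic_counter(struct gve_priv *priv)
{
	int err;

	err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
	if (err)
		return err;

	/* The device writes big-endian nanoseconds into the shared buffer. */
	priv->last_sync_nic_counter =
		be64_to_cpu(READ_ONCE(priv->nic_ts_report->nic_timestamp));
	return 0;
}
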
diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c
index a71883e1d920..0e2b703c673a 100644
--- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c
@@ -4,6 +4,7 @@
* Copyright (C) 2015-2024 Google, Inc.
*/
+#include <net/xdp_sock_drv.h>
#include "gve.h"
#include "gve_utils.h"
@@ -29,6 +30,10 @@ struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
/* Point buf_state to itself to mark it as allocated */
buf_state->next = buffer_id;
+ /* Clear the buffer pointers */
+ buf_state->page_info.page = NULL;
+ buf_state->xsk_buff = NULL;
+
return buf_state;
}
@@ -246,6 +251,7 @@ struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv,
.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
.order = 0,
.pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt,
+ .nid = priv->numa_node,
.dev = &priv->pdev->dev,
.netdev = priv->dev,
.napi = &priv->ntfy_blocks[ntfy_id].napi,
@@ -254,6 +260,11 @@ struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv,
.offset = xdp ? XDP_PACKET_HEADROOM : 0,
};
+ if (priv->header_split_enabled) {
+ pp.flags |= PP_FLAG_ALLOW_UNREADABLE_NETMEM;
+ pp.queue_idx = rx->q_num;
+ }
+
return page_pool_create(&pp);
}
@@ -285,7 +296,24 @@ int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc)
{
struct gve_rx_buf_state_dqo *buf_state;
- if (rx->dqo.page_pool) {
+ if (rx->xsk_pool) {
+ buf_state = gve_alloc_buf_state(rx);
+ if (unlikely(!buf_state))
+ return -ENOMEM;
+
+ buf_state->xsk_buff = xsk_buff_alloc(rx->xsk_pool);
+ if (unlikely(!buf_state->xsk_buff)) {
+ xsk_set_rx_need_wakeup(rx->xsk_pool);
+ gve_free_buf_state(rx, buf_state);
+ return -ENOMEM;
+ }
+ /* Allocated xsk buffer. Clear wakeup in case it was set. */
+ xsk_clear_rx_need_wakeup(rx->xsk_pool);
+ desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
+ desc->buf_addr =
+ cpu_to_le64(xsk_buff_xdp_get_dma(buf_state->xsk_buff));
+ return 0;
+ } else if (rx->dqo.page_pool) {
buf_state = gve_alloc_buf_state(rx);
if (WARN_ON_ONCE(!buf_state))
return -ENOMEM;
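
With an XSK pool attached, gve_alloc_buffer() above now posts buffers straight from the pool: when xsk_buff_alloc() fails because the fill queue is empty, the driver sets the pool's need-wakeup flag so user space knows it must refill and kick the socket, and clears the flag again once allocation succeeds. For context, here is a user-space sketch of the matching half of that handshake, using libxdp's xsk.h helpers; it is illustrative only, and the zero-length recvfrom() wakeup is the conventional pattern rather than anything defined by this patch.

#include <sys/socket.h>
#include <xdp/xsk.h>	/* libxdp; older setups used bpf/xsk.h from libbpf */

/* Sketch: after topping up the fill ring, kick the driver only if it
 * asked for a wakeup (i.e. it ran out of buffers, as in the code above).
 */
static void refill_and_kick(int xsk_fd, struct xsk_ring_prod *fill_q)
{
	/* ... reserve and submit fill-queue entries here ... */

	if (xsk_ring_prod__needs_wakeup(fill_q))
		/* Any syscall on the AF_XDP socket wakes the driver; a
		 * zero-length non-blocking recvfrom() is the usual choice. */
		recvfrom(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, NULL);
}
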
diff --git a/drivers/net/ethernet/google/gve/gve_desc_dqo.h b/drivers/net/ethernet/google/gve/gve_desc_dqo.h
index f79cd0591110..f7786b03c744 100644
--- a/drivers/net/ethernet/google/gve/gve_desc_dqo.h
+++ b/drivers/net/ethernet/google/gve/gve_desc_dqo.h
@@ -236,7 +236,8 @@ struct gve_rx_compl_desc_dqo {
u8 status_error1;
- __le16 reserved5;
+ u8 reserved5;
+ u8 ts_sub_nsecs_low;
__le16 buf_id; /* Buffer ID which was sent on the buffer queue. */
union {
@@ -247,7 +248,8 @@ struct gve_rx_compl_desc_dqo {
};
__le32 hash;
__le32 reserved6;
- __le64 reserved7;
+ __le32 reserved7;
+ __le32 ts; /* timestamp in nanosecs */
} __packed;
static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32);
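
The reworked completion descriptor carries only the low 32 bits of the NIC's nanosecond clock in 'ts', with a validity flag (GVE_DQO_RX_HWTSTAMP_VALID in gve.h) in ts_sub_nsecs_low. Producing a full 64-bit timestamp therefore needs a recent full-width reference such as last_sync_nic_counter. One common way to do the widening is sketched below; the helper name is made up and the driver's exact arithmetic may differ, but the signed-delta trick is valid whenever the packet is within roughly +/-2.1 seconds of the reference.

/* Sketch: widen the 32-bit descriptor timestamp against the last full
 * 64-bit counter value reported by the device.
 */
static u64 example_expand_rx_ts(struct gve_priv *priv, __le32 desc_ts)
{
	u64 ref = READ_ONCE(priv->last_sync_nic_counter);
	s32 delta = (s32)(le32_to_cpu(desc_ts) - (u32)ref);

	/* The signed delta handles packets slightly older or newer than
	 * 'ref', as long as they are within ~2.1 seconds of it. */
	return ref + delta;
}
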
diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h
index e83773fb891f..5871f773f0c7 100644
--- a/drivers/net/ethernet/google/gve/gve_dqo.h
+++ b/drivers/net/ethernet/google/gve/gve_dqo.h
@@ -36,7 +36,10 @@ netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev);
netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features);
+int gve_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp);
bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean);
+bool gve_xdp_poll_dqo(struct gve_notify_block *block);
+bool gve_xsk_tx_poll_dqo(struct gve_notify_block *block, int budget);
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget);
int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
struct gve_tx_alloc_rings_cfg *cfg);
@@ -60,6 +63,7 @@ int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
struct napi_struct *napi);
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx);
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx);
+void gve_xdp_tx_flush_dqo(struct gve_priv *priv, u32 xdp_qid);
static inline void
gve_tx_put_doorbell_dqo(const struct gve_priv *priv,
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index eae1a7595a69..52500ae8348e 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -67,7 +67,7 @@ static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = {
"tx_xsk_sent[%u]", "tx_xdp_xmit[%u]", "tx_xdp_xmit_errors[%u]"
};
-static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = {
+static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] __nonstring_array = {
"adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts",
"adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt",
"adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt",
@@ -76,7 +76,7 @@ static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = {
"adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt",
"adminq_report_stats_cnt", "adminq_report_link_speed_cnt", "adminq_get_ptype_map_cnt",
"adminq_query_flow_rules", "adminq_cfg_flow_rule", "adminq_cfg_rss_cnt",
- "adminq_query_rss_cnt",
+ "adminq_query_rss_cnt", "adminq_report_nic_timestamp_cnt",
};
static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = {
@@ -113,7 +113,7 @@ static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
i);
for (i = 0; i < ARRAY_SIZE(gve_gstrings_adminq_stats); i++)
- ethtool_puts(&s, gve_gstrings_adminq_stats[i]);
+ ethtool_cpy(&s, gve_gstrings_adminq_stats[i]);
break;
@@ -456,6 +456,7 @@ gve_get_ethtool_stats(struct net_device *netdev,
data[i++] = priv->adminq_cfg_flow_rule_cnt;
data[i++] = priv->adminq_cfg_rss_cnt;
data[i++] = priv->adminq_query_rss_cnt;
+ data[i++] = priv->adminq_report_nic_timestamp_cnt;
}
static void gve_get_channels(struct net_device *netdev,
@@ -528,6 +529,8 @@ static void gve_get_ringparam(struct net_device *netdev,
cmd->rx_pending = priv->rx_desc_cnt;
cmd->tx_pending = priv->tx_desc_cnt;
+ kernel_cmd->rx_buf_len = priv->rx_cfg.packet_buffer_size;
+
if (!gve_header_split_supported(priv))
kernel_cmd->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_UNKNOWN;
else if (priv->header_split_enabled)
@@ -536,34 +539,6 @@ static void gve_get_ringparam(struct net_device *netdev,
kernel_cmd->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_DISABLED;
}
-static int gve_adjust_ring_sizes(struct gve_priv *priv,
- u16 new_tx_desc_cnt,
- u16 new_rx_desc_cnt)
-{
- struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
- struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
- int err;
-
- /* get current queue configuration */
- gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
-
- /* copy over the new ring_size from ethtool */
- tx_alloc_cfg.ring_size = new_tx_desc_cnt;
- rx_alloc_cfg.ring_size = new_rx_desc_cnt;
-
- if (netif_running(priv->dev)) {
- err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
- if (err)
- return err;
- }
-
- /* Set new ring_size for the next up */
- priv->tx_desc_cnt = new_tx_desc_cnt;
- priv->rx_desc_cnt = new_rx_desc_cnt;
-
- return 0;
-}
-
static int gve_validate_req_ring_size(struct gve_priv *priv, u16 new_tx_desc_cnt,
u16 new_rx_desc_cnt)
{
@@ -583,34 +558,68 @@ static int gve_validate_req_ring_size(struct gve_priv *priv, u16 new_tx_desc_cnt
return 0;
}
+static int gve_set_ring_sizes_config(struct gve_priv *priv, u16 new_tx_desc_cnt,
+ u16 new_rx_desc_cnt,
+ struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
+ struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
+{
+ if (new_tx_desc_cnt == priv->tx_desc_cnt &&
+ new_rx_desc_cnt == priv->rx_desc_cnt)
+ return 0;
+
+ if (!priv->modify_ring_size_enabled) {
+ dev_err(&priv->pdev->dev, "Modify ring size is not supported.\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (gve_validate_req_ring_size(priv, new_tx_desc_cnt, new_rx_desc_cnt))
+ return -EINVAL;
+
+ tx_alloc_cfg->ring_size = new_tx_desc_cnt;
+ rx_alloc_cfg->ring_size = new_rx_desc_cnt;
+ return 0;
+}
+
static int gve_set_ringparam(struct net_device *netdev,
struct ethtool_ringparam *cmd,
struct kernel_ethtool_ringparam *kernel_cmd,
struct netlink_ext_ack *extack)
{
+ struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
+ struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
struct gve_priv *priv = netdev_priv(netdev);
- u16 new_tx_cnt, new_rx_cnt;
int err;
- err = gve_set_hsplit_config(priv, kernel_cmd->tcp_data_split);
+ gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
+
+ err = gve_set_rx_buf_len_config(priv, kernel_cmd->rx_buf_len, extack,
+ &rx_alloc_cfg);
if (err)
return err;
- if (cmd->tx_pending == priv->tx_desc_cnt && cmd->rx_pending == priv->rx_desc_cnt)
- return 0;
-
- if (!priv->modify_ring_size_enabled) {
- dev_err(&priv->pdev->dev, "Modify ring size is not supported.\n");
- return -EOPNOTSUPP;
- }
-
- new_tx_cnt = cmd->tx_pending;
- new_rx_cnt = cmd->rx_pending;
+ err = gve_set_hsplit_config(priv, kernel_cmd->tcp_data_split,
+ &rx_alloc_cfg);
+ if (err)
+ return err;
- if (gve_validate_req_ring_size(priv, new_tx_cnt, new_rx_cnt))
- return -EINVAL;
+ err = gve_set_ring_sizes_config(priv, cmd->tx_pending, cmd->rx_pending,
+ &tx_alloc_cfg, &rx_alloc_cfg);
+ if (err)
+ return err;
- return gve_adjust_ring_sizes(priv, new_tx_cnt, new_rx_cnt);
+ if (netif_running(priv->dev)) {
+ err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
+ if (err)
+ return err;
+ } else {
+ /* Set ring params for the next up */
+ priv->rx_cfg.packet_buffer_size =
+ rx_alloc_cfg.packet_buffer_size;
+ priv->header_split_enabled = rx_alloc_cfg.enable_header_split;
+ priv->tx_desc_cnt = tx_alloc_cfg.ring_size;
+ priv->rx_desc_cnt = rx_alloc_cfg.ring_size;
+ }
+ return 0;
}
static int gve_user_reset(struct net_device *netdev, u32 *flags)
@@ -667,7 +676,7 @@ static u32 gve_get_priv_flags(struct net_device *netdev)
struct gve_priv *priv = netdev_priv(netdev);
u32 ret_flags = 0;
- /* Only 1 flag exists currently: report-stats (BIT(O)), so set that flag. */
+ /* Only 1 flag exists currently: report-stats (BIT(0)), so set that flag. */
if (priv->ethtool_flags & BIT(0))
ret_flags |= BIT(0);
return ret_flags;
@@ -798,9 +807,6 @@ static int gve_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
case ETHTOOL_SRXCLSRLDEL:
err = gve_del_flow_rule(priv, cmd);
break;
- case ETHTOOL_SRXFH:
- err = -EOPNOTSUPP;
- break;
default:
err = -EOPNOTSUPP;
break;
@@ -835,9 +841,6 @@ static int gve_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, u
case ETHTOOL_GRXCLSRLALL:
err = gve_get_flow_rule_ids(priv, cmd, (u32 *)rule_locs);
break;
- case ETHTOOL_GRXFH:
- err = -EOPNOTSUPP;
- break;
default:
err = -EOPNOTSUPP;
break;
@@ -928,9 +931,31 @@ static int gve_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rx
return 0;
}
+static int gve_get_ts_info(struct net_device *netdev,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct gve_priv *priv = netdev_priv(netdev);
+
+ ethtool_op_get_ts_info(netdev, info);
+
+ if (priv->nic_timestamp_supported) {
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_HARDWARE |
+ SOF_TIMESTAMPING_RAW_HARDWARE;
+
+ info->rx_filters |= BIT(HWTSTAMP_FILTER_NONE) |
+ BIT(HWTSTAMP_FILTER_ALL);
+
+ if (priv->ptp)
+ info->phc_index = ptp_clock_index(priv->ptp->clock);
+ }
+
+ return 0;
+}
+
const struct ethtool_ops gve_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_USECS,
- .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT,
+ .supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT |
+ ETHTOOL_RING_USE_RX_BUF_LEN,
.get_drvinfo = gve_get_drvinfo,
.get_strings = gve_get_strings,
.get_sset_count = gve_get_sset_count,
@@ -956,5 +981,5 @@ const struct ethtool_ops gve_ethtool_ops = {
.get_priv_flags = gve_get_priv_flags,
.set_priv_flags = gve_set_priv_flags,
.get_link_ksettings = gve_get_link_ksettings,
- .get_ts_info = ethtool_op_get_ts_info,
+ .get_ts_info = gve_get_ts_info,
};
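
Usage note: with ETHTOOL_RING_USE_RX_BUF_LEN advertised, a sufficiently recent ethtool can drive the new knobs through the standard ring-parameter interface, for example "ethtool -G eth0 rx-buf-len 4096" or "ethtool -G eth0 tcp-data-split on" (the interface name is only an example). gve_set_ringparam() now folds buffer length, header split and ring sizes into a single gve_adjust_config() call when the interface is up, and simply records them for the next open when it is down.
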
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index c3791cf23c87..7eb64e1e4d85 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -4,6 +4,7 @@
* Copyright (C) 2015-2024 Google LLC
*/
+#include <linux/bitmap.h>
#include <linux/bpf.h>
#include <linux/cpumask.h>
#include <linux/etherdevice.h>
@@ -268,7 +269,8 @@ static void gve_stats_report_schedule(struct gve_priv *priv)
static void gve_stats_report_timer(struct timer_list *t)
{
- struct gve_priv *priv = from_timer(priv, t, stats_report_timer);
+ struct gve_priv *priv = timer_container_of(priv, t,
+ stats_report_timer);
mod_timer(&priv->stats_report_timer,
round_jiffies(jiffies +
@@ -413,14 +415,24 @@ int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
bool reschedule = false;
int work_done = 0;
- if (block->tx)
- reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
+ if (block->tx) {
+ if (block->tx->q_num < priv->tx_cfg.num_queues)
+ reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
+ else
+ reschedule |= gve_xdp_poll_dqo(block);
+ }
if (!budget)
return 0;
if (block->rx) {
work_done = gve_rx_poll_dqo(block, budget);
+
+ /* Poll XSK TX as part of RX NAPI. Setup re-poll based on if
+ * either datapath has more work to do.
+ */
+ if (priv->xdp_prog)
+ reschedule |= gve_xsk_tx_poll_dqo(block, budget);
reschedule |= work_done == budget;
}
@@ -456,10 +468,19 @@ int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
return work_done;
}
+static const struct cpumask *gve_get_node_mask(struct gve_priv *priv)
+{
+ if (priv->numa_node == NUMA_NO_NODE)
+ return cpu_all_mask;
+ else
+ return cpumask_of_node(priv->numa_node);
+}
+
static int gve_alloc_notify_blocks(struct gve_priv *priv)
{
int num_vecs_requested = priv->num_ntfy_blks + 1;
- unsigned int active_cpus;
+ const struct cpumask *node_mask;
+ unsigned int cur_cpu;
int vecs_enabled;
int i, j;
int err;
@@ -498,8 +519,6 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
}
- /* Half the notification blocks go to TX and half to RX */
- active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus());
/* Setup Management Vector - the last vector */
snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s",
@@ -528,6 +547,8 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
}
/* Setup the other blocks - the first n-1 vectors */
+ node_mask = gve_get_node_mask(priv);
+ cur_cpu = cpumask_first(node_mask);
for (i = 0; i < priv->num_ntfy_blks; i++) {
struct gve_notify_block *block = &priv->ntfy_blocks[i];
int msix_idx = i;
@@ -537,16 +558,24 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)
block->priv = priv;
err = request_irq(priv->msix_vectors[msix_idx].vector,
gve_is_gqi(priv) ? gve_intr : gve_intr_dqo,
- 0, block->name, block);
+ IRQF_NO_AUTOEN, block->name, block);
if (err) {
dev_err(&priv->pdev->dev,
"Failed to receive msix vector %d\n", i);
goto abort_with_some_ntfy_blocks;
}
block->irq = priv->msix_vectors[msix_idx].vector;
- irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
- get_cpu_mask(i % active_cpus));
+ irq_set_affinity_and_hint(block->irq,
+ cpumask_of(cur_cpu));
block->irq_db_index = &priv->irq_db_indices[i].index;
+
+ cur_cpu = cpumask_next(cur_cpu, node_mask);
+ /* Wrap once CPUs in the node have been exhausted, or when
+ * starting RX queue affinities. TX and RX queues of the same
+ * index share affinity.
+ */
+ if (cur_cpu >= nr_cpu_ids || (i + 1) == priv->tx_cfg.max_queues)
+ cur_cpu = cpumask_first(node_mask);
}
return 0;
abort_with_some_ntfy_blocks:
@@ -651,10 +680,16 @@ static int gve_setup_device_resources(struct gve_priv *priv)
}
}
+ err = gve_init_clock(priv);
+ if (err) {
+ dev_err(&priv->pdev->dev, "Failed to init clock");
+ goto abort_with_ptype_lut;
+ }
+
err = gve_init_rss_config(priv, priv->rx_cfg.num_queues);
if (err) {
dev_err(&priv->pdev->dev, "Failed to init RSS config");
- goto abort_with_ptype_lut;
+ goto abort_with_clock;
}
err = gve_adminq_report_stats(priv, priv->stats_report_len,
@@ -666,6 +701,8 @@ static int gve_setup_device_resources(struct gve_priv *priv)
gve_set_device_resources_ok(priv);
return 0;
+abort_with_clock:
+ gve_teardown_clock(priv);
abort_with_ptype_lut:
kvfree(priv->ptype_lut_dqo);
priv->ptype_lut_dqo = NULL;
@@ -721,6 +758,7 @@ static void gve_teardown_device_resources(struct gve_priv *priv)
gve_free_counter_array(priv);
gve_free_notify_blocks(priv);
gve_free_stats_report(priv);
+ gve_teardown_clock(priv);
gve_clear_device_resources_ok(priv);
}
@@ -1029,7 +1067,7 @@ int gve_alloc_page(struct gve_priv *priv, struct device *dev,
struct page **page, dma_addr_t *dma,
enum dma_data_direction dir, gfp_t gfp_flags)
{
- *page = alloc_page(gfp_flags);
+ *page = alloc_pages_node(priv->numa_node, gfp_flags, 0);
if (!*page) {
priv->page_alloc_fail++;
return -ENOMEM;
@@ -1130,18 +1168,84 @@ static int gve_reset_recovery(struct gve_priv *priv, bool was_up);
static void gve_turndown(struct gve_priv *priv);
static void gve_turnup(struct gve_priv *priv);
+static void gve_unreg_xsk_pool(struct gve_priv *priv, u16 qid)
+{
+ struct gve_rx_ring *rx;
+
+ if (!priv->rx)
+ return;
+
+ rx = &priv->rx[qid];
+ rx->xsk_pool = NULL;
+ if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
+ xdp_rxq_info_unreg_mem_model(&rx->xdp_rxq);
+
+ if (!priv->tx)
+ return;
+ priv->tx[gve_xdp_tx_queue_id(priv, qid)].xsk_pool = NULL;
+}
+
+static int gve_reg_xsk_pool(struct gve_priv *priv, struct net_device *dev,
+ struct xsk_buff_pool *pool, u16 qid)
+{
+ struct gve_rx_ring *rx;
+ u16 tx_qid;
+ int err;
+
+ rx = &priv->rx[qid];
+ err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL, pool);
+ if (err) {
+ gve_unreg_xsk_pool(priv, qid);
+ return err;
+ }
+
+ rx->xsk_pool = pool;
+
+ tx_qid = gve_xdp_tx_queue_id(priv, qid);
+ priv->tx[tx_qid].xsk_pool = pool;
+
+ return 0;
+}
+
+static void gve_unreg_xdp_info(struct gve_priv *priv)
+{
+ int i;
+
+ if (!priv->tx_cfg.num_xdp_queues || !priv->rx)
+ return;
+
+ for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ struct gve_rx_ring *rx = &priv->rx[i];
+
+ if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
+ xdp_rxq_info_unreg(&rx->xdp_rxq);
+
+ gve_unreg_xsk_pool(priv, i);
+ }
+}
+
+static struct xsk_buff_pool *gve_get_xsk_pool(struct gve_priv *priv, int qid)
+{
+ if (!test_bit(qid, priv->xsk_pools))
+ return NULL;
+
+ return xsk_get_pool_from_qid(priv->dev, qid);
+}
+
static int gve_reg_xdp_info(struct gve_priv *priv, struct net_device *dev)
{
struct napi_struct *napi;
struct gve_rx_ring *rx;
int err = 0;
- int i, j;
- u32 tx_qid;
+ int i;
if (!priv->tx_cfg.num_xdp_queues)
return 0;
for (i = 0; i < priv->rx_cfg.num_queues; i++) {
+ struct xsk_buff_pool *xsk_pool;
+
rx = &priv->rx[i];
napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
@@ -1149,7 +1253,11 @@ static int gve_reg_xdp_info(struct gve_priv *priv, struct net_device *dev)
napi->napi_id);
if (err)
goto err;
- if (gve_is_qpl(priv))
+
+ xsk_pool = gve_get_xsk_pool(priv, i);
+ if (xsk_pool)
+ err = gve_reg_xsk_pool(priv, dev, xsk_pool, i);
+ else if (gve_is_qpl(priv))
err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
MEM_TYPE_PAGE_SHARED,
NULL);
@@ -1159,60 +1267,14 @@ static int gve_reg_xdp_info(struct gve_priv *priv, struct net_device *dev)
rx->dqo.page_pool);
if (err)
goto err;
- rx->xsk_pool = xsk_get_pool_from_qid(dev, i);
- if (rx->xsk_pool) {
- err = xdp_rxq_info_reg(&rx->xsk_rxq, dev, i,
- napi->napi_id);
- if (err)
- goto err;
- err = xdp_rxq_info_reg_mem_model(&rx->xsk_rxq,
- MEM_TYPE_XSK_BUFF_POOL, NULL);
- if (err)
- goto err;
- xsk_pool_set_rxq_info(rx->xsk_pool,
- &rx->xsk_rxq);
- }
- }
-
- for (i = 0; i < priv->tx_cfg.num_xdp_queues; i++) {
- tx_qid = gve_xdp_tx_queue_id(priv, i);
- priv->tx[tx_qid].xsk_pool = xsk_get_pool_from_qid(dev, i);
}
return 0;
err:
- for (j = i; j >= 0; j--) {
- rx = &priv->rx[j];
- if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
- xdp_rxq_info_unreg(&rx->xdp_rxq);
- if (xdp_rxq_info_is_reg(&rx->xsk_rxq))
- xdp_rxq_info_unreg(&rx->xsk_rxq);
- }
+ gve_unreg_xdp_info(priv);
return err;
}
-static void gve_unreg_xdp_info(struct gve_priv *priv)
-{
- int i, tx_qid;
-
- if (!priv->tx_cfg.num_xdp_queues || !priv->rx || !priv->tx)
- return;
-
- for (i = 0; i < priv->rx_cfg.num_queues; i++) {
- struct gve_rx_ring *rx = &priv->rx[i];
-
- xdp_rxq_info_unreg(&rx->xdp_rxq);
- if (rx->xsk_pool) {
- xdp_rxq_info_unreg(&rx->xsk_rxq);
- rx->xsk_pool = NULL;
- }
- }
-
- for (i = 0; i < priv->tx_cfg.num_xdp_queues; i++) {
- tx_qid = gve_xdp_tx_queue_id(priv, i);
- priv->tx[tx_qid].xsk_pool = NULL;
- }
-}
static void gve_drain_page_cache(struct gve_priv *priv)
{
@@ -1509,14 +1571,24 @@ out:
return err;
}
+static int gve_xdp_xmit(struct net_device *dev, int n,
+ struct xdp_frame **frames, u32 flags)
+{
+ struct gve_priv *priv = netdev_priv(dev);
+
+ if (priv->queue_format == GVE_GQI_QPL_FORMAT)
+ return gve_xdp_xmit_gqi(dev, n, frames, flags);
+ else if (priv->queue_format == GVE_DQO_RDA_FORMAT)
+ return gve_xdp_xmit_dqo(dev, n, frames, flags);
+
+ return -EOPNOTSUPP;
+}
+
static int gve_xsk_pool_enable(struct net_device *dev,
struct xsk_buff_pool *pool,
u16 qid)
{
struct gve_priv *priv = netdev_priv(dev);
- struct napi_struct *napi;
- struct gve_rx_ring *rx;
- int tx_qid;
int err;
if (qid >= priv->rx_cfg.num_queues) {
@@ -1534,34 +1606,31 @@ static int gve_xsk_pool_enable(struct net_device *dev,
if (err)
return err;
+ set_bit(qid, priv->xsk_pools);
+
/* If XDP prog is not installed or interface is down, return. */
if (!priv->xdp_prog || !netif_running(dev))
return 0;
- rx = &priv->rx[qid];
- napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
- err = xdp_rxq_info_reg(&rx->xsk_rxq, dev, qid, napi->napi_id);
- if (err)
- goto err;
-
- err = xdp_rxq_info_reg_mem_model(&rx->xsk_rxq,
- MEM_TYPE_XSK_BUFF_POOL, NULL);
+ err = gve_reg_xsk_pool(priv, dev, pool, qid);
if (err)
- goto err;
-
- xsk_pool_set_rxq_info(pool, &rx->xsk_rxq);
- rx->xsk_pool = pool;
-
- tx_qid = gve_xdp_tx_queue_id(priv, qid);
- priv->tx[tx_qid].xsk_pool = pool;
+ goto err_xsk_pool_dma_mapped;
+ /* Stop and start RDA queues to repost buffers. */
+ if (!gve_is_qpl(priv)) {
+ err = gve_configure_rings_xdp(priv, priv->rx_cfg.num_queues);
+ if (err)
+ goto err_xsk_pool_registered;
+ }
return 0;
-err:
- if (xdp_rxq_info_is_reg(&rx->xsk_rxq))
- xdp_rxq_info_unreg(&rx->xsk_rxq);
+err_xsk_pool_registered:
+ gve_unreg_xsk_pool(priv, qid);
+err_xsk_pool_dma_mapped:
+ clear_bit(qid, priv->xsk_pools);
xsk_pool_dma_unmap(pool,
- DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
return err;
}
@@ -1573,18 +1642,28 @@ static int gve_xsk_pool_disable(struct net_device *dev,
struct napi_struct *napi_tx;
struct xsk_buff_pool *pool;
int tx_qid;
+ int err;
- pool = xsk_get_pool_from_qid(dev, qid);
- if (!pool)
- return -EINVAL;
if (qid >= priv->rx_cfg.num_queues)
return -EINVAL;
- /* If XDP prog is not installed or interface is down, unmap DMA and
- * return.
- */
- if (!priv->xdp_prog || !netif_running(dev))
- goto done;
+ clear_bit(qid, priv->xsk_pools);
+
+ pool = xsk_get_pool_from_qid(dev, qid);
+ if (pool)
+ xsk_pool_dma_unmap(pool,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
+
+ if (!netif_running(dev) || !priv->tx_cfg.num_xdp_queues)
+ return 0;
+
+ /* Stop and start RDA queues to repost buffers. */
+ if (!gve_is_qpl(priv) && priv->xdp_prog) {
+ err = gve_configure_rings_xdp(priv, priv->rx_cfg.num_queues);
+ if (err)
+ return err;
+ }
napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi;
napi_disable(napi_rx); /* make sure current rx poll is done */
@@ -1593,22 +1672,19 @@ static int gve_xsk_pool_disable(struct net_device *dev,
napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi;
napi_disable(napi_tx); /* make sure current tx poll is done */
- priv->rx[qid].xsk_pool = NULL;
- xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq);
- priv->tx[tx_qid].xsk_pool = NULL;
+ gve_unreg_xsk_pool(priv, qid);
smp_mb(); /* Make sure it is visible to the workers on datapath */
napi_enable(napi_rx);
- if (gve_rx_work_pending(&priv->rx[qid]))
- napi_schedule(napi_rx);
-
napi_enable(napi_tx);
- if (gve_tx_clean_pending(priv, &priv->tx[tx_qid]))
- napi_schedule(napi_tx);
+ if (gve_is_gqi(priv)) {
+ if (gve_rx_work_pending(&priv->rx[qid]))
+ napi_schedule(napi_rx);
+
+ if (gve_tx_clean_pending(priv, &priv->tx[tx_qid]))
+ napi_schedule(napi_tx);
+ }
-done:
- xsk_pool_dma_unmap(pool,
- DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
return 0;
}
@@ -1634,19 +1710,28 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
return 0;
}
-static int verify_xdp_configuration(struct net_device *dev)
+static int gve_verify_xdp_configuration(struct net_device *dev,
+ struct netlink_ext_ack *extack)
{
struct gve_priv *priv = netdev_priv(dev);
u16 max_xdp_mtu;
if (dev->features & NETIF_F_LRO) {
- netdev_warn(dev, "XDP is not supported when LRO is on.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "XDP is not supported when LRO is on.");
+ return -EOPNOTSUPP;
+ }
+
+ if (priv->header_split_enabled) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "XDP is not supported when header-data split is enabled.");
return -EOPNOTSUPP;
}
- if (priv->queue_format != GVE_GQI_QPL_FORMAT) {
- netdev_warn(dev, "XDP is not supported in mode %d.\n",
- priv->queue_format);
+ if (priv->rx_cfg.packet_buffer_size != SZ_2K) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "XDP is not supported for Rx buf len %d, only %d supported.",
+ priv->rx_cfg.packet_buffer_size, SZ_2K);
return -EOPNOTSUPP;
}
@@ -1655,17 +1740,20 @@ static int verify_xdp_configuration(struct net_device *dev)
max_xdp_mtu -= GVE_RX_PAD;
if (dev->mtu > max_xdp_mtu) {
- netdev_warn(dev, "XDP is not supported for mtu %d.\n",
- dev->mtu);
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "XDP is not supported for mtu %d.",
+ dev->mtu);
return -EOPNOTSUPP;
}
if (priv->rx_cfg.num_queues != priv->tx_cfg.num_queues ||
(2 * priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)) {
- netdev_warn(dev, "XDP load failed: The number of configured RX queues %d should be equal to the number of configured TX queues %d and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues %d",
- priv->rx_cfg.num_queues,
- priv->tx_cfg.num_queues,
+ netdev_warn(dev,
+ "XDP load failed: The number of configured RX queues %d should be equal to the number of configured TX queues %d and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues %d.",
+ priv->rx_cfg.num_queues, priv->tx_cfg.num_queues,
priv->tx_cfg.max_queues);
+ NL_SET_ERR_MSG_MOD(extack,
+ "XDP load failed: The number of configured RX queues should be equal to the number of configured TX queues and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues");
return -EINVAL;
}
return 0;
@@ -1676,7 +1764,7 @@ static int gve_xdp(struct net_device *dev, struct netdev_bpf *xdp)
struct gve_priv *priv = netdev_priv(dev);
int err;
- err = verify_xdp_configuration(dev);
+ err = gve_verify_xdp_configuration(dev, xdp->extack);
if (err)
return err;
switch (xdp->command) {
@@ -1726,7 +1814,7 @@ int gve_adjust_config(struct gve_priv *priv,
{
int err;
- /* Allocate resources for the new confiugration */
+ /* Allocate resources for the new configuration */
err = gve_queues_mem_alloc(priv, tx_alloc_cfg, rx_alloc_cfg);
if (err) {
netif_err(priv, drv, priv->dev,
@@ -1830,7 +1918,7 @@ static void gve_turndown(struct gve_priv *priv)
/* Stop tx queues */
netif_tx_disable(priv->dev);
- xdp_features_clear_redirect_target(priv->dev);
+ xdp_features_clear_redirect_target_locked(priv->dev);
gve_clear_napi_enabled(priv);
gve_clear_report_stats(priv);
@@ -1902,7 +1990,7 @@ static void gve_turnup(struct gve_priv *priv)
}
if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv))
- xdp_features_set_redirect_target(priv->dev, false);
+ xdp_features_set_redirect_target_locked(priv->dev, false);
gve_set_napi_enabled(priv);
}
@@ -1916,72 +2004,104 @@ static void gve_turnup_and_check_status(struct gve_priv *priv)
gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
}
-static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
+static struct gve_notify_block *gve_get_tx_notify_block(struct gve_priv *priv,
+ unsigned int txqueue)
{
- struct gve_notify_block *block;
- struct gve_tx_ring *tx = NULL;
- struct gve_priv *priv;
- u32 last_nic_done;
- u32 current_time;
u32 ntfy_idx;
- netdev_info(dev, "Timeout on tx queue, %d", txqueue);
- priv = netdev_priv(dev);
if (txqueue > priv->tx_cfg.num_queues)
- goto reset;
+ return NULL;
ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue);
if (ntfy_idx >= priv->num_ntfy_blks)
- goto reset;
+ return NULL;
+
+ return &priv->ntfy_blocks[ntfy_idx];
+}
+
+static bool gve_tx_timeout_try_q_kick(struct gve_priv *priv,
+ unsigned int txqueue)
+{
+ struct gve_notify_block *block;
+ u32 current_time;
+
+ block = gve_get_tx_notify_block(priv, txqueue);
- block = &priv->ntfy_blocks[ntfy_idx];
- tx = block->tx;
+ if (!block)
+ return false;
current_time = jiffies_to_msecs(jiffies);
- if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
- goto reset;
+ if (block->tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
+ return false;
- /* Check to see if there are missed completions, which will allow us to
- * kick the queue.
- */
- last_nic_done = gve_tx_load_event_counter(priv, tx);
- if (last_nic_done - tx->done) {
- netdev_info(dev, "Kicking queue %d", txqueue);
- iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
- napi_schedule(&block->napi);
- tx->last_kick_msec = current_time;
- goto out;
- } // Else reset.
+ netdev_info(priv->dev, "Kicking queue %d", txqueue);
+ napi_schedule(&block->napi);
+ block->tx->last_kick_msec = current_time;
+ return true;
+}
-reset:
- gve_schedule_reset(priv);
+static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
+{
+ struct gve_notify_block *block;
+ struct gve_priv *priv;
-out:
- if (tx)
- tx->queue_timeout++;
+ netdev_info(dev, "Timeout on tx queue, %d", txqueue);
+ priv = netdev_priv(dev);
+
+ if (!gve_tx_timeout_try_q_kick(priv, txqueue))
+ gve_schedule_reset(priv);
+
+ block = gve_get_tx_notify_block(priv, txqueue);
+ if (block)
+ block->tx->queue_timeout++;
priv->tx_timeo_cnt++;
}
-u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hsplit)
+/* Header split is only supported on DQ RDA queue format. If XDP is enabled,
+ * header split is not allowed.
+ */
+bool gve_header_split_supported(const struct gve_priv *priv)
{
- if (enable_hsplit && priv->max_rx_buffer_size >= GVE_MAX_RX_BUFFER_SIZE)
- return GVE_MAX_RX_BUFFER_SIZE;
- else
- return GVE_DEFAULT_RX_BUFFER_SIZE;
+ return priv->header_buf_size &&
+ priv->queue_format == GVE_DQO_RDA_FORMAT && !priv->xdp_prog;
}
-/* header-split is not supported on non-DQO_RDA yet even if device advertises it */
-bool gve_header_split_supported(const struct gve_priv *priv)
+int gve_set_rx_buf_len_config(struct gve_priv *priv, u32 rx_buf_len,
+ struct netlink_ext_ack *extack,
+ struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{
- return priv->header_buf_size && priv->queue_format == GVE_DQO_RDA_FORMAT;
+ u32 old_rx_buf_len = rx_alloc_cfg->packet_buffer_size;
+
+ if (rx_buf_len == old_rx_buf_len)
+ return 0;
+
+ /* device options may not always contain support for 4K buffers */
+ if (!gve_is_dqo(priv) || priv->max_rx_buffer_size < SZ_4K) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Modifying Rx buf len is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (priv->xdp_prog && rx_buf_len != SZ_2K) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Rx buf len can only be 2048 when XDP is on");
+ return -EINVAL;
+ }
+
+ if (rx_buf_len != SZ_2K && rx_buf_len != SZ_4K) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Rx buf len can only be 2048 or 4096");
+ return -EINVAL;
+ }
+ rx_alloc_cfg->packet_buffer_size = rx_buf_len;
+
+ return 0;
}
-int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split)
+int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split,
+ struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{
- struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
- struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
bool enable_hdr_split;
- int err = 0;
if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN)
return 0;
@@ -1999,14 +2119,9 @@ int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split)
if (enable_hdr_split == priv->header_split_enabled)
return 0;
- gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
-
- rx_alloc_cfg.enable_header_split = enable_hdr_split;
- rx_alloc_cfg.packet_buffer_size = gve_get_pkt_buf_size(priv, enable_hdr_split);
+ rx_alloc_cfg->enable_header_split = enable_hdr_split;
- if (netif_running(priv->dev))
- err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
- return err;
+ return 0;
}
static int gve_set_features(struct net_device *netdev,
@@ -2022,6 +2137,12 @@ static int gve_set_features(struct net_device *netdev,
if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) {
netdev->features ^= NETIF_F_LRO;
+ if (priv->xdp_prog && (netdev->features & NETIF_F_LRO)) {
+ netdev_warn(netdev,
+ "XDP is not supported when LRO is on.\n");
+ err = -EOPNOTSUPP;
+ goto revert_features;
+ }
if (netif_running(netdev)) {
err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
if (err)
@@ -2041,6 +2162,42 @@ revert_features:
return err;
}
+static int gve_get_ts_config(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_config)
+{
+ struct gve_priv *priv = netdev_priv(dev);
+
+ *kernel_config = priv->ts_config;
+ return 0;
+}
+
+static int gve_set_ts_config(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_config,
+ struct netlink_ext_ack *extack)
+{
+ struct gve_priv *priv = netdev_priv(dev);
+
+ if (kernel_config->tx_type != HWTSTAMP_TX_OFF) {
+ NL_SET_ERR_MSG_MOD(extack, "TX timestamping is not supported");
+ return -ERANGE;
+ }
+
+ if (kernel_config->rx_filter != HWTSTAMP_FILTER_NONE) {
+ if (!priv->nic_ts_report) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "RX timestamping is not supported");
+ kernel_config->rx_filter = HWTSTAMP_FILTER_NONE;
+ return -EOPNOTSUPP;
+ }
+
+ kernel_config->rx_filter = HWTSTAMP_FILTER_ALL;
+ }
+
+ priv->ts_config.rx_filter = kernel_config->rx_filter;
+
+ return 0;
+}
+
static const struct net_device_ops gve_netdev_ops = {
.ndo_start_xmit = gve_start_xmit,
.ndo_features_check = gve_features_check,
@@ -2052,6 +2209,8 @@ static const struct net_device_ops gve_netdev_ops = {
.ndo_bpf = gve_xdp,
.ndo_xdp_xmit = gve_xdp_xmit,
.ndo_xsk_wakeup = gve_xsk_wakeup,
+ .ndo_hwtstamp_get = gve_get_ts_config,
+ .ndo_hwtstamp_set = gve_set_ts_config,
};
static void gve_handle_status(struct gve_priv *priv, u32 status)
@@ -2153,7 +2312,7 @@ void gve_handle_report_stats(struct gve_priv *priv)
};
stats[stats_idx++] = (struct stats) {
.stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
- .value = cpu_to_be64(priv->rx[0].fill_cnt),
+ .value = cpu_to_be64(priv->rx[idx].fill_cnt),
.queue_id = cpu_to_be32(idx),
};
}
@@ -2181,13 +2340,21 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv)
xdp_features = NETDEV_XDP_ACT_BASIC;
xdp_features |= NETDEV_XDP_ACT_REDIRECT;
xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
+ } else if (priv->queue_format == GVE_DQO_RDA_FORMAT) {
+ xdp_features = NETDEV_XDP_ACT_BASIC;
+ xdp_features |= NETDEV_XDP_ACT_REDIRECT;
+ xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
} else {
xdp_features = 0;
}
- xdp_set_features_flag(priv->dev, xdp_features);
+ xdp_set_features_flag_locked(priv->dev, xdp_features);
}
+static const struct xdp_metadata_ops gve_xdp_metadata_ops = {
+ .xmo_rx_timestamp = gve_xdp_rx_timestamp,
+};
+
static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
{
int num_ntfy;
@@ -2235,7 +2402,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
goto err;
}
- /* Big TCP is only supported on DQ*/
+ /* Big TCP is only supported on DQO */
if (!gve_is_gqi(priv))
netif_set_tso_max_size(priv->dev, GVE_DQO_TX_MAX);
@@ -2245,6 +2412,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
*/
priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
priv->mgmt_msix_idx = priv->num_ntfy_blks;
+ priv->numa_node = dev_to_node(&priv->pdev->dev);
priv->tx_cfg.max_queues =
min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
@@ -2271,11 +2439,29 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO;
}
+ priv->ts_config.tx_type = HWTSTAMP_TX_OFF;
+ priv->ts_config.rx_filter = HWTSTAMP_FILTER_NONE;
+
setup_device:
+ priv->xsk_pools = bitmap_zalloc(priv->rx_cfg.max_queues, GFP_KERNEL);
+ if (!priv->xsk_pools) {
+ err = -ENOMEM;
+ goto err;
+ }
+
gve_set_netdev_xdp_features(priv);
+ if (!gve_is_gqi(priv))
+ priv->dev->xdp_metadata_ops = &gve_xdp_metadata_ops;
+
err = gve_setup_device_resources(priv);
- if (!err)
- return 0;
+ if (err)
+ goto err_free_xsk_bitmap;
+
+ return 0;
+
+err_free_xsk_bitmap:
+ bitmap_free(priv->xsk_pools);
+ priv->xsk_pools = NULL;
err:
gve_adminq_free(&priv->pdev->dev, priv);
return err;
@@ -2285,6 +2471,8 @@ static void gve_teardown_priv_resources(struct gve_priv *priv)
{
gve_teardown_device_resources(priv);
gve_adminq_free(&priv->pdev->dev, priv);
+ bitmap_free(priv->xsk_pools);
+ priv->xsk_pools = NULL;
}
static void gve_trigger_reset(struct gve_priv *priv)
@@ -2659,6 +2847,9 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto abort_with_wq;
+ if (!gve_is_gqi(priv) && !gve_is_qpl(priv))
+ dev->netmem_tx = true;
+
err = register_netdev(dev);
if (err)
goto abort_with_gve_init;
@@ -2715,6 +2906,8 @@ static void gve_shutdown(struct pci_dev *pdev)
struct gve_priv *priv = netdev_priv(netdev);
bool was_up = netif_running(priv->dev);
+ netif_device_detach(netdev);
+
rtnl_lock();
netdev_lock(netdev);
if (was_up && gve_close(priv->dev)) {
diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
new file mode 100644
index 000000000000..073677d82ee8
--- /dev/null
+++ b/drivers/net/ethernet/google/gve/gve_ptp.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Google virtual Ethernet (gve) driver
+ *
+ * Copyright (C) 2025 Google LLC
+ */
+
+#include "gve.h"
+#include "gve_adminq.h"
+
+/* Interval at which to schedule a NIC timestamp calibration: 250 ms. */
+#define GVE_NIC_TS_SYNC_INTERVAL_MS 250
+
+/* Read the NIC timestamp from hardware via the admin queue. */
+int gve_clock_nic_ts_read(struct gve_priv *priv)
+{
+ u64 nic_raw;
+ int err;
+
+ err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
+ if (err)
+ return err;
+
+ nic_raw = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+ WRITE_ONCE(priv->last_sync_nic_counter, nic_raw);
+
+ return 0;
+}
+
+static int gve_ptp_gettimex64(struct ptp_clock_info *info,
+ struct timespec64 *ts,
+ struct ptp_system_timestamp *sts)
+{
+ return -EOPNOTSUPP;
+}
+
+static int gve_ptp_settime64(struct ptp_clock_info *info,
+ const struct timespec64 *ts)
+{
+ return -EOPNOTSUPP;
+}
+
+static long gve_ptp_do_aux_work(struct ptp_clock_info *info)
+{
+ const struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
+ struct gve_priv *priv = ptp->priv;
+ int err;
+
+ if (gve_get_reset_in_progress(priv) || !gve_get_admin_queue_ok(priv))
+ goto out;
+
+ err = gve_clock_nic_ts_read(priv);
+ if (err && net_ratelimit())
+ dev_err(&priv->pdev->dev,
+ "%s read err %d\n", __func__, err);
+
+out:
+ return msecs_to_jiffies(GVE_NIC_TS_SYNC_INTERVAL_MS);
+}
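
A note on the rescheduling above: for ptp_clock_info::do_aux_work, the return value is the delay in jiffies until the next invocation, and a negative value stops further scheduling. A minimal hedged sketch of that contract (example_do_aux_work() is illustrative, not gve code):

#include <linux/jiffies.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/types.h>

static long example_do_aux_work(struct ptp_clock_info *info)
{
	bool keep_running = true;	/* e.g. cleared on teardown */

	/* ... periodic work, such as re-reading a device clock ... */

	return keep_running ? msecs_to_jiffies(250) : -1;
}
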
+
+static const struct ptp_clock_info gve_ptp_caps = {
+ .owner = THIS_MODULE,
+ .name = "gve clock",
+ .gettimex64 = gve_ptp_gettimex64,
+ .settime64 = gve_ptp_settime64,
+ .do_aux_work = gve_ptp_do_aux_work,
+};
+
+static int gve_ptp_init(struct gve_priv *priv)
+{
+ struct gve_ptp *ptp;
+ int err;
+
+ if (!priv->nic_timestamp_supported) {
+ dev_dbg(&priv->pdev->dev, "Device does not support PTP\n");
+ return -EOPNOTSUPP;
+ }
+
+ priv->ptp = kzalloc(sizeof(*priv->ptp), GFP_KERNEL);
+ if (!priv->ptp)
+ return -ENOMEM;
+
+ ptp = priv->ptp;
+ ptp->info = gve_ptp_caps;
+ ptp->clock = ptp_clock_register(&ptp->info, &priv->pdev->dev);
+
+ if (IS_ERR(ptp->clock)) {
+ dev_err(&priv->pdev->dev, "PTP clock registration failed\n");
+ err = PTR_ERR(ptp->clock);
+ goto free_ptp;
+ }
+
+ ptp->priv = priv;
+ return 0;
+
+free_ptp:
+ kfree(ptp);
+ priv->ptp = NULL;
+ return err;
+}
+
+static void gve_ptp_release(struct gve_priv *priv)
+{
+ struct gve_ptp *ptp = priv->ptp;
+
+ if (!ptp)
+ return;
+
+ if (ptp->clock)
+ ptp_clock_unregister(ptp->clock);
+
+ kfree(ptp);
+ priv->ptp = NULL;
+}
+
+int gve_init_clock(struct gve_priv *priv)
+{
+ int err;
+
+ if (!priv->nic_timestamp_supported)
+ return 0;
+
+ err = gve_ptp_init(priv);
+ if (err)
+ return err;
+
+ priv->nic_ts_report =
+ dma_alloc_coherent(&priv->pdev->dev,
+ sizeof(struct gve_nic_ts_report),
+ &priv->nic_ts_report_bus,
+ GFP_KERNEL);
+ if (!priv->nic_ts_report) {
+ dev_err(&priv->pdev->dev, "%s dma alloc error\n", __func__);
+ err = -ENOMEM;
+ goto release_ptp;
+ }
+ err = gve_clock_nic_ts_read(priv);
+ if (err) {
+ dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
+ goto release_nic_ts_report;
+ }
+ ptp_schedule_worker(priv->ptp->clock,
+ msecs_to_jiffies(GVE_NIC_TS_SYNC_INTERVAL_MS));
+
+ return 0;
+
+release_nic_ts_report:
+ dma_free_coherent(&priv->pdev->dev,
+ sizeof(struct gve_nic_ts_report),
+ priv->nic_ts_report, priv->nic_ts_report_bus);
+ priv->nic_ts_report = NULL;
+release_ptp:
+ gve_ptp_release(priv);
+ return err;
+}
+
+void gve_teardown_clock(struct gve_priv *priv)
+{
+ gve_ptp_release(priv);
+
+ if (priv->nic_ts_report) {
+ dma_free_coherent(&priv->pdev->dev,
+ sizeof(struct gve_nic_ts_report),
+ priv->nic_ts_report, priv->nic_ts_report_bus);
+ priv->nic_ts_report = NULL;
+ }
+}
diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index 90e875c1832f..ec424d2f4f57 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -192,8 +192,8 @@ static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
*/
slots = rx->mask + 1;
- rx->data.page_info = kvzalloc(slots *
- sizeof(*rx->data.page_info), GFP_KERNEL);
+ rx->data.page_info = kvcalloc_node(slots, sizeof(*rx->data.page_info),
+ GFP_KERNEL, priv->numa_node);
if (!rx->data.page_info)
return -ENOMEM;
@@ -216,7 +216,8 @@ static int gve_rx_prefill_pages(struct gve_rx_ring *rx,
if (!rx->data.raw_addressing) {
for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
- struct page *page = alloc_page(GFP_KERNEL);
+ struct page *page = alloc_pages_node(priv->numa_node,
+ GFP_KERNEL, 0);
if (!page) {
err = -ENOMEM;
@@ -303,10 +304,9 @@ int gve_rx_alloc_ring_gqi(struct gve_priv *priv,
rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
rx->qpl_copy_pool_head = 0;
- rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
- sizeof(rx->qpl_copy_pool[0]),
- GFP_KERNEL);
-
+ rx->qpl_copy_pool = kvcalloc_node(rx->qpl_copy_pool_mask + 1,
+ sizeof(rx->qpl_copy_pool[0]),
+ GFP_KERNEL, priv->numa_node);
if (!rx->qpl_copy_pool) {
err = -ENOMEM;
goto abort_with_slots;
diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
index dcb0545baa50..f1bd8f5d5732 100644
--- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
@@ -8,6 +8,7 @@
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
+#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
@@ -15,6 +16,7 @@
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
+#include <net/xdp_sock_drv.h>
static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
@@ -148,6 +150,10 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
gve_free_to_page_pool(rx, bs, false);
else
gve_free_qpl_page_dqo(bs);
+ if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
+ xsk_buff_free(bs->xsk_buff);
+ bs->xsk_buff = NULL;
+ }
}
if (rx->dqo.qpl) {
@@ -234,11 +240,16 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
rx->rx_headroom = 0;
}
+	/* struct gve_xdp_buff is overlaid on struct xdp_buff_xsk and uses
+	 * its 24-byte cb field to store gve-specific data.
+ */
+ XSK_CHECK_PRIV_TYPE(struct gve_xdp_buff);
+
rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
- rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
- sizeof(rx->dqo.buf_states[0]),
- GFP_KERNEL);
+ rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
+ sizeof(rx->dqo.buf_states[0]),
+ GFP_KERNEL, priv->numa_node);
if (!rx->dqo.buf_states)
return -ENOMEM;
@@ -437,6 +448,53 @@ static void gve_rx_skb_hash(struct sk_buff *skb,
skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}
+/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
+ * skb.
+ *
+ * This algorithm works by using the passed hardware timestamp to generate a
+ * diff relative to the last read of the nic clock. This diff can be positive or
+ * negative, as it is possible that we have read the clock more recently than
+ * the hardware has received this packet. To detect this, we use the high bit of
+ * the diff, and assume that the read is more recent if the high bit is set. In
+ * this case we invert the process.
+ *
+ * Note that this means if the time delta between packet reception and the last
+ * clock read is greater than ~2 seconds, this will provide invalid results.
+ */
+static ktime_t gve_rx_get_hwtstamp(struct gve_priv *gve, u32 hwts)
+{
+ u64 last_read = READ_ONCE(gve->last_sync_nic_counter);
+ u32 low = (u32)last_read;
+ s32 diff = hwts - low;
+
+ return ns_to_ktime(last_read + diff);
+}
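
A worked example of the expansion above, as standalone C with made-up values (expand_hwts() mirrors the arithmetic of gve_rx_get_hwtstamp() but is not driver code):

#include <stdint.h>
#include <stdio.h>

static uint64_t expand_hwts(uint64_t last_sync, uint32_t hwts)
{
	uint32_t low = (uint32_t)last_sync;
	int32_t diff = (int32_t)(hwts - low);	/* signed: may be negative */

	return last_sync + diff;
}

int main(void)
{
	/* The packet arrived 0x20 ns before the last clock sync, so the
	 * low 32 bits of the hardware stamp appear "in the future" and
	 * the signed diff comes out negative.
	 */
	uint64_t last_sync = 0x500000010ULL;	/* low 32 bits = 0x10 */
	uint32_t hwts = 0xfffffff0;

	printf("%#llx\n", (unsigned long long)expand_hwts(last_sync, hwts));
	/* Prints 0x4fffffff0, i.e. 0x20 ns before the sync point. */
	return 0;
}
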
+
+static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
+ const struct gve_rx_compl_desc_dqo *desc)
+{
+ struct sk_buff *skb = rx->ctx.skb_head;
+
+ if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID)
+ skb_hwtstamps(skb)->hwtstamp =
+ gve_rx_get_hwtstamp(rx->gve, le32_to_cpu(desc->ts));
+}
+
+int gve_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
+{
+ const struct gve_xdp_buff *ctx = (void *)_ctx;
+
+ if (!ctx->gve->nic_ts_report)
+ return -ENODATA;
+
+ if (!(ctx->compl_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID))
+ return -ENODATA;
+
+ *timestamp = gve_rx_get_hwtstamp(ctx->gve,
+ le32_to_cpu(ctx->compl_desc->ts));
+ return 0;
+}
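
The xmo_rx_timestamp hook above is what backs the XDP Rx-timestamp metadata kfunc for this driver. A hedged sketch of a BPF program consuming it (program and section names are illustrative; requires a kernel and libbpf with XDP metadata kfunc support):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
					 __u64 *timestamp) __ksym;

SEC("xdp")
int rx_hwts_prog(struct xdp_md *ctx)
{
	__u64 ts;

	/* Returns 0 on success, -ENODATA when no valid stamp is present. */
	if (!bpf_xdp_metadata_rx_timestamp(ctx, &ts))
		bpf_printk("rx hw ts: %llu", ts);

	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";
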
+
static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
if (!rx->ctx.skb_head)
@@ -464,7 +522,7 @@ static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
struct gve_rx_buf_state_dqo *buf_state,
u16 buf_len)
{
- struct page *page = alloc_page(GFP_ATOMIC);
+ struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
int num_frags;
if (!page)
@@ -547,27 +605,171 @@ static int gve_rx_append_frags(struct napi_struct *napi,
return 0;
}
+static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
+ struct xdp_buff *xdp)
+{
+ struct gve_tx_ring *tx;
+ struct xdp_frame *xdpf;
+ u32 tx_qid;
+ int err;
+
+ xdpf = xdp_convert_buff_to_frame(xdp);
+ if (unlikely(!xdpf)) {
+ if (rx->xsk_pool)
+ xsk_buff_free(xdp);
+ return -ENOSPC;
+ }
+
+ tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
+ tx = &priv->tx[tx_qid];
+ spin_lock(&tx->dqo_tx.xdp_lock);
+ err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
+ spin_unlock(&tx->dqo_tx.xdp_lock);
+
+ return err;
+}
+
+static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
+ struct xdp_buff *xdp, struct bpf_prog *xprog,
+ int xdp_act)
+{
+ switch (xdp_act) {
+ case XDP_ABORTED:
+ case XDP_DROP:
+ default:
+ xsk_buff_free(xdp);
+ break;
+ case XDP_TX:
+ if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
+ goto err;
+ break;
+ case XDP_REDIRECT:
+ if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
+ goto err;
+ break;
+ }
+
+ u64_stats_update_begin(&rx->statss);
+ if ((u32)xdp_act < GVE_XDP_ACTIONS)
+ rx->xdp_actions[xdp_act]++;
+ u64_stats_update_end(&rx->statss);
+ return;
+
+err:
+ u64_stats_update_begin(&rx->statss);
+ if (xdp_act == XDP_TX)
+ rx->xdp_tx_errors++;
+ if (xdp_act == XDP_REDIRECT)
+ rx->xdp_redirect_errors++;
+ u64_stats_update_end(&rx->statss);
+}
+
static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
struct xdp_buff *xdp, struct bpf_prog *xprog,
int xdp_act,
struct gve_rx_buf_state_dqo *buf_state)
{
- u64_stats_update_begin(&rx->statss);
+ int err;
switch (xdp_act) {
case XDP_ABORTED:
case XDP_DROP:
default:
- rx->xdp_actions[xdp_act]++;
+ gve_free_buffer(rx, buf_state);
break;
case XDP_TX:
- rx->xdp_tx_errors++;
+ err = gve_xdp_tx_dqo(priv, rx, xdp);
+ if (unlikely(err))
+ goto err;
+ gve_reuse_buffer(rx, buf_state);
break;
case XDP_REDIRECT:
- rx->xdp_redirect_errors++;
+ err = xdp_do_redirect(priv->dev, xdp, xprog);
+ if (unlikely(err))
+ goto err;
+ gve_reuse_buffer(rx, buf_state);
break;
}
+ u64_stats_update_begin(&rx->statss);
+ if ((u32)xdp_act < GVE_XDP_ACTIONS)
+ rx->xdp_actions[xdp_act]++;
+ u64_stats_update_end(&rx->statss);
+ return;
+err:
+ u64_stats_update_begin(&rx->statss);
+ if (xdp_act == XDP_TX)
+ rx->xdp_tx_errors++;
+ else if (xdp_act == XDP_REDIRECT)
+ rx->xdp_redirect_errors++;
u64_stats_update_end(&rx->statss);
gve_free_buffer(rx, buf_state);
+ return;
+}
+
+static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
+ const struct gve_rx_compl_desc_dqo *compl_desc,
+ struct gve_rx_buf_state_dqo *buf_state,
+ struct bpf_prog *xprog)
+{
+ struct xdp_buff *xdp = buf_state->xsk_buff;
+ int buf_len = compl_desc->packet_len;
+ struct gve_priv *priv = rx->gve;
+ struct gve_xdp_buff *gve_xdp;
+ int xdp_act;
+
+ xdp->data_end = xdp->data + buf_len;
+ xsk_buff_dma_sync_for_cpu(xdp);
+
+ gve_xdp = (void *)xdp;
+ gve_xdp->gve = priv;
+ gve_xdp->compl_desc = compl_desc;
+
+ if (xprog) {
+ xdp_act = bpf_prog_run_xdp(xprog, xdp);
+ buf_len = xdp->data_end - xdp->data;
+ if (xdp_act != XDP_PASS) {
+ gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
+ gve_free_buf_state(rx, buf_state);
+ return 0;
+ }
+ }
+
+ /* Copy the data to skb */
+ rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
+ xdp->data, buf_len);
+ if (unlikely(!rx->ctx.skb_head)) {
+ xsk_buff_free(xdp);
+ gve_free_buf_state(rx, buf_state);
+ return -ENOMEM;
+ }
+ rx->ctx.skb_tail = rx->ctx.skb_head;
+
+ /* Free XSK buffer and Buffer state */
+ xsk_buff_free(xdp);
+ gve_free_buf_state(rx, buf_state);
+
+ /* Update Stats */
+ u64_stats_update_begin(&rx->statss);
+ rx->xdp_actions[XDP_PASS]++;
+ u64_stats_update_end(&rx->statss);
+ return 0;
+}
+
+static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
+ struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
+{
+ struct gve_rx_slot_page_info *page_info = &buf_state->page_info;
+
+ if (rx->dqo.page_pool) {
+ page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
+ page_info->netmem,
+ page_info->page_offset,
+ buf_len);
+ } else {
+ dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
+ page_info->page_offset +
+ page_info->pad,
+ buf_len, DMA_FROM_DEVICE);
+ }
}
/* Returns 0 if descriptor is completed successfully.
@@ -608,6 +810,10 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
buf_len = compl_desc->packet_len;
hdr_len = compl_desc->header_len;
+ xprog = READ_ONCE(priv->xdp_prog);
+ if (buf_state->xsk_buff)
+ return gve_rx_xsk_dqo(napi, rx, compl_desc, buf_state, xprog);
+
/* Page might have not been used for awhile and was likely last written
* by a different thread.
*/
@@ -641,13 +847,18 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
rx->rx_hsplit_unsplit_pkt += unsplit;
rx->rx_hsplit_bytes += hdr_len;
u64_stats_update_end(&rx->statss);
+ } else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
+ netmem_is_net_iov(buf_state->page_info.netmem)) {
+		/* When header split is disabled, the header lands in the packet
+		 * buffer. If the packet buffer is a net_iov, it cannot easily
+		 * be mapped into kernel space to access the header needed to
+		 * process the packet.
+		 */
+ goto error;
}
/* Sync the portion of dma buffer for CPU to read. */
- dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
- buf_state->page_info.page_offset +
- buf_state->page_info.pad,
- buf_len, DMA_FROM_DEVICE);
+ gve_dma_sync(priv, rx, buf_state, buf_len);
/* Append to current skb if one exists. */
if (rx->ctx.skb_head) {
@@ -658,25 +869,27 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
return 0;
}
- xprog = READ_ONCE(priv->xdp_prog);
if (xprog) {
- struct xdp_buff xdp;
+ struct gve_xdp_buff gve_xdp;
void *old_data;
int xdp_act;
- xdp_init_buff(&xdp, buf_state->page_info.buf_size,
+ xdp_init_buff(&gve_xdp.xdp, buf_state->page_info.buf_size,
&rx->xdp_rxq);
- xdp_prepare_buff(&xdp,
+ xdp_prepare_buff(&gve_xdp.xdp,
buf_state->page_info.page_address +
buf_state->page_info.page_offset,
buf_state->page_info.pad,
buf_len, false);
- old_data = xdp.data;
- xdp_act = bpf_prog_run_xdp(xprog, &xdp);
- buf_state->page_info.pad += xdp.data - old_data;
- buf_len = xdp.data_end - xdp.data;
+ gve_xdp.gve = priv;
+ gve_xdp.compl_desc = compl_desc;
+
+ old_data = gve_xdp.xdp.data;
+ xdp_act = bpf_prog_run_xdp(xprog, &gve_xdp.xdp);
+ buf_state->page_info.pad += gve_xdp.xdp.data - old_data;
+ buf_len = gve_xdp.xdp.data_end - gve_xdp.xdp.data;
if (xdp_act != XDP_PASS) {
- gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
+ gve_xdp_done_dqo(priv, rx, &gve_xdp.xdp, xprog, xdp_act,
buf_state);
return 0;
}
@@ -686,7 +899,9 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
u64_stats_update_end(&rx->statss);
}
- if (eop && buf_len <= priv->rx_copybreak) {
+ if (eop && buf_len <= priv->rx_copybreak &&
+ !(rx->dqo.page_pool &&
+ netmem_is_net_iov(buf_state->page_info.netmem))) {
rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
&buf_state->page_info, buf_len);
if (unlikely(!rx->ctx.skb_head))
@@ -767,6 +982,9 @@ static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
if (feat & NETIF_F_RXCSUM)
gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
+ if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
+ gve_rx_skb_hwtstamp(rx, desc);
+
/* RSC packets must set gso_size otherwise the TCP stack will complain
* that packets are larger than MTU.
*/
@@ -786,16 +1004,27 @@ static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
- struct napi_struct *napi = &block->napi;
- netdev_features_t feat = napi->dev->features;
-
- struct gve_rx_ring *rx = block->rx;
- struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
-
+ struct gve_rx_compl_queue_dqo *complq;
+ struct napi_struct *napi;
+ netdev_features_t feat;
+ struct gve_rx_ring *rx;
+ struct gve_priv *priv;
+ u64 xdp_redirects;
u32 work_done = 0;
u64 bytes = 0;
+ u64 xdp_txs;
int err;
+ napi = &block->napi;
+ feat = napi->dev->features;
+
+ rx = block->rx;
+ priv = rx->gve;
+ complq = &rx->dqo.complq;
+
+ xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
+ xdp_txs = rx->xdp_actions[XDP_TX];
+
while (work_done < budget) {
struct gve_rx_compl_desc_dqo *compl_desc =
&complq->desc_ring[complq->head];
@@ -869,6 +1098,12 @@ int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
rx->ctx.skb_tail = NULL;
}
+ if (xdp_txs != rx->xdp_actions[XDP_TX])
+ gve_xdp_tx_flush_dqo(priv, rx->q_num);
+
+ if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
+ xdp_do_flush();
+
gve_rx_post_buffers_dqo(rx);
u64_stats_update_begin(&rx->statss);
diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c
index 1b40bf0c811a..97efc8d27e6f 100644
--- a/drivers/net/ethernet/google/gve/gve_tx.c
+++ b/drivers/net/ethernet/google/gve/gve_tx.c
@@ -730,7 +730,9 @@ unmap_drop:
gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
}
drop:
+ u64_stats_update_begin(&tx->statss);
tx->dropped_pkt++;
+ u64_stats_update_end(&tx->statss);
return 0;
}
@@ -823,8 +825,8 @@ static int gve_tx_fill_xdp(struct gve_priv *priv, struct gve_tx_ring *tx,
return ndescs;
}
-int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
- u32 flags)
+int gve_xdp_xmit_gqi(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags)
{
struct gve_priv *priv = netdev_priv(dev);
struct gve_tx_ring *tx;
diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
index 2eba868d8037..40b89b3e5a31 100644
--- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
@@ -9,9 +9,11 @@
#include "gve_utils.h"
#include "gve_dqo.h"
#include <net/ip.h>
+#include <linux/bpf.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
+#include <net/xdp_sock_drv.h>
/* Returns true if tx_bufs are available. */
static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count)
@@ -110,6 +112,14 @@ static bool gve_has_pending_packet(struct gve_tx_ring *tx)
return false;
}
+void gve_xdp_tx_flush_dqo(struct gve_priv *priv, u32 xdp_qid)
+{
+ u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
+ struct gve_tx_ring *tx = &priv->tx[tx_qid];
+
+ gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
+}
+
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
@@ -198,7 +208,8 @@ void gve_tx_stop_ring_dqo(struct gve_priv *priv, int idx)
gve_remove_napi(priv, ntfy_idx);
gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
- netdev_tx_reset_queue(tx->netdev_txq);
+ if (tx->netdev_txq)
+ netdev_tx_reset_queue(tx->netdev_txq);
gve_tx_clean_pending_packets(tx);
gve_tx_remove_from_block(priv, idx);
}
@@ -231,6 +242,9 @@ static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
tx->dqo.tx_ring = NULL;
}
+ kvfree(tx->dqo.xsk_reorder_queue);
+ tx->dqo.xsk_reorder_queue = NULL;
+
kvfree(tx->dqo.pending_packets);
tx->dqo.pending_packets = NULL;
@@ -276,7 +290,8 @@ void gve_tx_start_ring_dqo(struct gve_priv *priv, int idx)
gve_tx_add_to_block(priv, idx);
- tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
+ if (idx < priv->tx_cfg.num_queues)
+ tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}
@@ -295,6 +310,7 @@ static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
memset(tx, 0, sizeof(*tx));
tx->q_num = idx;
tx->dev = hdev;
+ spin_lock_init(&tx->dqo_tx.xdp_lock);
atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
/* Queue sizes must be a power of 2 */
@@ -333,6 +349,17 @@ static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
+
+	/* Only allocate the XSK reorder queue for XDP queues */
+ if (idx >= cfg->qcfg->num_queues && cfg->num_xdp_rings) {
+ tx->dqo.xsk_reorder_queue =
+ kvcalloc(tx->dqo.complq_mask + 1,
+ sizeof(tx->dqo.xsk_reorder_queue[0]),
+ GFP_KERNEL);
+ if (!tx->dqo.xsk_reorder_queue)
+ goto err;
+ }
+
tx->dqo_compl.miss_completions.head = -1;
tx->dqo_compl.miss_completions.tail = -1;
tx->dqo_compl.timed_out_completions.head = -1;
@@ -439,12 +466,28 @@ static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
return tx->mask - num_used;
}
+/* Checks if the requested number of slots is available in the ring */
+static bool gve_has_tx_slots_available(struct gve_tx_ring *tx, u32 slots_req)
+{
+ u32 num_avail = num_avail_tx_slots(tx);
+
+ slots_req += GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP;
+
+ if (num_avail >= slots_req)
+ return true;
+
+ /* Update cached TX head pointer */
+ tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
+
+ return num_avail_tx_slots(tx) >= slots_req;
+}
+
static bool gve_has_avail_slots_tx_dqo(struct gve_tx_ring *tx,
int desc_count, int buf_count)
{
return gve_has_pending_packet(tx) &&
- num_avail_tx_slots(tx) >= desc_count &&
- gve_has_free_tx_qpl_bufs(tx, buf_count);
+ gve_has_tx_slots_available(tx, desc_count) &&
+ gve_has_free_tx_qpl_bufs(tx, buf_count);
}
/* Stops the queue if available descriptors is less than 'count'.
@@ -456,12 +499,6 @@ static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx,
if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
return 0;
- /* Update cached TX head pointer */
- tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
-
- if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
- return 0;
-
/* No space, so stop the queue */
tx->stop_queue++;
netif_tx_stop_queue(tx->netdev_txq);
@@ -472,8 +509,6 @@ static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx,
/* After stopping queue, check if we can transmit again in order to
* avoid TOCTOU bug.
*/
- tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
-
if (likely(!gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
return -EBUSY;
@@ -500,11 +535,9 @@ static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
}
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
- struct sk_buff *skb, u32 len, u64 addr,
+ bool enable_csum, u32 len, u64 addr,
s16 compl_tag, bool eop, bool is_gso)
{
- const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
-
while (len > 0) {
struct gve_tx_pkt_desc_dqo *desc =
&tx->dqo.tx_ring[*desc_idx].pkt;
@@ -515,7 +548,7 @@ static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
.buf_addr = cpu_to_le64(addr),
.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
.end_of_packet = cur_eop,
- .checksum_offload_enable = checksum_offload_en,
+ .checksum_offload_enable = enable_csum,
.compl_tag = cpu_to_le16(compl_tag),
.buf_size = cur_len,
};
@@ -612,6 +645,25 @@ gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
};
}
+static void gve_tx_update_tail(struct gve_tx_ring *tx, u32 desc_idx)
+{
+ u32 last_desc_idx = (desc_idx - 1) & tx->mask;
+ u32 last_report_event_interval =
+ (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
+
+ /* Commit the changes to our state */
+ tx->dqo_tx.tail = desc_idx;
+
+ /* Request a descriptor completion on the last descriptor of the
+ * packet if we are allowed to by the HW enforced interval.
+ */
+
+ if (unlikely(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) {
+ tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
+ tx->dqo_tx.last_re_idx = last_desc_idx;
+ }
+}
+
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
struct sk_buff *skb,
struct gve_tx_pending_packet_dqo *pkt,
@@ -619,6 +671,7 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
u32 *desc_idx,
bool is_gso)
{
+ bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
const struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -644,7 +697,7 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
++pkt->num_bufs;
- gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr,
+ gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
completion_tag,
/*eop=*/shinfo->nr_frags == 0, is_gso);
}
@@ -660,10 +713,11 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
goto err;
dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
- dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
+ netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt,
+ dma[pkt->num_bufs], addr);
++pkt->num_bufs;
- gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr,
+ gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
completion_tag, is_eop, is_gso);
}
@@ -708,6 +762,7 @@ static int gve_tx_add_skb_copy_dqo(struct gve_tx_ring *tx,
u32 *desc_idx,
bool is_gso)
{
+ bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
u32 copy_offset = 0;
dma_addr_t dma_addr;
u32 copy_len;
@@ -729,7 +784,7 @@ static int gve_tx_add_skb_copy_dqo(struct gve_tx_ring *tx,
copy_offset += copy_len;
dma_sync_single_for_device(tx->dev, dma_addr,
copy_len, DMA_TO_DEVICE);
- gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb,
+ gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum,
copy_len,
dma_addr,
completion_tag,
@@ -763,7 +818,11 @@ static int gve_tx_add_skb_dqo(struct gve_tx_ring *tx,
s16 completion_tag;
pkt = gve_alloc_pending_packet(tx);
+ if (!pkt)
+ return -ENOMEM;
+
pkt->skb = skb;
+ pkt->type = GVE_TX_PENDING_PACKET_DQO_SKB;
completion_tag = pkt - tx->dqo.pending_packets;
gve_extract_tx_metadata_dqo(skb, &metadata);
@@ -796,24 +855,7 @@ static int gve_tx_add_skb_dqo(struct gve_tx_ring *tx,
tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;
- /* Commit the changes to our state */
- tx->dqo_tx.tail = desc_idx;
-
- /* Request a descriptor completion on the last descriptor of the
- * packet if we are allowed to by the HW enforced interval.
- */
- {
- u32 last_desc_idx = (desc_idx - 1) & tx->mask;
- u32 last_report_event_interval =
- (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
-
- if (unlikely(last_report_event_interval >=
- GVE_TX_MIN_RE_INTERVAL)) {
- tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
- tx->dqo_tx.last_re_idx = last_desc_idx;
- }
- }
-
+ gve_tx_update_tail(tx, desc_idx);
return 0;
err:
@@ -947,9 +989,8 @@ static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
/* Metadata + (optional TSO) + data descriptors. */
total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
- if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
- GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP,
- num_buffer_descs))) {
+ if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs,
+ num_buffer_descs))) {
return -1;
}
@@ -961,11 +1002,45 @@ static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
return 0;
drop:
+ u64_stats_update_begin(&tx->statss);
tx->dropped_pkt++;
+ u64_stats_update_end(&tx->statss);
dev_kfree_skb_any(skb);
return 0;
}
+static void gve_xsk_reorder_queue_push_dqo(struct gve_tx_ring *tx,
+ u16 completion_tag)
+{
+ u32 tail = atomic_read(&tx->dqo_tx.xsk_reorder_queue_tail);
+
+ tx->dqo.xsk_reorder_queue[tail] = completion_tag;
+ tail = (tail + 1) & tx->dqo.complq_mask;
+ atomic_set_release(&tx->dqo_tx.xsk_reorder_queue_tail, tail);
+}
+
+static struct gve_tx_pending_packet_dqo *
+gve_xsk_reorder_queue_head(struct gve_tx_ring *tx)
+{
+ u32 head = tx->dqo_compl.xsk_reorder_queue_head;
+
+ if (head == tx->dqo_compl.xsk_reorder_queue_tail) {
+ tx->dqo_compl.xsk_reorder_queue_tail =
+ atomic_read_acquire(&tx->dqo_tx.xsk_reorder_queue_tail);
+
+ if (head == tx->dqo_compl.xsk_reorder_queue_tail)
+ return NULL;
+ }
+
+ return &tx->dqo.pending_packets[tx->dqo.xsk_reorder_queue[head]];
+}
+
+static void gve_xsk_reorder_queue_pop_dqo(struct gve_tx_ring *tx)
+{
+ tx->dqo_compl.xsk_reorder_queue_head++;
+ tx->dqo_compl.xsk_reorder_queue_head &= tx->dqo.complq_mask;
+}
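
The reorder queue is a single-producer/single-consumer ring of completion tags: the XSK transmit path pushes tags in submission order under dqo_tx.xdp_lock, and the completion path pops an entry only once the packet at the head has reached GVE_PACKET_STATE_XSK_COMPLETE, so xsk_tx_completed() always advances in order even when hardware completes out of order. A hedged standalone sketch of the same ring pattern (types and names are illustrative, not the driver's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define RING_SLOTS 256			/* must be a power of two */

struct spsc_ring {
	uint16_t slot[RING_SLOTS];
	_Atomic uint32_t tail;		/* written by the producer */
	uint32_t head;			/* owned by the consumer */
	uint32_t cached_tail;		/* consumer's snapshot of tail */
};

static void ring_push(struct spsc_ring *r, uint16_t tag)
{
	uint32_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

	r->slot[tail] = tag;
	atomic_store_explicit(&r->tail, (tail + 1) & (RING_SLOTS - 1),
			      memory_order_release);
}

static bool ring_peek(struct spsc_ring *r, uint16_t *tag)
{
	if (r->head == r->cached_tail) {
		/* Looks empty: refresh the snapshot with acquire semantics. */
		r->cached_tail = atomic_load_explicit(&r->tail,
						      memory_order_acquire);
		if (r->head == r->cached_tail)
			return false;
	}
	*tag = r->slot[r->head];
	return true;
}

static void ring_pop(struct spsc_ring *r)
{
	r->head = (r->head + 1) & (RING_SLOTS - 1);
}

int main(void)
{
	static struct spsc_ring r;
	uint16_t tag;

	ring_push(&r, 42);
	if (ring_peek(&r, &tag))	/* tag == 42 */
		ring_pop(&r);
	return 0;
}
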
+
/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
@@ -989,6 +1064,62 @@ netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
+static bool gve_xsk_tx_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
+ int budget)
+{
+ struct xsk_buff_pool *pool = tx->xsk_pool;
+ struct xdp_desc desc;
+ bool repoll = false;
+ int sent = 0;
+
+ spin_lock(&tx->dqo_tx.xdp_lock);
+ for (; sent < budget; sent++) {
+ struct gve_tx_pending_packet_dqo *pkt;
+ s16 completion_tag;
+ dma_addr_t addr;
+ u32 desc_idx;
+
+ if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) {
+ repoll = true;
+ break;
+ }
+
+ if (!xsk_tx_peek_desc(pool, &desc))
+ break;
+
+ pkt = gve_alloc_pending_packet(tx);
+ pkt->type = GVE_TX_PENDING_PACKET_DQO_XSK;
+ pkt->num_bufs = 0;
+ completion_tag = pkt - tx->dqo.pending_packets;
+
+ addr = xsk_buff_raw_get_dma(pool, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(pool, addr, desc.len);
+
+ desc_idx = tx->dqo_tx.tail;
+ gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
+ true, desc.len,
+ addr, completion_tag, true,
+ false);
+ ++pkt->num_bufs;
+ gve_tx_update_tail(tx, desc_idx);
+ tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;
+ gve_xsk_reorder_queue_push_dqo(tx, completion_tag);
+ }
+
+ if (sent) {
+ gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
+ xsk_tx_release(pool);
+ }
+
+ spin_unlock(&tx->dqo_tx.xdp_lock);
+
+ u64_stats_update_begin(&tx->statss);
+ tx->xdp_xsk_sent += sent;
+ u64_stats_update_end(&tx->statss);
+
+ return (sent == budget) || repoll;
+}
+
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
struct gve_tx_pending_packet_dqo *pending_packet)
{
@@ -1038,8 +1169,9 @@ static void gve_unmap_packet(struct device *dev,
dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
for (i = 1; i < pkt->num_bufs; i++) {
- dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
- dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
+ netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]),
+ dma_unmap_len(pkt, len[i]),
+ DMA_TO_DEVICE, 0);
}
pkt->num_bufs = 0;
}
@@ -1102,16 +1234,35 @@ static void gve_handle_packet_completion(struct gve_priv *priv,
}
}
tx->dqo_tx.completed_packet_desc_cnt += pending_packet->num_bufs;
- if (tx->dqo.qpl)
- gve_free_tx_qpl_bufs(tx, pending_packet);
- else
+
+ switch (pending_packet->type) {
+ case GVE_TX_PENDING_PACKET_DQO_SKB:
+ if (tx->dqo.qpl)
+ gve_free_tx_qpl_bufs(tx, pending_packet);
+ else
+ gve_unmap_packet(tx->dev, pending_packet);
+ (*pkts)++;
+ *bytes += pending_packet->skb->len;
+
+ napi_consume_skb(pending_packet->skb, is_napi);
+ pending_packet->skb = NULL;
+ gve_free_pending_packet(tx, pending_packet);
+ break;
+ case GVE_TX_PENDING_PACKET_DQO_XDP_FRAME:
gve_unmap_packet(tx->dev, pending_packet);
+ (*pkts)++;
+ *bytes += pending_packet->xdpf->len;
- *bytes += pending_packet->skb->len;
- (*pkts)++;
- napi_consume_skb(pending_packet->skb, is_napi);
- pending_packet->skb = NULL;
- gve_free_pending_packet(tx, pending_packet);
+ xdp_return_frame(pending_packet->xdpf);
+ pending_packet->xdpf = NULL;
+ gve_free_pending_packet(tx, pending_packet);
+ break;
+ case GVE_TX_PENDING_PACKET_DQO_XSK:
+ pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
}
static void gve_handle_miss_completion(struct gve_priv *priv,
@@ -1175,7 +1326,11 @@ static void remove_miss_completions(struct gve_priv *priv,
/* This indicates the packet was dropped. */
dev_kfree_skb_any(pending_packet->skb);
pending_packet->skb = NULL;
+
+ u64_stats_update_begin(&tx->statss);
tx->dropped_pkt++;
+ u64_stats_update_end(&tx->statss);
+
net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
priv->dev->name,
(int)(pending_packet - tx->dqo.pending_packets));
@@ -1208,8 +1363,34 @@ static void remove_timed_out_completions(struct gve_priv *priv,
remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
pending_packet);
+
+ /* Need to count XSK packets in xsk_tx_completed. */
+ if (pending_packet->type == GVE_TX_PENDING_PACKET_DQO_XSK)
+ pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE;
+ else
+ gve_free_pending_packet(tx, pending_packet);
+ }
+}
+
+static void gve_tx_process_xsk_completions(struct gve_tx_ring *tx)
+{
+ u32 num_xsks = 0;
+
+ while (true) {
+ struct gve_tx_pending_packet_dqo *pending_packet =
+ gve_xsk_reorder_queue_head(tx);
+
+ if (!pending_packet ||
+ pending_packet->state != GVE_PACKET_STATE_XSK_COMPLETE)
+ break;
+
+ num_xsks++;
+ gve_xsk_reorder_queue_pop_dqo(tx);
gve_free_pending_packet(tx, pending_packet);
}
+
+ if (num_xsks)
+ xsk_tx_completed(tx->xsk_pool, num_xsks);
}
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
@@ -1282,13 +1463,17 @@ int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
num_descs_cleaned++;
}
- netdev_tx_completed_queue(tx->netdev_txq,
- pkt_compl_pkts + miss_compl_pkts,
- pkt_compl_bytes + miss_compl_bytes);
+ if (tx->netdev_txq)
+ netdev_tx_completed_queue(tx->netdev_txq,
+ pkt_compl_pkts + miss_compl_pkts,
+ pkt_compl_bytes + miss_compl_bytes);
remove_miss_completions(priv, tx);
remove_timed_out_completions(priv, tx);
+ if (tx->xsk_pool)
+ gve_tx_process_xsk_completions(tx);
+
u64_stats_update_begin(&tx->statss);
tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
@@ -1320,3 +1505,111 @@ bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}
+
+bool gve_xsk_tx_poll_dqo(struct gve_notify_block *rx_block, int budget)
+{
+ struct gve_rx_ring *rx = rx_block->rx;
+ struct gve_priv *priv = rx->gve;
+ struct gve_tx_ring *tx;
+
+ tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)];
+ if (tx->xsk_pool)
+ return gve_xsk_tx_dqo(priv, tx, budget);
+
+ return 0;
+}
+
+bool gve_xdp_poll_dqo(struct gve_notify_block *block)
+{
+ struct gve_tx_compl_desc *compl_desc;
+ struct gve_tx_ring *tx = block->tx;
+ struct gve_priv *priv = block->priv;
+
+ gve_clean_tx_done_dqo(priv, tx, &block->napi);
+
+ /* Return true if we still have work. */
+ compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
+ return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
+}
+
+int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
+ struct xdp_frame *xdpf)
+{
+ struct gve_tx_pending_packet_dqo *pkt;
+ u32 desc_idx = tx->dqo_tx.tail;
+ s16 completion_tag;
+ int num_descs = 1;
+ dma_addr_t addr;
+ int err;
+
+ if (unlikely(!gve_has_tx_slots_available(tx, num_descs)))
+ return -EBUSY;
+
+ pkt = gve_alloc_pending_packet(tx);
+ if (unlikely(!pkt))
+ return -EBUSY;
+
+ pkt->type = GVE_TX_PENDING_PACKET_DQO_XDP_FRAME;
+ pkt->num_bufs = 0;
+ pkt->xdpf = xdpf;
+ completion_tag = pkt - tx->dqo.pending_packets;
+
+ /* Generate Packet Descriptor */
+ addr = dma_map_single(tx->dev, xdpf->data, xdpf->len, DMA_TO_DEVICE);
+ err = dma_mapping_error(tx->dev, addr);
+ if (unlikely(err))
+ goto err;
+
+ dma_unmap_len_set(pkt, len[pkt->num_bufs], xdpf->len);
+ dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
+ pkt->num_bufs++;
+
+ gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
+ false, xdpf->len,
+ addr, completion_tag, true,
+ false);
+
+ gve_tx_update_tail(tx, desc_idx);
+ return 0;
+
+err:
+ pkt->xdpf = NULL;
+ pkt->num_bufs = 0;
+ gve_free_pending_packet(tx, pkt);
+ return err;
+}
+
+int gve_xdp_xmit_dqo(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags)
+{
+ struct gve_priv *priv = netdev_priv(dev);
+ struct gve_tx_ring *tx;
+ int i, err = 0, qid;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ qid = gve_xdp_tx_queue_id(priv,
+ smp_processor_id() % priv->tx_cfg.num_xdp_queues);
+
+ tx = &priv->tx[qid];
+
+ spin_lock(&tx->dqo_tx.xdp_lock);
+ for (i = 0; i < n; i++) {
+ err = gve_xdp_xmit_one_dqo(priv, tx, frames[i]);
+ if (err)
+ break;
+ }
+
+ if (flags & XDP_XMIT_FLUSH)
+ gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
+
+ spin_unlock(&tx->dqo_tx.xdp_lock);
+
+ u64_stats_update_begin(&tx->statss);
+ tx->xdp_xmit += n;
+ tx->xdp_xmit_errors += n - i;
+ u64_stats_update_end(&tx->statss);
+
+ return i ? i : err;
+}
diff --git a/drivers/net/ethernet/google/gve/gve_utils.c b/drivers/net/ethernet/google/gve/gve_utils.c
index ace9b8698021..b53b7fcdcdaf 100644
--- a/drivers/net/ethernet/google/gve/gve_utils.c
+++ b/drivers/net/ethernet/google/gve/gve_utils.c
@@ -112,11 +112,13 @@ void gve_add_napi(struct gve_priv *priv, int ntfy_idx,
netif_napi_add_locked(priv->dev, &block->napi, gve_poll);
netif_napi_set_irq_locked(&block->napi, block->irq);
+ enable_irq(block->irq);
}
void gve_remove_napi(struct gve_priv *priv, int ntfy_idx)
{
struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
+ disable_irq(block->irq);
netif_napi_del_locked(&block->napi);
}