diff options
Diffstat (limited to 'drivers/net/virtio_net.c')
-rw-r--r-- | drivers/net/virtio_net.c | 3203 |
1 files changed, 2673 insertions, 530 deletions
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 115c3c5414f2..e53ba600605a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -24,6 +24,8 @@ #include <net/xdp.h> #include <net/net_failover.h> #include <net/netdev_rx_queue.h> +#include <net/netdev_queues.h> +#include <net/xdp_sock_drv.h> static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); @@ -39,15 +41,10 @@ module_param(napi_tx, bool, 0644); #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) -/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */ -#define VIRTIO_XDP_HEADROOM 256 - /* Separating two types of XDP xmit */ #define VIRTIO_XDP_TX BIT(0) #define VIRTIO_XDP_REDIR BIT(1) -#define VIRTIO_XDP_FLAG BIT(0) - /* RX packet size EWMA. The average packet size is used to determine the packet * buffer size when refilling RX rings. As the entire RX ring may be refilled * at once, the weight is chosen so that the EWMA will be insensitive to short- @@ -78,11 +75,15 @@ static const unsigned long guest_offloads[] = { struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; + size_t qstat_offset; }; struct virtnet_sq_free_stats { u64 packets; u64 bytes; + u64 napi_packets; + u64 napi_bytes; + u64 xsk; }; struct virtnet_sq_stats { @@ -93,6 +94,8 @@ struct virtnet_sq_stats { u64_stats_t xdp_tx_drops; u64_stats_t kicks; u64_stats_t tx_timeouts; + u64_stats_t stop; + u64_stats_t wake; }; struct virtnet_rq_stats { @@ -107,31 +110,159 @@ struct virtnet_rq_stats { u64_stats_t kicks; }; -#define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m) -#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stats, m) +#define VIRTNET_SQ_STAT(name, m) {name, offsetof(struct virtnet_sq_stats, m), -1} +#define VIRTNET_RQ_STAT(name, m) {name, offsetof(struct virtnet_rq_stats, m), -1} + +#define VIRTNET_SQ_STAT_QSTAT(name, m) \ + { \ + name, \ + offsetof(struct virtnet_sq_stats, m), \ + offsetof(struct netdev_queue_stats_tx, m), \ + } + +#define VIRTNET_RQ_STAT_QSTAT(name, m) \ + { \ + name, \ + offsetof(struct virtnet_rq_stats, m), \ + offsetof(struct netdev_queue_stats_rx, m), \ + } static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = { - { "packets", VIRTNET_SQ_STAT(packets) }, - { "bytes", VIRTNET_SQ_STAT(bytes) }, - { "xdp_tx", VIRTNET_SQ_STAT(xdp_tx) }, - { "xdp_tx_drops", VIRTNET_SQ_STAT(xdp_tx_drops) }, - { "kicks", VIRTNET_SQ_STAT(kicks) }, - { "tx_timeouts", VIRTNET_SQ_STAT(tx_timeouts) }, + VIRTNET_SQ_STAT("xdp_tx", xdp_tx), + VIRTNET_SQ_STAT("xdp_tx_drops", xdp_tx_drops), + VIRTNET_SQ_STAT("kicks", kicks), + VIRTNET_SQ_STAT("tx_timeouts", tx_timeouts), }; static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = { - { "packets", VIRTNET_RQ_STAT(packets) }, - { "bytes", VIRTNET_RQ_STAT(bytes) }, - { "drops", VIRTNET_RQ_STAT(drops) }, - { "xdp_packets", VIRTNET_RQ_STAT(xdp_packets) }, - { "xdp_tx", VIRTNET_RQ_STAT(xdp_tx) }, - { "xdp_redirects", VIRTNET_RQ_STAT(xdp_redirects) }, - { "xdp_drops", VIRTNET_RQ_STAT(xdp_drops) }, - { "kicks", VIRTNET_RQ_STAT(kicks) }, + VIRTNET_RQ_STAT("drops", drops), + VIRTNET_RQ_STAT("xdp_packets", xdp_packets), + VIRTNET_RQ_STAT("xdp_tx", xdp_tx), + VIRTNET_RQ_STAT("xdp_redirects", xdp_redirects), + VIRTNET_RQ_STAT("xdp_drops", xdp_drops), + VIRTNET_RQ_STAT("kicks", kicks), }; -#define VIRTNET_SQ_STATS_LEN ARRAY_SIZE(virtnet_sq_stats_desc) -#define VIRTNET_RQ_STATS_LEN ARRAY_SIZE(virtnet_rq_stats_desc) +static const struct virtnet_stat_desc virtnet_sq_stats_desc_qstat[] = { + VIRTNET_SQ_STAT_QSTAT("packets", packets), + VIRTNET_SQ_STAT_QSTAT("bytes", bytes), + VIRTNET_SQ_STAT_QSTAT("stop", stop), + VIRTNET_SQ_STAT_QSTAT("wake", wake), +}; + +static const struct virtnet_stat_desc virtnet_rq_stats_desc_qstat[] = { + VIRTNET_RQ_STAT_QSTAT("packets", packets), + VIRTNET_RQ_STAT_QSTAT("bytes", bytes), +}; + +#define VIRTNET_STATS_DESC_CQ(name) \ + {#name, offsetof(struct virtio_net_stats_cvq, name), -1} + +#define VIRTNET_STATS_DESC_RX(class, name) \ + {#name, offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), -1} + +#define VIRTNET_STATS_DESC_TX(class, name) \ + {#name, offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), -1} + + +static const struct virtnet_stat_desc virtnet_stats_cvq_desc[] = { + VIRTNET_STATS_DESC_CQ(command_num), + VIRTNET_STATS_DESC_CQ(ok_num), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc[] = { + VIRTNET_STATS_DESC_RX(basic, packets), + VIRTNET_STATS_DESC_RX(basic, bytes), + + VIRTNET_STATS_DESC_RX(basic, notifications), + VIRTNET_STATS_DESC_RX(basic, interrupts), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc[] = { + VIRTNET_STATS_DESC_TX(basic, packets), + VIRTNET_STATS_DESC_TX(basic, bytes), + + VIRTNET_STATS_DESC_TX(basic, notifications), + VIRTNET_STATS_DESC_TX(basic, interrupts), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc[] = { + VIRTNET_STATS_DESC_RX(csum, needs_csum), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc[] = { + VIRTNET_STATS_DESC_TX(gso, gso_packets_noseg), + VIRTNET_STATS_DESC_TX(gso, gso_bytes_noseg), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc[] = { + VIRTNET_STATS_DESC_RX(speed, ratelimit_bytes), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc[] = { + VIRTNET_STATS_DESC_TX(speed, ratelimit_bytes), +}; + +#define VIRTNET_STATS_DESC_RX_QSTAT(class, name, qstat_field) \ + { \ + #name, \ + offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), \ + offsetof(struct netdev_queue_stats_rx, qstat_field), \ + } + +#define VIRTNET_STATS_DESC_TX_QSTAT(class, name, qstat_field) \ + { \ + #name, \ + offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), \ + offsetof(struct netdev_queue_stats_tx, qstat_field), \ + } + +static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc_qstat[] = { + VIRTNET_STATS_DESC_RX_QSTAT(basic, drops, hw_drops), + VIRTNET_STATS_DESC_RX_QSTAT(basic, drop_overruns, hw_drop_overruns), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc_qstat[] = { + VIRTNET_STATS_DESC_TX_QSTAT(basic, drops, hw_drops), + VIRTNET_STATS_DESC_TX_QSTAT(basic, drop_malformed, hw_drop_errors), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc_qstat[] = { + VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_valid, csum_unnecessary), + VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_none, csum_none), + VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_bad, csum_bad), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_csum_desc_qstat[] = { + VIRTNET_STATS_DESC_TX_QSTAT(csum, csum_none, csum_none), + VIRTNET_STATS_DESC_TX_QSTAT(csum, needs_csum, needs_csum), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_gso_desc_qstat[] = { + VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets, hw_gro_packets), + VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes, hw_gro_bytes), + VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets_coalesced, hw_gro_wire_packets), + VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes_coalesced, hw_gro_wire_bytes), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc_qstat[] = { + VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_packets, hw_gso_packets), + VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_bytes, hw_gso_bytes), + VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments, hw_gso_wire_packets), + VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments_bytes, hw_gso_wire_bytes), +}; + +static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc_qstat[] = { + VIRTNET_STATS_DESC_RX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), +}; + +static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc_qstat[] = { + VIRTNET_STATS_DESC_TX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), +}; + +#define VIRTNET_Q_TYPE_RX 0 +#define VIRTNET_Q_TYPE_TX 1 +#define VIRTNET_Q_TYPE_CQ 2 struct virtnet_interrupt_coalesce { u32 max_packets; @@ -165,6 +296,10 @@ struct send_queue { /* Record whether sq is in reset state. */ bool reset; + + struct xsk_buff_pool *xsk_pool; + + dma_addr_t xsk_hdr_dma_addr; }; /* Internal representation of a receive virtqueue */ @@ -184,6 +319,9 @@ struct receive_queue { /* Is dynamic interrupt moderation enabled? */ bool dim_enabled; + /* Used to protect dim_enabled and inter_coal */ + struct mutex dim_lock; + /* Dynamic Interrupt Moderation */ struct dim dim; @@ -214,41 +352,20 @@ struct receive_queue { /* Record the last dma info to free after new pages is allocated. */ struct virtnet_rq_dma *last_dma; - /* Do dma by self */ - bool do_dma; + struct xsk_buff_pool *xsk_pool; + + /* xdp rxq used by xsk */ + struct xdp_rxq_info xsk_rxq_info; + + struct xdp_buff **xsk_buffs; }; -/* This structure can contain rss message with maximum settings for indirection table and keysize - * Note, that default structure that describes RSS configuration virtio_net_rss_config - * contains same info but can't handle table values. - * In any case, structure would be passed to virtio hw through sg_buf split by parts - * because table sizes may be differ according to the device configuration. - */ #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 -#define VIRTIO_NET_RSS_MAX_TABLE_LEN 128 -struct virtio_net_ctrl_rss { - u32 hash_types; - u16 indirection_table_mask; - u16 unclassified_queue; - u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN]; - u16 max_tx_vq; - u8 hash_key_length; - u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; -}; /* Control VQ buffers: protected by the rtnl lock */ struct control_buf { struct virtio_net_ctrl_hdr hdr; virtio_net_ctrl_ack status; - struct virtio_net_ctrl_mq mq; - u8 promisc; - u8 allmulti; - __virtio16 vid; - __virtio64 offloads; - struct virtio_net_ctrl_rss rss; - struct virtio_net_ctrl_coal_tx coal_tx; - struct virtio_net_ctrl_coal_rx coal_rx; - struct virtio_net_ctrl_coal_vq coal_vq; }; struct virtnet_info { @@ -287,10 +404,16 @@ struct virtnet_info { u16 rss_indir_table_size; u32 rss_hash_types_supported; u32 rss_hash_types_saved; + struct virtio_net_rss_config_hdr *rss_hdr; + struct virtio_net_rss_config_trailer rss_trailer; + u8 rss_hash_key_data[VIRTIO_NET_RSS_MAX_KEY_SIZE]; /* Has control virtqueue */ bool has_cvq; + /* Lock to protect the control VQ */ + struct mutex cvq_lock; + /* Host can handle any s/g split between our header and packet data */ bool any_header_sg; @@ -340,6 +463,8 @@ struct virtnet_info { /* failover when STANDBY feature enabled */ struct failover *failover; + + u64 device_stats_cap; }; struct padded_vnet_hdr { @@ -360,46 +485,132 @@ struct virtio_net_common_hdr { }; }; +static struct virtio_net_common_hdr xsk_hdr; + static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq); +static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, + struct net_device *dev, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats); +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, + struct sk_buff *skb, u8 flags); +static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, + struct sk_buff *curr_skb, + struct page *page, void *buf, + int len, int truesize); +static void virtnet_xsk_completed(struct send_queue *sq, int num); + +enum virtnet_xmit_type { + VIRTNET_XMIT_TYPE_SKB, + VIRTNET_XMIT_TYPE_SKB_ORPHAN, + VIRTNET_XMIT_TYPE_XDP, + VIRTNET_XMIT_TYPE_XSK, +}; + +static size_t virtnet_rss_hdr_size(const struct virtnet_info *vi) +{ + u16 indir_table_size = vi->has_rss ? vi->rss_indir_table_size : 1; + + return struct_size(vi->rss_hdr, indirection_table, indir_table_size); +} + +static size_t virtnet_rss_trailer_size(const struct virtnet_info *vi) +{ + return struct_size(&vi->rss_trailer, hash_key_data, vi->rss_key_size); +} + +/* We use the last two bits of the pointer to distinguish the xmit type. */ +#define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1)) + +#define VIRTIO_XSK_FLAG_OFFSET 2 + +static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr) +{ + unsigned long p = (unsigned long)*ptr; + + *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK); + + return p & VIRTNET_XMIT_TYPE_MASK; +} + +static void *virtnet_xmit_ptr_pack(void *ptr, enum virtnet_xmit_type type) +{ + return (void *)((unsigned long)ptr | type); +} -static bool is_xdp_frame(void *ptr) +static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data, + enum virtnet_xmit_type type) { - return (unsigned long)ptr & VIRTIO_XDP_FLAG; + return virtqueue_add_outbuf(sq->vq, sq->sg, num, + virtnet_xmit_ptr_pack(data, type), + GFP_ATOMIC); } -static void *xdp_to_ptr(struct xdp_frame *ptr) +static u32 virtnet_ptr_to_xsk_buff_len(void *ptr) { - return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG); + return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET; } -static struct xdp_frame *ptr_to_xdp(void *ptr) +static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) { - return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG); + sg_dma_address(sg) = addr; + sg_dma_len(sg) = len; } -static void __free_old_xmit(struct send_queue *sq, bool in_napi, - struct virtnet_sq_free_stats *stats) +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, + bool in_napi, struct virtnet_sq_free_stats *stats) { + struct xdp_frame *frame; + struct sk_buff *skb; unsigned int len; void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { - ++stats->packets; - - if (!is_xdp_frame(ptr)) { - struct sk_buff *skb = ptr; + switch (virtnet_xmit_ptr_unpack(&ptr)) { + case VIRTNET_XMIT_TYPE_SKB: + skb = ptr; pr_debug("Sent skb %p\n", skb); + stats->napi_packets++; + stats->napi_bytes += skb->len; + napi_consume_skb(skb, in_napi); + break; + + case VIRTNET_XMIT_TYPE_SKB_ORPHAN: + skb = ptr; + stats->packets++; stats->bytes += skb->len; napi_consume_skb(skb, in_napi); - } else { - struct xdp_frame *frame = ptr_to_xdp(ptr); + break; + + case VIRTNET_XMIT_TYPE_XDP: + frame = ptr; + stats->packets++; stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); + break; + + case VIRTNET_XMIT_TYPE_XSK: + stats->bytes += virtnet_ptr_to_xsk_buff_len(ptr); + stats->xsk++; + break; } } + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); +} + +static void virtnet_free_old_xmit(struct send_queue *sq, + struct netdev_queue *txq, + bool in_napi, + struct virtnet_sq_free_stats *stats) +{ + __free_old_xmit(sq, txq, in_napi, stats); + + if (stats->xsk) + virtnet_xsk_completed(sq, stats->xsk); } /* Converting between virtqueue no. and kernel tx/rx queue no. @@ -425,6 +636,17 @@ static int rxq2vq(int rxq) return rxq * 2; } +static int vq_type(struct virtnet_info *vi, int qid) +{ + if (qid == vi->max_queue_pairs * 2) + return VIRTNET_Q_TYPE_CQ; + + if (qid % 2) + return VIRTNET_Q_TYPE_TX; + + return VIRTNET_Q_TYPE_RX; +} + static inline struct virtio_net_common_hdr * skb_vnet_common_hdr(struct sk_buff *skb) { @@ -603,7 +825,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - /* copy small packet so we can reuse these pages */ if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) { skb = virtnet_build_skb(buf, truesize, p - buf, len); if (unlikely(!skb)) @@ -675,11 +896,14 @@ ok: static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) { + struct virtnet_info *vi = rq->vq->vdev->priv; struct page *page = virt_to_head_page(buf); struct virtnet_rq_dma *dma; void *head; int offset; + BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); + head = page_address(page); dma = head; @@ -704,10 +928,13 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) { + struct virtnet_info *vi = rq->vq->vdev->priv; void *buf; + BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); + buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); - if (buf && rq->do_dma) + if (buf) virtnet_rq_unmap(rq, buf, *len); return buf; @@ -715,15 +942,13 @@ static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) { + struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; dma_addr_t addr; u32 offset; void *head; - if (!rq->do_dma) { - sg_init_one(rq->sg, buf, len); - return; - } + BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(rq->alloc_frag.page); @@ -734,60 +959,57 @@ static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) addr = dma->addr - sizeof(*dma) + offset; sg_init_table(rq->sg, 1); - rq->sg[0].dma_address = addr; - rq->sg[0].length = len; + sg_fill_dma(rq->sg, addr, len); } static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; + struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; void *buf, *head; dma_addr_t addr; - if (unlikely(!skb_page_frag_refill(size, alloc_frag, gfp))) - return NULL; + BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(alloc_frag->page); - if (rq->do_dma) { - dma = head; - - /* new pages */ - if (!alloc_frag->offset) { - if (rq->last_dma) { - /* Now, the new page is allocated, the last dma - * will not be used. So the dma can be unmapped - * if the ref is 0. - */ - virtnet_rq_unmap(rq, rq->last_dma, 0); - rq->last_dma = NULL; - } + dma = head; - dma->len = alloc_frag->size - sizeof(*dma); + /* new pages */ + if (!alloc_frag->offset) { + if (rq->last_dma) { + /* Now, the new page is allocated, the last dma + * will not be used. So the dma can be unmapped + * if the ref is 0. + */ + virtnet_rq_unmap(rq, rq->last_dma, 0); + rq->last_dma = NULL; + } - addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, - dma->len, DMA_FROM_DEVICE, 0); - if (virtqueue_dma_mapping_error(rq->vq, addr)) - return NULL; + dma->len = alloc_frag->size - sizeof(*dma); - dma->addr = addr; - dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); + addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, + dma->len, DMA_FROM_DEVICE, 0); + if (virtqueue_dma_mapping_error(rq->vq, addr)) + return NULL; - /* Add a reference to dma to prevent the entire dma from - * being released during error handling. This reference - * will be freed after the pages are no longer used. - */ - get_page(alloc_frag->page); - dma->ref = 1; - alloc_frag->offset = sizeof(*dma); + dma->addr = addr; + dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); - rq->last_dma = dma; - } + /* Add a reference to dma to prevent the entire dma from + * being released during error handling. This reference + * will be freed after the pages are no longer used. + */ + get_page(alloc_frag->page); + dma->ref = 1; + alloc_frag->offset = sizeof(*dma); - ++dma->ref; + rq->last_dma = dma; } + ++dma->ref; + buf = head + alloc_frag->offset; get_page(alloc_frag->page); @@ -796,22 +1018,6 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) return buf; } -static void virtnet_rq_set_premapped(struct virtnet_info *vi) -{ - int i; - - /* disable for big mode */ - if (!vi->mergeable_rx_bufs && vi->big_packets) - return; - - for (i = 0; i < vi->max_queue_pairs; i++) { - if (virtqueue_set_dma_premapped(vi->rq[i].vq)) - continue; - - vi->rq[i].do_dma = true; - } -} - static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; @@ -820,27 +1026,33 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) rq = &vi->rq[i]; - if (rq->do_dma) + if (rq->xsk_pool) { + xsk_buff_free((struct xdp_buff *)buf); + return; + } + + if (!vi->big_packets || vi->mergeable_rx_bufs) virtnet_rq_unmap(rq, buf, 0); virtnet_rq_free_buf(vi, rq, buf); } -static void free_old_xmit(struct send_queue *sq, bool in_napi) +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, + bool in_napi) { struct virtnet_sq_free_stats stats = {0}; - __free_old_xmit(sq, in_napi, &stats); + virtnet_free_old_xmit(sq, txq, in_napi, &stats); /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. */ - if (!stats.packets) + if (!stats.packets && !stats.napi_packets) return; u64_stats_update_begin(&sq->stats.syncp); - u64_stats_add(&sq->stats.bytes, stats.bytes); - u64_stats_add(&sq->stats.packets, stats.packets); + u64_stats_add(&sq->stats.bytes, stats.bytes + stats.napi_bytes); + u64_stats_add(&sq->stats.packets, stats.packets + stats.napi_packets); u64_stats_update_end(&sq->stats.syncp); } @@ -854,11 +1066,10 @@ static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) return false; } -static void check_sq_full_and_disable(struct virtnet_info *vi, - struct net_device *dev, - struct send_queue *sq) +static bool tx_may_stop(struct virtnet_info *vi, + struct net_device *dev, + struct send_queue *sq) { - bool use_napi = sq->napi.weight; int qnum; qnum = sq - vi->sq; @@ -874,21 +1085,485 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, * early means 16 slots are typically wasted. */ if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { - netif_stop_subqueue(dev, qnum); + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); + + netif_tx_stop_queue(txq); + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.stop); + u64_stats_update_end(&sq->stats.syncp); + + return true; + } + + return false; +} + +static void check_sq_full_and_disable(struct virtnet_info *vi, + struct net_device *dev, + struct send_queue *sq) +{ + bool use_napi = sq->napi.weight; + int qnum; + + qnum = sq - vi->sq; + + if (tx_may_stop(vi, dev, sq)) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); + if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ - free_old_xmit(sq, false); + free_old_xmit(sq, txq, false); if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { netif_start_subqueue(dev, qnum); + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.wake); + u64_stats_update_end(&sq->stats.syncp); virtqueue_disable_cb(sq->vq); } } } } +static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, + struct receive_queue *rq, void *buf, u32 len) +{ + struct xdp_buff *xdp; + u32 bufsize; + + xdp = (struct xdp_buff *)buf; + + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; + + if (unlikely(len > bufsize)) { + pr_debug("%s: rx error: len %u exceeds truesize %u\n", + vi->dev->name, len, bufsize); + DEV_STATS_INC(vi->dev, rx_length_errors); + xsk_buff_free(xdp); + return NULL; + } + + xsk_buff_set_size(xdp, len); + xsk_buff_dma_sync_for_cpu(xdp); + + return xdp; +} + +static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, + struct xdp_buff *xdp) +{ + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + unsigned int size; + + size = xdp->data_end - xdp->data_hard_start; + skb = napi_alloc_skb(&rq->napi, size); + if (unlikely(!skb)) { + xsk_buff_free(xdp); + return NULL; + } + + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + size = xdp->data_end - xdp->data_meta; + memcpy(__skb_put(skb, size), xdp->data_meta, size); + + if (metasize) { + __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); + } + + xsk_buff_free(xdp); + + return skb; +} + +static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, + struct receive_queue *rq, struct xdp_buff *xdp, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct bpf_prog *prog; + u32 ret; + + ret = XDP_PASS; + rcu_read_lock(); + prog = rcu_dereference(rq->xdp_prog); + if (prog) + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); + rcu_read_unlock(); + + switch (ret) { + case XDP_PASS: + return xsk_construct_skb(rq, xdp); + + case XDP_TX: + case XDP_REDIRECT: + return NULL; + + default: + /* drop packet */ + xsk_buff_free(xdp); + u64_stats_inc(&stats->drops); + return NULL; + } +} + +static void xsk_drop_follow_bufs(struct net_device *dev, + struct receive_queue *rq, + u32 num_buf, + struct virtnet_rq_stats *stats) +{ + struct xdp_buff *xdp; + u32 len; + + while (num_buf-- > 1) { + xdp = virtqueue_get_buf(rq->vq, &len); + if (unlikely(!xdp)) { + pr_debug("%s: rx error: %d buffers missing\n", + dev->name, num_buf); + DEV_STATS_INC(dev, rx_length_errors); + break; + } + u64_stats_add(&stats->bytes, len); + xsk_buff_free(xdp); + } +} + +static int xsk_append_merge_buffer(struct virtnet_info *vi, + struct receive_queue *rq, + struct sk_buff *head_skb, + u32 num_buf, + struct virtio_net_hdr_mrg_rxbuf *hdr, + struct virtnet_rq_stats *stats) +{ + struct sk_buff *curr_skb; + struct xdp_buff *xdp; + u32 len, truesize; + struct page *page; + void *buf; + + curr_skb = head_skb; + + while (--num_buf) { + buf = virtqueue_get_buf(rq->vq, &len); + if (unlikely(!buf)) { + pr_debug("%s: rx error: %d buffers out of %d missing\n", + vi->dev->name, num_buf, + virtio16_to_cpu(vi->vdev, + hdr->num_buffers)); + DEV_STATS_INC(vi->dev, rx_length_errors); + return -EINVAL; + } + + u64_stats_add(&stats->bytes, len); + + xdp = buf_to_xdp(vi, rq, buf, len); + if (!xdp) + goto err; + + buf = napi_alloc_frag(len); + if (!buf) { + xsk_buff_free(xdp); + goto err; + } + + memcpy(buf, xdp->data - vi->hdr_len, len); + + xsk_buff_free(xdp); + + page = virt_to_page(buf); + + truesize = len; + + curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, + buf, len, truesize); + if (!curr_skb) { + put_page(page); + goto err; + } + } + + return 0; + +err: + xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats); + return -EINVAL; +} + +static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi, + struct receive_queue *rq, struct xdp_buff *xdp, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct virtio_net_hdr_mrg_rxbuf *hdr; + struct bpf_prog *prog; + struct sk_buff *skb; + u32 ret, num_buf; + + hdr = xdp->data - vi->hdr_len; + num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); + + ret = XDP_PASS; + rcu_read_lock(); + prog = rcu_dereference(rq->xdp_prog); + /* TODO: support multi buffer. */ + if (prog && num_buf == 1) + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); + rcu_read_unlock(); + + switch (ret) { + case XDP_PASS: + skb = xsk_construct_skb(rq, xdp); + if (!skb) + goto drop_bufs; + + if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) { + dev_kfree_skb(skb); + goto drop; + } + + return skb; + + case XDP_TX: + case XDP_REDIRECT: + return NULL; + + default: + /* drop packet */ + xsk_buff_free(xdp); + } + +drop_bufs: + xsk_drop_follow_bufs(dev, rq, num_buf, stats); + +drop: + u64_stats_inc(&stats->drops); + return NULL; +} + +static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, + void *buf, u32 len, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct net_device *dev = vi->dev; + struct sk_buff *skb = NULL; + struct xdp_buff *xdp; + u8 flags; + + len -= vi->hdr_len; + + u64_stats_add(&stats->bytes, len); + + xdp = buf_to_xdp(vi, rq, buf, len); + if (!xdp) + return; + + if (unlikely(len < ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); + DEV_STATS_INC(dev, rx_length_errors); + xsk_buff_free(xdp); + return; + } + + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; + + if (!vi->mergeable_rx_bufs) + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); + else + skb = virtnet_receive_xsk_merge(dev, vi, rq, xdp, xdp_xmit, stats); + + if (skb) + virtnet_receive_done(vi, rq, skb, flags); +} + +static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq, + struct xsk_buff_pool *pool, gfp_t gfp) +{ + struct xdp_buff **xsk_buffs; + dma_addr_t addr; + int err = 0; + u32 len, i; + int num; + + xsk_buffs = rq->xsk_buffs; + + num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free); + if (!num) + return -ENOMEM; + + len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len; + + for (i = 0; i < num; ++i) { + /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space. + * We assume XDP_PACKET_HEADROOM is larger than hdr->len. + * (see function virtnet_xsk_pool_enable) + */ + addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len; + + sg_init_table(rq->sg, 1); + sg_fill_dma(rq->sg, addr, len); + + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, + xsk_buffs[i], NULL, gfp); + if (err) + goto err; + } + + return num; + +err: + for (; i < num; ++i) + xsk_buff_free(xsk_buffs[i]); + + return err; +} + +static void *virtnet_xsk_to_ptr(u32 len) +{ + unsigned long p; + + p = len << VIRTIO_XSK_FLAG_OFFSET; + + return virtnet_xmit_ptr_pack((void *)p, VIRTNET_XMIT_TYPE_XSK); +} + +static int virtnet_xsk_xmit_one(struct send_queue *sq, + struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + struct virtnet_info *vi; + dma_addr_t addr; + + vi = sq->vq->vdev->priv; + + addr = xsk_buff_raw_get_dma(pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len); + + sg_init_table(sq->sg, 2); + sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len); + sg_fill_dma(sq->sg + 1, addr, desc->len); + + return virtqueue_add_outbuf_premapped(sq->vq, sq->sg, 2, + virtnet_xsk_to_ptr(desc->len), + GFP_ATOMIC); +} + +static int virtnet_xsk_xmit_batch(struct send_queue *sq, + struct xsk_buff_pool *pool, + unsigned int budget, + u64 *kicks) +{ + struct xdp_desc *descs = pool->tx_descs; + bool kick = false; + u32 nb_pkts, i; + int err; + + budget = min_t(u32, budget, sq->vq->num_free); + + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); + if (!nb_pkts) + return 0; + + for (i = 0; i < nb_pkts; i++) { + err = virtnet_xsk_xmit_one(sq, pool, &descs[i]); + if (unlikely(err)) { + xsk_tx_completed(sq->xsk_pool, nb_pkts - i); + break; + } + + kick = true; + } + + if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) + (*kicks)++; + + return i; +} + +static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool, + int budget) +{ + struct virtnet_info *vi = sq->vq->vdev->priv; + struct virtnet_sq_free_stats stats = {}; + struct net_device *dev = vi->dev; + u64 kicks = 0; + int sent; + + /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of + * free_old_xmit(). + */ + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats); + + if (stats.xsk) + xsk_tx_completed(sq->xsk_pool, stats.xsk); + + sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks); + + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) + check_sq_full_and_disable(vi, vi->dev, sq); + + if (sent) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(vi->dev, sq - vi->sq); + txq_trans_cond_update(txq); + } + + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_add(&sq->stats.packets, stats.packets); + u64_stats_add(&sq->stats.bytes, stats.bytes); + u64_stats_add(&sq->stats.kicks, kicks); + u64_stats_add(&sq->stats.xdp_tx, sent); + u64_stats_update_end(&sq->stats.syncp); + + if (xsk_uses_need_wakeup(pool)) + xsk_set_tx_need_wakeup(pool); + + return sent; +} + +static void xsk_wakeup(struct send_queue *sq) +{ + if (napi_if_scheduled_mark_missed(&sq->napi)) + return; + + local_bh_disable(); + virtqueue_napi_schedule(&sq->napi, sq->vq); + local_bh_enable(); +} + +static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct send_queue *sq; + + if (!netif_running(dev)) + return -ENETDOWN; + + if (qid >= vi->curr_queue_pairs) + return -EINVAL; + + sq = &vi->sq[qid]; + + xsk_wakeup(sq); + return 0; +} + +static void virtnet_xsk_completed(struct send_queue *sq, int num) +{ + xsk_tx_completed(sq->xsk_pool, num); + + /* If this is called by rx poll, start_xmit and xdp xmit we should + * wakeup the tx napi to consume the xsk tx queue, because the tx + * interrupt may not be triggered. + */ + xsk_wakeup(sq); +} + static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct send_queue *sq, struct xdp_frame *xdpf) @@ -929,8 +1604,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, skb_frag_size(frag), skb_frag_off(frag)); } - err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1, - xdp_to_ptr(xdpf), GFP_ATOMIC); + err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP); if (unlikely(err)) return -ENOSPC; /* Caller handle free/refcnt */ @@ -1003,7 +1677,8 @@ static int virtnet_xdp_xmit(struct net_device *dev, } /* Free up any pending old buffers before queueing new ones. */ - __free_old_xmit(sq, false, &stats); + virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), + false, &stats); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; @@ -1105,7 +1780,7 @@ static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { - return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0; + return vi->xdp_enabled ? XDP_PACKET_HEADROOM : 0; } /* We copy the packet for XDP in the following cases: @@ -1169,7 +1844,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, } /* Headroom does not contribute to packet length */ - *len = page_off - VIRTIO_XDP_HEADROOM; + *len = page_off - XDP_PACKET_HEADROOM; return page; err_buf: __free_pages(page, 0); @@ -1225,6 +1900,10 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Partially checksummed packets must be dropped. */ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + goto err_xdp; + buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1300,6 +1979,11 @@ static struct sk_buff *receive_small(struct net_device *dev, struct page *page = virt_to_head_page(buf); struct sk_buff *skb; + /* We passed the address of virtnet header to virtio-core, + * so truncate the padding. + */ + buf -= VIRTNET_RX_PAD + xdp_headroom; + len -= vi->hdr_len; u64_stats_add(&stats->bytes, len); @@ -1452,8 +2136,8 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, void *ctx; xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); - xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM, - VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); + xdp_prepare_buff(xdp, buf - XDP_PACKET_HEADROOM, + XDP_PACKET_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); if (!*num_buf) return 0; @@ -1542,6 +2226,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, if (unlikely(hdr->hdr.gso_type)) return NULL; + /* Partially checksummed packets must be dropped. */ + if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + return NULL; + /* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the @@ -1566,12 +2254,12 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, /* linearize data for XDP */ xdp_page = xdp_linearize_page(rq, num_buf, *page, offset, - VIRTIO_XDP_HEADROOM, + XDP_PACKET_HEADROOM, len); if (!xdp_page) return NULL; } else { - xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + + xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); if (*len + xdp_room > PAGE_SIZE) return NULL; @@ -1580,7 +2268,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, if (!xdp_page) return NULL; - memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM, + memcpy(page_address(xdp_page) + XDP_PACKET_HEADROOM, page_address(*page) + offset, *len); } @@ -1590,7 +2278,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, *page = xdp_page; - return page_address(*page) + VIRTIO_XDP_HEADROOM; + return page_address(*page) + XDP_PACKET_HEADROOM; } static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, @@ -1653,6 +2341,49 @@ err_xdp: return NULL; } +static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, + struct sk_buff *curr_skb, + struct page *page, void *buf, + int len, int truesize) +{ + int num_skb_frags; + int offset; + + num_skb_frags = skb_shinfo(curr_skb)->nr_frags; + if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { + struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); + + if (unlikely(!nskb)) + return NULL; + + if (curr_skb == head_skb) + skb_shinfo(curr_skb)->frag_list = nskb; + else + curr_skb->next = nskb; + curr_skb = nskb; + head_skb->truesize += nskb->truesize; + num_skb_frags = 0; + } + + if (curr_skb != head_skb) { + head_skb->data_len += len; + head_skb->len += len; + head_skb->truesize += truesize; + } + + offset = buf - page_address(page); + if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { + put_page(page); + skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, + len, truesize); + } else { + skb_add_rx_frag(curr_skb, num_skb_frags, page, + offset, len, truesize); + } + + return curr_skb; +} + static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, @@ -1702,8 +2433,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(!curr_skb)) goto err_skb; while (--num_buf) { - int num_skb_frags; - buf = virtnet_rq_get_buf(rq, &len, &ctx); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", @@ -1728,34 +2457,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; } - num_skb_frags = skb_shinfo(curr_skb)->nr_frags; - if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { - struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); - - if (unlikely(!nskb)) - goto err_skb; - if (curr_skb == head_skb) - skb_shinfo(curr_skb)->frag_list = nskb; - else - curr_skb->next = nskb; - curr_skb = nskb; - head_skb->truesize += nskb->truesize; - num_skb_frags = 0; - } - if (curr_skb != head_skb) { - head_skb->data_len += len; - head_skb->len += len; - head_skb->truesize += truesize; - } - offset = buf - page_address(page); - if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { - put_page(page); - skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, - len, truesize); - } else { - skb_add_rx_frag(curr_skb, num_skb_frags, page, - offset, len, truesize); - } + curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, + buf, len, truesize); + if (!curr_skb) + goto err_skb; } ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len); @@ -1800,38 +2505,17 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); } -static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, - void *buf, unsigned int len, void **ctx, - unsigned int *xdp_xmit, - struct virtnet_rq_stats *stats) +static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, + struct sk_buff *skb, u8 flags) { - struct net_device *dev = vi->dev; - struct sk_buff *skb; struct virtio_net_common_hdr *hdr; - - if (unlikely(len < vi->hdr_len + ETH_HLEN)) { - pr_debug("%s: short packet %i\n", dev->name, len); - DEV_STATS_INC(dev, rx_length_errors); - virtnet_rq_free_buf(vi, rq, buf); - return; - } - - if (vi->mergeable_rx_bufs) - skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, - stats); - else if (vi->big_packets) - skb = receive_big(dev, vi, rq, buf, len, stats); - else - skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); - - if (unlikely(!skb)) - return; + struct net_device *dev = vi->dev; hdr = skb_vnet_common_hdr(skb); if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); - if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) + if (flags & VIRTIO_NET_HDR_F_DATA_VALID) skb->ip_summed = CHECKSUM_UNNECESSARY; if (virtio_net_hdr_to_skb(skb, &hdr->hdr, @@ -1855,6 +2539,45 @@ frame_err: dev_kfree_skb(skb); } +static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, + void *buf, unsigned int len, void **ctx, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + struct net_device *dev = vi->dev; + struct sk_buff *skb; + u8 flags; + + if (unlikely(len < vi->hdr_len + ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); + DEV_STATS_INC(dev, rx_length_errors); + virtnet_rq_free_buf(vi, rq, buf); + return; + } + + /* 1. Save the flags early, as the XDP program might overwrite them. + * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID + * stay valid after XDP processing. + * 2. XDP doesn't work with partially checksummed packets (refer to + * virtnet_xdp_set()), so packets marked as + * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. + */ + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; + + if (vi->mergeable_rx_bufs) + skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, + stats); + else if (vi->big_packets) + skb = receive_big(dev, vi, rq, buf, len, stats); + else + skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); + + if (unlikely(!skb)) + return; + + virtnet_receive_done(vi, rq, skb, flags); +} + /* Unlike mergeable buffers, all buffers are allocated to the * same size, except for the headroom. For this reason we do * not need to use mergeable_len_to_ctx here - it is enough @@ -1872,17 +2595,20 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, len = SKB_DATA_ALIGN(len) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (unlikely(!skb_page_frag_refill(len, &rq->alloc_frag, gfp))) + return -ENOMEM; + buf = virtnet_rq_alloc(rq, len, gfp); if (unlikely(!buf)) return -ENOMEM; - virtnet_rq_init_one_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom, - vi->hdr_len + GOOD_PACKET_LEN); + buf += VIRTNET_RX_PAD + xdp_headroom; - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); + virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN); + + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { - if (rq->do_dma) - virtnet_rq_unmap(rq, buf, 0); + virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } @@ -1973,6 +2699,12 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, */ len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) + return -ENOMEM; + + if (!alloc_frag->offset && len + room + sizeof(struct virtnet_rq_dma) > alloc_frag->size) + len -= sizeof(struct virtnet_rq_dma); + buf = virtnet_rq_alloc(rq, len + room, gfp); if (unlikely(!buf)) return -ENOMEM; @@ -1994,10 +2726,9 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, virtnet_rq_init_one_sg(rq, buf, len); ctx = mergeable_len_to_ctx(len + room, headroom); - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { - if (rq->do_dma) - virtnet_rq_unmap(rq, buf, 0); + virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } @@ -2015,7 +2746,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { int err; - bool oom; + + if (rq->xsk_pool) { + err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk_pool, gfp); + goto kick; + } do { if (vi->mergeable_rx_bufs) @@ -2025,10 +2760,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, else err = add_recvbuf_small(vi, rq, gfp); - oom = err == -ENOMEM; if (err) break; } while (rq->vq->num_free); + +kick: if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) { unsigned long flags; @@ -2037,7 +2773,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, u64_stats_update_end_irqrestore(&rq->stats.syncp, flags); } - return !oom; + return err != -ENOMEM; } static void skb_recv_done(struct virtqueue *rvq) @@ -2049,7 +2785,8 @@ static void skb_recv_done(struct virtqueue *rvq) virtqueue_napi_schedule(&rq->napi, rvq); } -static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) +static void virtnet_napi_do_enable(struct virtqueue *vq, + struct napi_struct *napi) { napi_enable(napi); @@ -2062,10 +2799,21 @@ static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) local_bh_enable(); } -static void virtnet_napi_tx_enable(struct virtnet_info *vi, - struct virtqueue *vq, - struct napi_struct *napi) +static void virtnet_napi_enable(struct receive_queue *rq) { + struct virtnet_info *vi = rq->vq->vdev->priv; + int qidx = vq2rxq(rq->vq); + + virtnet_napi_do_enable(rq->vq, &rq->napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi); +} + +static void virtnet_napi_tx_enable(struct send_queue *sq) +{ + struct virtnet_info *vi = sq->vq->vdev->priv; + struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); + if (!napi->weight) return; @@ -2077,13 +2825,30 @@ static void virtnet_napi_tx_enable(struct virtnet_info *vi, return; } - return virtnet_napi_enable(vq, napi); + virtnet_napi_do_enable(sq->vq, napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi); } -static void virtnet_napi_tx_disable(struct napi_struct *napi) +static void virtnet_napi_tx_disable(struct send_queue *sq) { - if (napi->weight) + struct virtnet_info *vi = sq->vq->vdev->priv; + struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); + + if (napi->weight) { + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL); napi_disable(napi); + } +} + +static void virtnet_napi_disable(struct receive_queue *rq) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + struct napi_struct *napi = &rq->napi; + int qidx = vq2rxq(rq->vq); + + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(napi); } static void refill_work(struct work_struct *work) @@ -2096,9 +2861,23 @@ static void refill_work(struct work_struct *work) for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; + /* + * When queue API support is added in the future and the call + * below becomes napi_disable_locked, this driver will need to + * be refactored. + * + * One possible solution would be to: + * - cancel refill_work with cancel_delayed_work (note: + * non-sync) + * - cancel refill_work with cancel_delayed_work_sync in + * virtnet_remove after the netdev is unregistered + * - wrap all of the work in a lock (perhaps the netdev + * instance lock) + * - check netif_running() and return early to avoid a race + */ napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); - virtnet_napi_enable(rq->vq, &rq->napi); + virtnet_napi_do_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. @@ -2108,32 +2887,68 @@ static void refill_work(struct work_struct *work) } } -static int virtnet_receive(struct receive_queue *rq, int budget, - unsigned int *xdp_xmit) +static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, + struct receive_queue *rq, + int budget, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) +{ + unsigned int len; + int packets = 0; + void *buf; + + while (packets < budget) { + buf = virtqueue_get_buf(rq->vq, &len); + if (!buf) + break; + + virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats); + packets++; + } + + return packets; +} + +static int virtnet_receive_packets(struct virtnet_info *vi, + struct receive_queue *rq, + int budget, + unsigned int *xdp_xmit, + struct virtnet_rq_stats *stats) { - struct virtnet_info *vi = rq->vq->vdev->priv; - struct virtnet_rq_stats stats = {}; unsigned int len; int packets = 0; void *buf; - int i; if (!vi->big_packets || vi->mergeable_rx_bufs) { void *ctx; - while (packets < budget && (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { - receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats); + receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); packets++; } } else { while (packets < budget && - (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) { - receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats); + (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { + receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); packets++; } } + return packets; +} + +static int virtnet_receive(struct receive_queue *rq, int budget, + unsigned int *xdp_xmit) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + struct virtnet_rq_stats stats = {}; + int i, packets; + + if (rq->xsk_pool) + packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats); + else + packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); + if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { spin_lock(&vi->refill_lock); @@ -2145,7 +2960,7 @@ static int virtnet_receive(struct receive_queue *rq, int budget, u64_stats_set(&stats.packets, packets); u64_stats_update_begin(&rq->stats.syncp); - for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) { + for (i = 0; i < ARRAY_SIZE(virtnet_rq_stats_desc); i++) { size_t offset = virtnet_rq_stats_desc[i].offset; u64_stats_t *item, *src; @@ -2153,12 +2968,16 @@ static int virtnet_receive(struct receive_queue *rq, int budget, src = (u64_stats_t *)((u8 *)&stats + offset); u64_stats_add(item, u64_stats_read(src)); } + + u64_stats_add(&rq->stats.packets, u64_stats_read(&stats.packets)); + u64_stats_add(&rq->stats.bytes, u64_stats_read(&stats.bytes)); + u64_stats_update_end(&rq->stats.syncp); return packets; } -static void virtnet_poll_cleantx(struct receive_queue *rq) +static void virtnet_poll_cleantx(struct receive_queue *rq, int budget) { struct virtnet_info *vi = rq->vq->vdev->priv; unsigned int index = vq2rxq(rq->vq); @@ -2176,11 +2995,17 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) do { virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); + free_old_xmit(sq, txq, !!budget); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { + if (netif_tx_queue_stopped(txq)) { + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.wake); + u64_stats_update_end(&sq->stats.syncp); + } netif_tx_wake_queue(txq); + } __netif_tx_unlock(txq); } @@ -2193,14 +3018,15 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue if (!rq->packets_in_napi) return; - u64_stats_update_begin(&rq->stats.syncp); + /* Don't need protection when fetching stats, since fetcher and + * updater of the stats are in same context + */ dim_update_sample(rq->calls, u64_stats_read(&rq->stats.packets), u64_stats_read(&rq->stats.bytes), &cur_sample); - u64_stats_update_end(&rq->stats.syncp); - net_dim(&rq->dim, cur_sample); + net_dim(&rq->dim, &cur_sample); rq->packets_in_napi = 0; } @@ -2214,7 +3040,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget) unsigned int xdp_xmit = 0; bool napi_complete; - virtnet_poll_cleantx(rq); + virtnet_poll_cleantx(rq, budget); received = virtnet_receive(rq, budget, &xdp_xmit); rq->packets_in_napi += received; @@ -2225,6 +3051,10 @@ static int virtnet_poll(struct napi_struct *napi, int budget) /* Out of packets? */ if (received < budget) { napi_complete = virtqueue_napi_complete(napi, rq->vq, received); + /* Intentionally not taking dim_lock here. This may result in a + * spurious net_dim call. But if that happens virtnet_rx_dim_work + * will not act on the scheduled work. + */ if (napi_complete && rq->dim_enabled) virtnet_rx_dim_update(vi, rq); } @@ -2244,8 +3074,8 @@ static int virtnet_poll(struct napi_struct *napi, int budget) static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) { - virtnet_napi_tx_disable(&vi->sq[qp_index].napi); - napi_disable(&vi->rq[qp_index].napi); + virtnet_napi_tx_disable(&vi->sq[qp_index]); + virtnet_napi_disable(&vi->rq[qp_index]); xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); } @@ -2264,8 +3094,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) if (err < 0) goto err_xdp_reg_mem_model; - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); - virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); + virtnet_napi_enable(&vi->rq[qp_index]); + virtnet_napi_tx_enable(&vi->sq[qp_index]); return 0; @@ -2274,6 +3104,32 @@ err_xdp_reg_mem_model: return err; } +static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim) +{ + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return; + net_dim_work_cancel(dim); +} + +static void virtnet_update_settings(struct virtnet_info *vi) +{ + u32 speed; + u8 duplex; + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) + return; + + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); + + if (ethtool_validate_speed(speed)) + vi->speed = speed; + + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); + + if (ethtool_validate_duplex(duplex)) + vi->duplex = duplex; +} + static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); @@ -2292,6 +3148,15 @@ static int virtnet_open(struct net_device *dev) goto err_enable_qp; } + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { + if (vi->status & VIRTIO_NET_S_LINK_UP) + netif_carrier_on(vi->dev); + virtio_config_driver_enable(vi->vdev); + } else { + vi->status = VIRTIO_NET_S_LINK_UP; + netif_carrier_on(dev); + } + return 0; err_enable_qp: @@ -2300,7 +3165,7 @@ err_enable_qp: for (i--; i >= 0; i--) { virtnet_disable_queue_pair(vi, i); - cancel_work_sync(&vi->rq[i].dim.work); + virtnet_cancel_dim(vi, &vi->rq[i].dim); } return err; @@ -2312,7 +3177,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) struct virtnet_info *vi = sq->vq->vdev->priv; unsigned int index = vq2txq(sq->vq); struct netdev_queue *txq; - int opaque; + int opaque, xsk_done = 0; bool done; if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { @@ -2324,10 +3189,25 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) + if (sq->xsk_pool) + xsk_done = virtnet_xsk_xmit(sq, sq->xsk_pool, budget); + else + free_old_xmit(sq, txq, !!budget); + + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { + if (netif_tx_queue_stopped(txq)) { + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.wake); + u64_stats_update_end(&sq->stats.syncp); + } netif_tx_wake_queue(txq); + } + + if (xsk_done >= budget) { + __netif_tx_unlock(txq); + return budget; + } opaque = virtqueue_enable_cb_prepare(sq->vq); @@ -2352,7 +3232,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) return 0; } -static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) +static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) { struct virtio_net_hdr_mrg_rxbuf *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; @@ -2396,7 +3276,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) return num_sg; num_sg++; } - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); + + return virtnet_add_outbuf(sq, num_sg, skb, + orphan ? VIRTNET_XMIT_TYPE_SKB_ORPHAN : VIRTNET_XMIT_TYPE_SKB); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -2406,24 +3288,20 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) struct send_queue *sq = &vi->sq[qnum]; int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); - bool kick = !netdev_xmit_more(); + bool xmit_more = netdev_xmit_more(); bool use_napi = sq->napi.weight; + bool kick; - /* Free up any pending old buffers before queueing new ones. */ - do { - if (use_napi) - virtqueue_disable_cb(sq->vq); - - free_old_xmit(sq, false); - - } while (use_napi && kick && - unlikely(!virtqueue_enable_cb_delayed(sq->vq))); + if (!use_napi) + free_old_xmit(sq, txq, false); + else + virtqueue_disable_cb(sq->vq); /* timestamp packet in software */ skb_tx_timestamp(skb); /* Try to transmit */ - err = xmit_skb(sq, skb); + err = xmit_skb(sq, skb, !use_napi); /* This should not happen! */ if (unlikely(err)) { @@ -2443,9 +3321,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); } - check_sq_full_and_disable(vi, dev, sq); + if (use_napi) + tx_may_stop(vi, dev, sq); + else + check_sq_full_and_disable(vi, dev,sq); - if (kick || netif_xmit_stopped(txq)) { + kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : + !xmit_more || netif_xmit_stopped(txq); + if (kick) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks); @@ -2453,45 +3336,110 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) } } + if (use_napi && kick && unlikely(!virtqueue_enable_cb_delayed(sq->vq))) + virtqueue_napi_schedule(&sq->napi, sq->vq); + return NETDEV_TX_OK; } +static void __virtnet_rx_pause(struct virtnet_info *vi, + struct receive_queue *rq) +{ + bool running = netif_running(vi->dev); + + if (running) { + virtnet_napi_disable(rq); + virtnet_cancel_dim(vi, &rq->dim); + } +} + +static void virtnet_rx_pause_all(struct virtnet_info *vi) +{ + int i; + + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock. + */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + for (i = 0; i < vi->max_queue_pairs; i++) + __virtnet_rx_pause(vi, &vi->rq[i]); +} + +static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) +{ + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock. + */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + __virtnet_rx_pause(vi, rq); +} + +static void __virtnet_rx_resume(struct virtnet_info *vi, + struct receive_queue *rq, + bool refill) +{ + bool running = netif_running(vi->dev); + bool schedule_refill = false; + + if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) + schedule_refill = true; + if (running) + virtnet_napi_enable(rq); + + if (schedule_refill) + schedule_delayed_work(&vi->refill, 0); +} + +static void virtnet_rx_resume_all(struct virtnet_info *vi) +{ + int i; + + enable_delayed_refill(vi); + for (i = 0; i < vi->max_queue_pairs; i++) { + if (i < vi->curr_queue_pairs) + __virtnet_rx_resume(vi, &vi->rq[i], true); + else + __virtnet_rx_resume(vi, &vi->rq[i], false); + } +} + +static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) +{ + enable_delayed_refill(vi); + __virtnet_rx_resume(vi, rq, true); +} + static int virtnet_rx_resize(struct virtnet_info *vi, struct receive_queue *rq, u32 ring_num) { - bool running = netif_running(vi->dev); int err, qindex; qindex = rq - vi->rq; - if (running) { - napi_disable(&rq->napi); - cancel_work_sync(&rq->dim.work); - } + virtnet_rx_pause(vi, rq); - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); - if (!try_fill_recv(vi, rq, GFP_KERNEL)) - schedule_delayed_work(&vi->refill, 0); - - if (running) - virtnet_napi_enable(rq->vq, &rq->napi); + virtnet_rx_resume(vi, rq); return err; } -static int virtnet_tx_resize(struct virtnet_info *vi, - struct send_queue *sq, u32 ring_num) +static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) { bool running = netif_running(vi->dev); struct netdev_queue *txq; - int err, qindex; + int qindex; qindex = sq - vi->sq; if (running) - virtnet_napi_tx_disable(&sq->napi); + virtnet_napi_tx_disable(sq); txq = netdev_get_tx_queue(vi->dev, qindex); @@ -2507,10 +3455,17 @@ static int virtnet_tx_resize(struct virtnet_info *vi, netif_stop_subqueue(vi->dev, qindex); __netif_tx_unlock_bh(txq); +} - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); - if (err) - netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); +static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq) +{ + bool running = netif_running(vi->dev); + struct netdev_queue *txq; + int qindex; + + qindex = sq - vi->sq; + + txq = netdev_get_tx_queue(vi->dev, qindex); __netif_tx_lock_bh(txq); sq->reset = false; @@ -2518,7 +3473,25 @@ static int virtnet_tx_resize(struct virtnet_info *vi, __netif_tx_unlock_bh(txq); if (running) - virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); + virtnet_napi_tx_enable(sq); +} + +static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, + u32 ring_num) +{ + int qindex, err; + + qindex = sq - vi->sq; + + virtnet_tx_pause(vi, sq); + + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); + if (err) + netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); + + virtnet_tx_resume(vi, sq); + return err; } @@ -2527,16 +3500,19 @@ static int virtnet_tx_resize(struct virtnet_info *vi, * supported by the hypervisor, as indicated by feature bits, should * never fail unless improperly formatted. */ -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, - struct scatterlist *out) +static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd, + struct scatterlist *out, + struct scatterlist *in) { - struct scatterlist *sgs[4], hdr, stat; - unsigned out_num = 0, tmp; + struct scatterlist *sgs[5], hdr, stat; + u32 out_num = 0, tmp, in_num = 0; + bool ok; int ret; /* Caller should know better */ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); + mutex_lock(&vi->cvq_lock); vi->ctrl->status = ~0; vi->ctrl->hdr.class = class; vi->ctrl->hdr.cmd = cmd; @@ -2549,18 +3525,22 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, /* Add return status. */ sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status)); - sgs[out_num] = &stat; + sgs[out_num + in_num++] = &stat; + + if (in) + sgs[out_num + in_num++] = in; - BUG_ON(out_num + 1 > ARRAY_SIZE(sgs)); - ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC); + BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); + ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); if (ret < 0) { dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret); + mutex_unlock(&vi->cvq_lock); return false; } if (unlikely(!virtqueue_kick(vi->cvq))) - return vi->ctrl->status == VIRTIO_NET_OK; + goto unlock; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. @@ -2571,7 +3551,16 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, cpu_relax(); } - return vi->ctrl->status == VIRTIO_NET_OK; +unlock: + ok = vi->ctrl->status == VIRTIO_NET_OK; + mutex_unlock(&vi->cvq_lock); + return ok; +} + +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, + struct scatterlist *out) +{ + return virtnet_send_command_reply(vi, class, cmd, out, NULL); } static int virtnet_set_mac_address(struct net_device *dev, void *p) @@ -2663,49 +3652,93 @@ static void virtnet_stats(struct net_device *dev, static void virtnet_ack_link_announce(struct virtnet_info *vi) { - rtnl_lock(); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); - rtnl_unlock(); } -static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) +static bool virtnet_commit_rss_command(struct virtnet_info *vi); + +static void virtnet_rss_update_by_qpairs(struct virtnet_info *vi, u16 queue_pairs) { - struct scatterlist sg; + u32 indir_val = 0; + int i = 0; + + for (; i < vi->rss_indir_table_size; ++i) { + indir_val = ethtool_rxfh_indir_default(i, queue_pairs); + vi->rss_hdr->indirection_table[i] = cpu_to_le16(indir_val); + } + vi->rss_trailer.max_tx_vq = cpu_to_le16(queue_pairs); +} + +static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) +{ + struct virtio_net_ctrl_mq *mq __free(kfree) = NULL; + struct virtio_net_rss_config_hdr *old_rss_hdr; + struct virtio_net_rss_config_trailer old_rss_trailer; struct net_device *dev = vi->dev; + struct scatterlist sg; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; - vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs); - sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq)); + /* Firstly check if we need update rss. Do updating if both (1) rss enabled and + * (2) no user configuration. + * + * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, + * the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs + * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. + */ + if (vi->has_rss && !netif_is_rxfh_configured(dev)) { + old_rss_hdr = vi->rss_hdr; + old_rss_trailer = vi->rss_trailer; + vi->rss_hdr = devm_kzalloc(&dev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); + if (!vi->rss_hdr) { + vi->rss_hdr = old_rss_hdr; + return -ENOMEM; + } + + *vi->rss_hdr = *old_rss_hdr; + virtnet_rss_update_by_qpairs(vi, queue_pairs); + + if (!virtnet_commit_rss_command(vi)) { + /* restore ctrl_rss if commit_rss_command failed */ + devm_kfree(&dev->dev, vi->rss_hdr); + vi->rss_hdr = old_rss_hdr; + vi->rss_trailer = old_rss_trailer; + + dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n", + queue_pairs); + return -EINVAL; + } + devm_kfree(&dev->dev, old_rss_hdr); + goto succ; + } + + mq = kzalloc(sizeof(*mq), GFP_KERNEL); + if (!mq) + return -ENOMEM; + + mq->virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs); + sg_init_one(&sg, mq, sizeof(*mq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; - } else { - vi->curr_queue_pairs = queue_pairs; - /* virtnet_open() will refill when device is going to up. */ - if (dev->flags & IFF_UP) - schedule_delayed_work(&vi->refill, 0); } +succ: + vi->curr_queue_pairs = queue_pairs; + /* virtnet_open() will refill when device is going to up. */ + spin_lock_bh(&vi->refill_lock); + if (dev->flags & IFF_UP && vi->refill_enabled) + schedule_delayed_work(&vi->refill, 0); + spin_unlock_bh(&vi->refill_lock); return 0; } -static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) -{ - int err; - - rtnl_lock(); - err = _virtnet_set_queues(vi, queue_pairs); - rtnl_unlock(); - return err; -} - static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); @@ -2715,12 +3748,22 @@ static int virtnet_close(struct net_device *dev) disable_delayed_refill(vi); /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); + /* Prevent the config change callback from changing carrier + * after close + */ + virtio_config_driver_disable(vi->vdev); + /* Stop getting status/speed updates: we don't care until next + * open + */ + cancel_work_sync(&vi->config_work); for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_disable_queue_pair(vi, i); - cancel_work_sync(&vi->rq[i].dim.work); + virtnet_cancel_dim(vi, &vi->rq[i].dim); } + netif_carrier_off(dev); + return 0; } @@ -2728,6 +3771,7 @@ static void virtnet_rx_mode_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, rx_mode_work); + u8 *promisc_allmulti __free(kfree) = NULL; struct net_device *dev = vi->dev; struct scatterlist sg[2]; struct virtio_net_ctrl_mac *mac_data; @@ -2741,24 +3785,29 @@ static void virtnet_rx_mode_work(struct work_struct *work) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; - rtnl_lock(); + promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_KERNEL); + if (!promisc_allmulti) { + dev_warn(&dev->dev, "Failed to set RX mode, no memory.\n"); + return; + } - vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0); - vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0); + rtnl_lock(); - sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc)); + *promisc_allmulti = !!(dev->flags & IFF_PROMISC); + sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, sg)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", - vi->ctrl->promisc ? "en" : "dis"); + *promisc_allmulti ? "en" : "dis"); - sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti)); + *promisc_allmulti = !!(dev->flags & IFF_ALLMULTI); + sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, sg)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", - vi->ctrl->allmulti ? "en" : "dis"); + *promisc_allmulti ? "en" : "dis"); netif_addr_lock_bh(dev); @@ -2819,10 +3868,15 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); + __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; - vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid); - sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid)); + _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); + if (!_vid) + return -ENOMEM; + + *_vid = cpu_to_virtio16(vi->vdev, vid); + sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD, &sg)) @@ -2834,10 +3888,15 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); + __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; - vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid); - sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid)); + _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); + if (!_vid) + return -ENOMEM; + + *_vid = cpu_to_virtio16(vi->vdev, vid); + sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_DEL, &sg)) @@ -2864,7 +3923,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) cpumask_var_t mask; int stragglers; int group_size; - int i, j, cpu; + int i, start = 0, cpu; int num_cpu; int stride; @@ -2878,16 +3937,18 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); - for (j = 0; j < group_size; j++) { + for_each_online_cpu_wrap(cpu, start) { + if (!group_size--) { + start = cpu; + break; + } cpumask_set_cpu(cpu, mask); - cpu = cpumask_next_wrap(cpu, cpu_online_mask, - nr_cpu_ids, false); } + virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); @@ -2950,12 +4011,17 @@ static void virtnet_cpu_notif_remove(struct virtnet_info *vi) static int virtnet_send_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 vqn, u32 max_usecs, u32 max_packets) { + struct virtio_net_ctrl_coal_vq *coal_vq __free(kfree) = NULL; struct scatterlist sgs; - vi->ctrl->coal_vq.vqn = cpu_to_le16(vqn); - vi->ctrl->coal_vq.coal.max_usecs = cpu_to_le32(max_usecs); - vi->ctrl->coal_vq.coal.max_packets = cpu_to_le32(max_packets); - sg_init_one(&sgs, &vi->ctrl->coal_vq, sizeof(vi->ctrl->coal_vq)); + coal_vq = kzalloc(sizeof(*coal_vq), GFP_KERNEL); + if (!coal_vq) + return -ENOMEM; + + coal_vq->vqn = cpu_to_le16(vqn); + coal_vq->coal.max_usecs = cpu_to_le32(max_usecs); + coal_vq->coal.max_packets = cpu_to_le32(max_packets); + sg_init_one(&sgs, coal_vq, sizeof(*coal_vq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_VQ_SET, @@ -2971,6 +4037,9 @@ static int virtnet_send_rx_ctrl_coal_vq_cmd(struct virtnet_info *vi, { int err; + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return -EOPNOTSUPP; + err = virtnet_send_ctrl_coal_vq_cmd(vi, rxq2vq(queue), max_usecs, max_packets); if (err) @@ -2988,6 +4057,9 @@ static int virtnet_send_tx_ctrl_coal_vq_cmd(struct virtnet_info *vi, { int err; + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return -EOPNOTSUPP; + err = virtnet_send_ctrl_coal_vq_cmd(vi, txq2vq(queue), max_usecs, max_packets); if (err) @@ -3056,7 +4128,11 @@ static int virtnet_set_ringparam(struct net_device *dev, err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_tx.max_usecs, vi->intr_coal_tx.max_packets); - if (err) + + /* Don't break the tx resize action if the vq coalescing is not + * supported. The same is true for rx resize below. + */ + if (err && err != -EOPNOTSUPP) return err; } @@ -3066,10 +4142,12 @@ static int virtnet_set_ringparam(struct net_device *dev, return err; /* The reason is same as the transmit virtqueue reset */ + mutex_lock(&vi->rq[i].dim_lock); err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_rx.max_usecs, vi->intr_coal_rx.max_packets); - if (err) + mutex_unlock(&vi->rq[i].dim_lock); + if (err && err != -EOPNOTSUPP) return err; } } @@ -3080,54 +4158,39 @@ static int virtnet_set_ringparam(struct net_device *dev, static bool virtnet_commit_rss_command(struct virtnet_info *vi) { struct net_device *dev = vi->dev; - struct scatterlist sgs[4]; - unsigned int sg_buf_size; + struct scatterlist sgs[2]; /* prepare sgs */ - sg_init_table(sgs, 4); - - sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table); - sg_set_buf(&sgs[0], &vi->ctrl->rss, sg_buf_size); - - sg_buf_size = sizeof(uint16_t) * (vi->ctrl->rss.indirection_table_mask + 1); - sg_set_buf(&sgs[1], vi->ctrl->rss.indirection_table, sg_buf_size); - - sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key) - - offsetof(struct virtio_net_ctrl_rss, max_tx_vq); - sg_set_buf(&sgs[2], &vi->ctrl->rss.max_tx_vq, sg_buf_size); - - sg_buf_size = vi->rss_key_size; - sg_set_buf(&sgs[3], vi->ctrl->rss.key, sg_buf_size); + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], vi->rss_hdr, virtnet_rss_hdr_size(vi)); + sg_set_buf(&sgs[1], &vi->rss_trailer, virtnet_rss_trailer_size(vi)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG - : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) { - dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n"); - return false; - } + : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) + goto err; + return true; + +err: + dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n"); + return false; + } static void virtnet_init_default_rss(struct virtnet_info *vi) { - u32 indir_val = 0; - int i = 0; - - vi->ctrl->rss.hash_types = vi->rss_hash_types_supported; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_supported); vi->rss_hash_types_saved = vi->rss_hash_types_supported; - vi->ctrl->rss.indirection_table_mask = vi->rss_indir_table_size - ? vi->rss_indir_table_size - 1 : 0; - vi->ctrl->rss.unclassified_queue = 0; + vi->rss_hdr->indirection_table_mask = vi->rss_indir_table_size + ? cpu_to_le16(vi->rss_indir_table_size - 1) : 0; + vi->rss_hdr->unclassified_queue = 0; - for (; i < vi->rss_indir_table_size; ++i) { - indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs); - vi->ctrl->rss.indirection_table[i] = indir_val; - } + virtnet_rss_update_by_qpairs(vi, vi->curr_queue_pairs); - vi->ctrl->rss.max_tx_vq = vi->has_rss ? vi->curr_queue_pairs : 0; - vi->ctrl->rss.hash_key_length = vi->rss_key_size; + vi->rss_trailer.hash_key_length = vi->rss_key_size; - netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size); + netdev_rss_key_fill(vi->rss_hash_key_data, vi->rss_key_size); } static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info) @@ -3238,7 +4301,7 @@ static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc * if (new_hashtypes != vi->rss_hash_types_saved) { vi->rss_hash_types_saved = new_hashtypes; - vi->ctrl->rss.hash_types = vi->rss_hash_types_saved; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); if (vi->dev->features & NETIF_F_RXHASH) return virtnet_commit_rss_command(vi); } @@ -3283,7 +4346,7 @@ static int virtnet_set_channels(struct net_device *dev, return -EINVAL; cpus_read_lock(); - err = _virtnet_set_queues(vi, queue_pairs); + err = virtnet_set_queues(vi, queue_pairs); if (err) { cpus_read_unlock(); goto err; @@ -3297,25 +4360,654 @@ static int virtnet_set_channels(struct net_device *dev, return err; } +static void virtnet_stats_sprintf(u8 **p, const char *fmt, const char *noq_fmt, + int num, int qid, const struct virtnet_stat_desc *desc) +{ + int i; + + if (qid < 0) { + for (i = 0; i < num; ++i) + ethtool_sprintf(p, noq_fmt, desc[i].desc); + } else { + for (i = 0; i < num; ++i) + ethtool_sprintf(p, fmt, qid, desc[i].desc); + } +} + +/* qid == -1: for rx/tx queue total field */ +static void virtnet_get_stats_string(struct virtnet_info *vi, int type, int qid, u8 **data) +{ + const struct virtnet_stat_desc *desc; + const char *fmt, *noq_fmt; + u8 *p = *data; + u32 num; + + if (type == VIRTNET_Q_TYPE_CQ && qid >= 0) { + noq_fmt = "cq_hw_%s"; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { + desc = &virtnet_stats_cvq_desc[0]; + num = ARRAY_SIZE(virtnet_stats_cvq_desc); + + virtnet_stats_sprintf(&p, NULL, noq_fmt, num, -1, desc); + } + } + + if (type == VIRTNET_Q_TYPE_RX) { + fmt = "rx%u_%s"; + noq_fmt = "rx_%s"; + + desc = &virtnet_rq_stats_desc[0]; + num = ARRAY_SIZE(virtnet_rq_stats_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + + fmt = "rx%u_hw_%s"; + noq_fmt = "rx_hw_%s"; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + desc = &virtnet_stats_rx_basic_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + desc = &virtnet_stats_rx_csum_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { + desc = &virtnet_stats_rx_speed_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + } + + if (type == VIRTNET_Q_TYPE_TX) { + fmt = "tx%u_%s"; + noq_fmt = "tx_%s"; + + desc = &virtnet_sq_stats_desc[0]; + num = ARRAY_SIZE(virtnet_sq_stats_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + + fmt = "tx%u_hw_%s"; + noq_fmt = "tx_hw_%s"; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + desc = &virtnet_stats_tx_basic_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + desc = &virtnet_stats_tx_gso_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { + desc = &virtnet_stats_tx_speed_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); + + virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); + } + } + + *data = p; +} + +struct virtnet_stats_ctx { + /* The stats are write to qstats or ethtool -S */ + bool to_qstat; + + /* Used to calculate the offset inside the output buffer. */ + u32 desc_num[3]; + + /* The actual supported stat types. */ + u64 bitmap[3]; + + /* Used to calculate the reply buffer size. */ + u32 size[3]; + + /* Record the output buffer. */ + u64 *data; +}; + +static void virtnet_stats_ctx_init(struct virtnet_info *vi, + struct virtnet_stats_ctx *ctx, + u64 *data, bool to_qstat) +{ + u32 queue_type; + + ctx->data = data; + ctx->to_qstat = to_qstat; + + if (to_qstat) { + ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); + ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); + + queue_type = VIRTNET_Q_TYPE_RX; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_GSO; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_gso); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); + } + + queue_type = VIRTNET_Q_TYPE_TX; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_CSUM; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_csum); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); + } + + return; + } + + ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc); + ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc); + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { + queue_type = VIRTNET_Q_TYPE_CQ; + + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_CVQ; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_cvq_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_cvq); + } + + queue_type = VIRTNET_Q_TYPE_RX; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); + } + + queue_type = VIRTNET_Q_TYPE_TX; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { + ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; + ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc); + ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); + } +} + +/* stats_sum_queue - Calculate the sum of the same fields in sq or rq. + * @sum: the position to store the sum values + * @num: field num + * @q_value: the first queue fields + * @q_num: number of the queues + */ +static void stats_sum_queue(u64 *sum, u32 num, u64 *q_value, u32 q_num) +{ + u32 step = num; + int i, j; + u64 *p; + + for (i = 0; i < num; ++i) { + p = sum + i; + *p = 0; + + for (j = 0; j < q_num; ++j) + *p += *(q_value + i + j * step); + } +} + +static void virtnet_fill_total_fields(struct virtnet_info *vi, + struct virtnet_stats_ctx *ctx) +{ + u64 *data, *first_rx_q, *first_tx_q; + u32 num_cq, num_rx, num_tx; + + num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; + num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; + num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; + + first_rx_q = ctx->data + num_rx + num_tx + num_cq; + first_tx_q = first_rx_q + vi->curr_queue_pairs * num_rx; + + data = ctx->data; + + stats_sum_queue(data, num_rx, first_rx_q, vi->curr_queue_pairs); + + data = ctx->data + num_rx; + + stats_sum_queue(data, num_tx, first_tx_q, vi->curr_queue_pairs); +} + +static void virtnet_fill_stats_qstat(struct virtnet_info *vi, u32 qid, + struct virtnet_stats_ctx *ctx, + const u8 *base, bool drv_stats, u8 reply_type) +{ + const struct virtnet_stat_desc *desc; + const u64_stats_t *v_stat; + u64 offset, bitmap; + const __le64 *v; + u32 queue_type; + int i, num; + + queue_type = vq_type(vi, qid); + bitmap = ctx->bitmap[queue_type]; + + if (drv_stats) { + if (queue_type == VIRTNET_Q_TYPE_RX) { + desc = &virtnet_rq_stats_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); + } else { + desc = &virtnet_sq_stats_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); + } + + for (i = 0; i < num; ++i) { + offset = desc[i].qstat_offset / sizeof(*ctx->data); + v_stat = (const u64_stats_t *)(base + desc[i].offset); + ctx->data[offset] = u64_stats_read(v_stat); + } + return; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + desc = &virtnet_stats_rx_basic_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + desc = &virtnet_stats_rx_csum_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_GSO) { + desc = &virtnet_stats_rx_gso_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_GSO) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { + desc = &virtnet_stats_rx_speed_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + desc = &virtnet_stats_tx_basic_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { + desc = &virtnet_stats_tx_csum_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_CSUM) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + desc = &virtnet_stats_tx_gso_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) + goto found; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { + desc = &virtnet_stats_tx_speed_desc_qstat[0]; + num = ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) + goto found; + } + + return; + +found: + for (i = 0; i < num; ++i) { + offset = desc[i].qstat_offset / sizeof(*ctx->data); + v = (const __le64 *)(base + desc[i].offset); + ctx->data[offset] = le64_to_cpu(*v); + } +} + +/* virtnet_fill_stats - copy the stats to qstats or ethtool -S + * The stats source is the device or the driver. + * + * @vi: virtio net info + * @qid: the vq id + * @ctx: stats ctx (initiated by virtnet_stats_ctx_init()) + * @base: pointer to the device reply or the driver stats structure. + * @drv_stats: designate the base type (device reply, driver stats) + * @type: the type of the device reply (if drv_stats is true, this must be zero) + */ +static void virtnet_fill_stats(struct virtnet_info *vi, u32 qid, + struct virtnet_stats_ctx *ctx, + const u8 *base, bool drv_stats, u8 reply_type) +{ + u32 queue_type, num_rx, num_tx, num_cq; + const struct virtnet_stat_desc *desc; + const u64_stats_t *v_stat; + u64 offset, bitmap; + const __le64 *v; + int i, num; + + if (ctx->to_qstat) + return virtnet_fill_stats_qstat(vi, qid, ctx, base, drv_stats, reply_type); + + num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; + num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; + num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; + + queue_type = vq_type(vi, qid); + bitmap = ctx->bitmap[queue_type]; + + /* skip the total fields of pairs */ + offset = num_rx + num_tx; + + if (queue_type == VIRTNET_Q_TYPE_TX) { + offset += num_cq + num_rx * vi->curr_queue_pairs + num_tx * (qid / 2); + + num = ARRAY_SIZE(virtnet_sq_stats_desc); + if (drv_stats) { + desc = &virtnet_sq_stats_desc[0]; + goto drv_stats; + } + + offset += num; + + } else if (queue_type == VIRTNET_Q_TYPE_RX) { + offset += num_cq + num_rx * (qid / 2); + + num = ARRAY_SIZE(virtnet_rq_stats_desc); + if (drv_stats) { + desc = &virtnet_rq_stats_desc[0]; + goto drv_stats; + } + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_CVQ) { + desc = &virtnet_stats_cvq_desc[0]; + num = ARRAY_SIZE(virtnet_stats_cvq_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_CVQ) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + desc = &virtnet_stats_rx_basic_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + desc = &virtnet_stats_rx_csum_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { + desc = &virtnet_stats_rx_speed_desc[0]; + num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + desc = &virtnet_stats_tx_basic_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + desc = &virtnet_stats_tx_gso_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) + goto found; + + offset += num; + } + + if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { + desc = &virtnet_stats_tx_speed_desc[0]; + num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); + if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) + goto found; + + offset += num; + } + + return; + +found: + for (i = 0; i < num; ++i) { + v = (const __le64 *)(base + desc[i].offset); + ctx->data[offset + i] = le64_to_cpu(*v); + } + + return; + +drv_stats: + for (i = 0; i < num; ++i) { + v_stat = (const u64_stats_t *)(base + desc[i].offset); + ctx->data[offset + i] = u64_stats_read(v_stat); + } +} + +static int __virtnet_get_hw_stats(struct virtnet_info *vi, + struct virtnet_stats_ctx *ctx, + struct virtio_net_ctrl_queue_stats *req, + int req_size, void *reply, int res_size) +{ + struct virtio_net_stats_reply_hdr *hdr; + struct scatterlist sgs_in, sgs_out; + void *p; + u32 qid; + int ok; + + sg_init_one(&sgs_out, req, req_size); + sg_init_one(&sgs_in, reply, res_size); + + ok = virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, + VIRTIO_NET_CTRL_STATS_GET, + &sgs_out, &sgs_in); + + if (!ok) + return ok; + + for (p = reply; p - reply < res_size; p += le16_to_cpu(hdr->size)) { + hdr = p; + qid = le16_to_cpu(hdr->vq_index); + virtnet_fill_stats(vi, qid, ctx, p, false, hdr->type); + } + + return 0; +} + +static void virtnet_make_stat_req(struct virtnet_info *vi, + struct virtnet_stats_ctx *ctx, + struct virtio_net_ctrl_queue_stats *req, + int qid, int *idx) +{ + int qtype = vq_type(vi, qid); + u64 bitmap = ctx->bitmap[qtype]; + + if (!bitmap) + return; + + req->stats[*idx].vq_index = cpu_to_le16(qid); + req->stats[*idx].types_bitmap[0] = cpu_to_le64(bitmap); + *idx += 1; +} + +/* qid: -1: get stats of all vq. + * > 0: get the stats for the special vq. This must not be cvq. + */ +static int virtnet_get_hw_stats(struct virtnet_info *vi, + struct virtnet_stats_ctx *ctx, int qid) +{ + int qnum, i, j, res_size, qtype, last_vq, first_vq; + struct virtio_net_ctrl_queue_stats *req; + bool enable_cvq; + void *reply; + int ok; + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) + return 0; + + if (qid == -1) { + last_vq = vi->curr_queue_pairs * 2 - 1; + first_vq = 0; + enable_cvq = true; + } else { + last_vq = qid; + first_vq = qid; + enable_cvq = false; + } + + qnum = 0; + res_size = 0; + for (i = first_vq; i <= last_vq ; ++i) { + qtype = vq_type(vi, i); + if (ctx->bitmap[qtype]) { + ++qnum; + res_size += ctx->size[qtype]; + } + } + + if (enable_cvq && ctx->bitmap[VIRTNET_Q_TYPE_CQ]) { + res_size += ctx->size[VIRTNET_Q_TYPE_CQ]; + qnum += 1; + } + + req = kcalloc(qnum, sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + reply = kmalloc(res_size, GFP_KERNEL); + if (!reply) { + kfree(req); + return -ENOMEM; + } + + j = 0; + for (i = first_vq; i <= last_vq ; ++i) + virtnet_make_stat_req(vi, ctx, req, i, &j); + + if (enable_cvq) + virtnet_make_stat_req(vi, ctx, req, vi->max_queue_pairs * 2, &j); + + ok = __virtnet_get_hw_stats(vi, ctx, req, sizeof(*req) * j, reply, res_size); + + kfree(req); + kfree(reply); + + return ok; +} + static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) { struct virtnet_info *vi = netdev_priv(dev); - unsigned int i, j; + unsigned int i; u8 *p = data; switch (stringset) { case ETH_SS_STATS: - for (i = 0; i < vi->curr_queue_pairs; i++) { - for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) - ethtool_sprintf(&p, "rx_queue_%u_%s", i, - virtnet_rq_stats_desc[j].desc); - } + /* Generate the total field names. */ + virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, -1, &p); + virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, -1, &p); - for (i = 0; i < vi->curr_queue_pairs; i++) { - for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) - ethtool_sprintf(&p, "tx_queue_%u_%s", i, - virtnet_sq_stats_desc[j].desc); - } + virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_CQ, 0, &p); + + for (i = 0; i < vi->curr_queue_pairs; ++i) + virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, i, &p); + + for (i = 0; i < vi->curr_queue_pairs; ++i) + virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, i, &p); break; } } @@ -3323,11 +5015,17 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) static int virtnet_get_sset_count(struct net_device *dev, int sset) { struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_stats_ctx ctx = {0}; + u32 pair_count; switch (sset) { case ETH_SS_STATS: - return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + - VIRTNET_SQ_STATS_LEN); + virtnet_stats_ctx_init(vi, &ctx, NULL, false); + + pair_count = ctx.desc_num[VIRTNET_Q_TYPE_RX] + ctx.desc_num[VIRTNET_Q_TYPE_TX]; + + return pair_count + ctx.desc_num[VIRTNET_Q_TYPE_CQ] + + vi->curr_queue_pairs * pair_count; default: return -EOPNOTSUPP; } @@ -3337,40 +5035,32 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct virtnet_info *vi = netdev_priv(dev); - unsigned int idx = 0, start, i, j; + struct virtnet_stats_ctx ctx = {0}; + unsigned int start, i; const u8 *stats_base; - const u64_stats_t *p; - size_t offset; + + virtnet_stats_ctx_init(vi, &ctx, data, false); + if (virtnet_get_hw_stats(vi, &ctx, -1)) + dev_warn(&vi->dev->dev, "Failed to get hw stats.\n"); for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; + struct send_queue *sq = &vi->sq[i]; stats_base = (const u8 *)&rq->stats; do { start = u64_stats_fetch_begin(&rq->stats.syncp); - for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) { - offset = virtnet_rq_stats_desc[j].offset; - p = (const u64_stats_t *)(stats_base + offset); - data[idx + j] = u64_stats_read(p); - } + virtnet_fill_stats(vi, i * 2, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&rq->stats.syncp, start)); - idx += VIRTNET_RQ_STATS_LEN; - } - - for (i = 0; i < vi->curr_queue_pairs; i++) { - struct send_queue *sq = &vi->sq[i]; stats_base = (const u8 *)&sq->stats; do { start = u64_stats_fetch_begin(&sq->stats.syncp); - for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) { - offset = virtnet_sq_stats_desc[j].offset; - p = (const u64_stats_t *)(stats_base + offset); - data[idx + j] = u64_stats_read(p); - } + virtnet_fill_stats(vi, i * 2 + 1, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); - idx += VIRTNET_SQ_STATS_LEN; } + + virtnet_fill_total_fields(vi, &ctx); } static void virtnet_get_channels(struct net_device *dev, @@ -3410,12 +5100,17 @@ static int virtnet_get_link_ksettings(struct net_device *dev, static int virtnet_send_tx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { + struct virtio_net_ctrl_coal_tx *coal_tx __free(kfree) = NULL; struct scatterlist sgs_tx; int i; - vi->ctrl->coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); - vi->ctrl->coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); - sg_init_one(&sgs_tx, &vi->ctrl->coal_tx, sizeof(vi->ctrl->coal_tx)); + coal_tx = kzalloc(sizeof(*coal_tx), GFP_KERNEL); + if (!coal_tx) + return -ENOMEM; + + coal_tx->tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); + coal_tx->tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); + sg_init_one(&sgs_tx, coal_tx, sizeof(*coal_tx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, @@ -3435,6 +5130,7 @@ static int virtnet_send_tx_notf_coal_cmds(struct virtnet_info *vi, static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { + struct virtio_net_ctrl_coal_rx *coal_rx __free(kfree) = NULL; bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; struct scatterlist sgs_rx; int i; @@ -3448,24 +5144,34 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, if (rx_ctrl_dim_on && !vi->rx_dim_enabled) { vi->rx_dim_enabled = true; - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = true; + mutex_unlock(&vi->rq[i].dim_lock); + } return 0; } + coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); + if (!coal_rx) + return -ENOMEM; + if (!rx_ctrl_dim_on && vi->rx_dim_enabled) { vi->rx_dim_enabled = false; - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = false; + mutex_unlock(&vi->rq[i].dim_lock); + } } /* Since the per-queue coalescing params can be set, * we need apply the global new params even if they * are not updated. */ - vi->ctrl->coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); - vi->ctrl->coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); - sg_init_one(&sgs_rx, &vi->ctrl->coal_rx, sizeof(vi->ctrl->coal_rx)); + coal_rx->rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); + coal_rx->rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); + sg_init_one(&sgs_rx, coal_rx, sizeof(*coal_rx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, @@ -3475,8 +5181,10 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs; vi->intr_coal_rx.max_packets = ec->rx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { + mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].intr_coal.max_usecs = ec->rx_coalesce_usecs; vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames; + mutex_unlock(&vi->rq[i].dim_lock); } return 0; @@ -3503,19 +5211,24 @@ static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, u16 queue) { bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; - bool cur_rx_dim = vi->rq[queue].dim_enabled; u32 max_usecs, max_packets; + bool cur_rx_dim; int err; + mutex_lock(&vi->rq[queue].dim_lock); + cur_rx_dim = vi->rq[queue].dim_enabled; max_usecs = vi->rq[queue].intr_coal.max_usecs; max_packets = vi->rq[queue].intr_coal.max_packets; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != max_usecs || - ec->rx_max_coalesced_frames != max_packets)) + ec->rx_max_coalesced_frames != max_packets)) { + mutex_unlock(&vi->rq[queue].dim_lock); return -EINVAL; + } if (rx_ctrl_dim_on && !cur_rx_dim) { vi->rq[queue].dim_enabled = true; + mutex_unlock(&vi->rq[queue].dim_lock); return 0; } @@ -3528,10 +5241,8 @@ static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, queue, ec->rx_coalesce_usecs, ec->rx_max_coalesced_frames); - if (err) - return err; - - return 0; + mutex_unlock(&vi->rq[queue].dim_lock); + return err; } static int virtnet_send_notf_coal_vq_cmds(struct virtnet_info *vi, @@ -3561,39 +5272,27 @@ static void virtnet_rx_dim_work(struct work_struct *work) struct virtnet_info *vi = rq->vq->vdev->priv; struct net_device *dev = vi->dev; struct dim_cq_moder update_moder; - int i, qnum, err; + int qnum, err; - if (!rtnl_trylock()) - return; + qnum = rq - vi->rq; - /* Each rxq's work is queued by "net_dim()->schedule_work()" - * in response to NAPI traffic changes. Note that dim->profile_ix - * for each rxq is updated prior to the queuing action. - * So we only need to traverse and update profiles for all rxqs - * in the work which is holding rtnl_lock. - */ - for (i = 0; i < vi->curr_queue_pairs; i++) { - rq = &vi->rq[i]; - dim = &rq->dim; - qnum = rq - vi->rq; - - if (!rq->dim_enabled) - continue; - - update_moder = net_dim_get_rx_moderation(dim->mode, dim->profile_ix); - if (update_moder.usec != rq->intr_coal.max_usecs || - update_moder.pkts != rq->intr_coal.max_packets) { - err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, - update_moder.usec, - update_moder.pkts); - if (err) - pr_debug("%s: Failed to send dim parameters on rxq%d\n", - dev->name, qnum); - dim->state = DIM_START_MEASURE; - } - } + mutex_lock(&rq->dim_lock); + if (!rq->dim_enabled) + goto out; - rtnl_unlock(); + update_moder = net_dim_get_rx_irq_moder(dev, dim); + if (update_moder.usec != rq->intr_coal.max_usecs || + update_moder.pkts != rq->intr_coal.max_packets) { + err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, + update_moder.usec, + update_moder.pkts); + if (err) + pr_debug("%s: Failed to send dim parameters on rxq%d\n", + dev->name, qnum); + } +out: + dim->state = DIM_START_MEASURE; + mutex_unlock(&rq->dim_lock); } static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) @@ -3629,7 +5328,7 @@ static int virtnet_set_coalesce(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); - int ret, queue_number, napi_weight; + int ret, queue_number, napi_weight, i; bool update_napi = false; /* Can't change NAPI weight if the link is up */ @@ -3658,6 +5357,14 @@ static int virtnet_set_coalesce(struct net_device *dev, return ret; if (update_napi) { + /* xsk xmit depends on the tx napi. So if xsk is active, + * prevent modifications to tx napi. + */ + for (i = queue_number; i < vi->max_queue_pairs; i++) { + if (vi->sq[i].xsk_pool) + return -EBUSY; + } + for (; queue_number < vi->max_queue_pairs; queue_number++) vi->sq[queue_number].napi.weight = napi_weight; } @@ -3731,11 +5438,13 @@ static int virtnet_get_per_queue_coalesce(struct net_device *dev, return -EINVAL; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { + mutex_lock(&vi->rq[queue].dim_lock); ec->rx_coalesce_usecs = vi->rq[queue].intr_coal.max_usecs; ec->tx_coalesce_usecs = vi->sq[queue].intr_coal.max_usecs; ec->tx_max_coalesced_frames = vi->sq[queue].intr_coal.max_packets; ec->rx_max_coalesced_frames = vi->rq[queue].intr_coal.max_packets; ec->use_adaptive_rx_coalesce = vi->rq[queue].dim_enabled; + mutex_unlock(&vi->rq[queue].dim_lock); } else { ec->rx_max_coalesced_frames = 1; @@ -3754,25 +5463,6 @@ static void virtnet_init_settings(struct net_device *dev) vi->duplex = DUPLEX_UNKNOWN; } -static void virtnet_update_settings(struct virtnet_info *vi) -{ - u32 speed; - u8 duplex; - - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) - return; - - virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); - - if (ethtool_validate_speed(speed)) - vi->speed = speed; - - virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); - - if (ethtool_validate_duplex(duplex)) - vi->duplex = duplex; -} - static u32 virtnet_get_rxfh_key_size(struct net_device *dev) { return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size; @@ -3791,11 +5481,11 @@ static int virtnet_get_rxfh(struct net_device *dev, if (rxfh->indir) { for (i = 0; i < vi->rss_indir_table_size; ++i) - rxfh->indir[i] = vi->ctrl->rss.indirection_table[i]; + rxfh->indir[i] = le16_to_cpu(vi->rss_hdr->indirection_table[i]); } if (rxfh->key) - memcpy(rxfh->key, vi->ctrl->rss.key, vi->rss_key_size); + memcpy(rxfh->key, vi->rss_hash_key_data, vi->rss_key_size); rxfh->hfunc = ETH_RSS_HASH_TOP; @@ -3819,7 +5509,7 @@ static int virtnet_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; for (i = 0; i < vi->rss_indir_table_size; ++i) - vi->ctrl->rss.indirection_table[i] = rxfh->indir[i]; + vi->rss_hdr->indirection_table[i] = cpu_to_le16(rxfh->indir[i]); update = true; } @@ -3831,7 +5521,7 @@ static int virtnet_set_rxfh(struct net_device *dev, if (!vi->has_rss && !vi->has_rss_hash_report) return -EOPNOTSUPP; - memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size); + memcpy(vi->rss_hash_key_data, rxfh->key, vi->rss_key_size); update = true; } @@ -3905,6 +5595,101 @@ static const struct ethtool_ops virtnet_ethtool_ops = { .set_rxnfc = virtnet_set_rxnfc, }; +static void virtnet_get_queue_stats_rx(struct net_device *dev, int i, + struct netdev_queue_stats_rx *stats) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct receive_queue *rq = &vi->rq[i]; + struct virtnet_stats_ctx ctx = {0}; + + virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); + + virtnet_get_hw_stats(vi, &ctx, i * 2); + virtnet_fill_stats(vi, i * 2, &ctx, (void *)&rq->stats, true, 0); +} + +static void virtnet_get_queue_stats_tx(struct net_device *dev, int i, + struct netdev_queue_stats_tx *stats) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct send_queue *sq = &vi->sq[i]; + struct virtnet_stats_ctx ctx = {0}; + + virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); + + virtnet_get_hw_stats(vi, &ctx, i * 2 + 1); + virtnet_fill_stats(vi, i * 2 + 1, &ctx, (void *)&sq->stats, true, 0); +} + +static void virtnet_get_base_stats(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx) +{ + struct virtnet_info *vi = netdev_priv(dev); + + /* The queue stats of the virtio-net will not be reset. So here we + * return 0. + */ + rx->bytes = 0; + rx->packets = 0; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { + rx->hw_drops = 0; + rx->hw_drop_overruns = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { + rx->csum_unnecessary = 0; + rx->csum_none = 0; + rx->csum_bad = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { + rx->hw_gro_packets = 0; + rx->hw_gro_bytes = 0; + rx->hw_gro_wire_packets = 0; + rx->hw_gro_wire_bytes = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) + rx->hw_drop_ratelimits = 0; + + tx->bytes = 0; + tx->packets = 0; + tx->stop = 0; + tx->wake = 0; + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { + tx->hw_drops = 0; + tx->hw_drop_errors = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { + tx->csum_none = 0; + tx->needs_csum = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { + tx->hw_gso_packets = 0; + tx->hw_gso_bytes = 0; + tx->hw_gso_wire_packets = 0; + tx->hw_gso_wire_bytes = 0; + } + + if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) + tx->hw_drop_ratelimits = 0; + + netdev_stat_queue_sum(dev, + dev->real_num_rx_queues, vi->max_queue_pairs, rx, + dev->real_num_tx_queues, vi->max_queue_pairs, tx); +} + +static const struct netdev_stat_ops virtnet_stat_ops = { + .get_queue_stats_rx = virtnet_get_queue_stats_rx, + .get_queue_stats_tx = virtnet_get_queue_stats_tx, + .get_base_stats = virtnet_get_base_stats, +}; + static void virtnet_freeze_down(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; @@ -3917,8 +5702,11 @@ static void virtnet_freeze_down(struct virtio_device *vdev) netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); - if (netif_running(vi->dev)) + if (netif_running(vi->dev)) { + rtnl_lock(); virtnet_close(vi->dev); + rtnl_unlock(); + } } static int init_vqs(struct virtnet_info *vi); @@ -3938,7 +5726,9 @@ static int virtnet_restore_up(struct virtio_device *vdev) enable_rx_mode_work(vi); if (netif_running(vi->dev)) { + rtnl_lock(); err = virtnet_open(vi->dev); + rtnl_unlock(); if (err) return err; } @@ -3951,10 +5741,16 @@ static int virtnet_restore_up(struct virtio_device *vdev) static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads) { + __virtio64 *_offloads __free(kfree) = NULL; struct scatterlist sg; - vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads); - sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads)); + _offloads = kzalloc(sizeof(*_offloads), GFP_KERNEL); + if (!_offloads) + return -ENOMEM; + + *_offloads = cpu_to_virtio64(vi->vdev, offloads); + + sg_init_one(&sg, _offloads, sizeof(*_offloads)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS, VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) { @@ -3985,10 +5781,196 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi) return virtnet_set_guest_offloads(vi, offloads); } +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, + struct xsk_buff_pool *pool) +{ + int err, qindex; + + qindex = rq - vi->rq; + + if (pool) { + err = xdp_rxq_info_reg(&rq->xsk_rxq_info, vi->dev, qindex, rq->napi.napi_id); + if (err < 0) + return err; + + err = xdp_rxq_info_reg_mem_model(&rq->xsk_rxq_info, + MEM_TYPE_XSK_BUFF_POOL, NULL); + if (err < 0) + goto unreg; + + xsk_pool_set_rxq_info(pool, &rq->xsk_rxq_info); + } + + virtnet_rx_pause(vi, rq); + + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); + if (err) { + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); + + pool = NULL; + } + + rq->xsk_pool = pool; + + virtnet_rx_resume(vi, rq); + + if (pool) + return 0; + +unreg: + xdp_rxq_info_unreg(&rq->xsk_rxq_info); + return err; +} + +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, + struct send_queue *sq, + struct xsk_buff_pool *pool) +{ + int err, qindex; + + qindex = sq - vi->sq; + + virtnet_tx_pause(vi, sq); + + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); + if (err) { + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); + pool = NULL; + } + + sq->xsk_pool = pool; + + virtnet_tx_resume(vi, sq); + + return err; +} + +static int virtnet_xsk_pool_enable(struct net_device *dev, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct receive_queue *rq; + struct device *dma_dev; + struct send_queue *sq; + dma_addr_t hdr_dma; + int err, size; + + if (vi->hdr_len > xsk_pool_get_headroom(pool)) + return -EINVAL; + + /* In big_packets mode, xdp cannot work, so there is no need to + * initialize xsk of rq. + */ + if (vi->big_packets && !vi->mergeable_rx_bufs) + return -ENOENT; + + if (qid >= vi->curr_queue_pairs) + return -EINVAL; + + sq = &vi->sq[qid]; + rq = &vi->rq[qid]; + + /* xsk assumes that tx and rx must have the same dma device. The af-xdp + * may use one buffer to receive from the rx and reuse this buffer to + * send by the tx. So the dma dev of sq and rq must be the same one. + * + * But vq->dma_dev allows every vq has the respective dma dev. So I + * check the dma dev of vq and sq is the same dev. + */ + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) + return -EINVAL; + + dma_dev = virtqueue_dma_dev(rq->vq); + if (!dma_dev) + return -EINVAL; + + size = virtqueue_get_vring_size(rq->vq); + + rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL); + if (!rq->xsk_buffs) + return -ENOMEM; + + hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, + DMA_TO_DEVICE, 0); + if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) { + err = -ENOMEM; + goto err_free_buffs; + } + + err = xsk_pool_dma_map(pool, dma_dev, 0); + if (err) + goto err_xsk_map; + + err = virtnet_rq_bind_xsk_pool(vi, rq, pool); + if (err) + goto err_rq; + + err = virtnet_sq_bind_xsk_pool(vi, sq, pool); + if (err) + goto err_sq; + + /* Now, we do not support tx offload(such as tx csum), so all the tx + * virtnet hdr is zero. So all the tx packets can share a single hdr. + */ + sq->xsk_hdr_dma_addr = hdr_dma; + + return 0; + +err_sq: + virtnet_rq_bind_xsk_pool(vi, rq, NULL); +err_rq: + xsk_pool_dma_unmap(pool, 0); +err_xsk_map: + virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, + DMA_TO_DEVICE, 0); +err_free_buffs: + kvfree(rq->xsk_buffs); + return err; +} + +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct xsk_buff_pool *pool; + struct receive_queue *rq; + struct send_queue *sq; + int err; + + if (qid >= vi->curr_queue_pairs) + return -EINVAL; + + sq = &vi->sq[qid]; + rq = &vi->rq[qid]; + + pool = rq->xsk_pool; + + err = virtnet_rq_bind_xsk_pool(vi, rq, NULL); + err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL); + + xsk_pool_dma_unmap(pool, 0); + + virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, + vi->hdr_len, DMA_TO_DEVICE, 0); + kvfree(rq->xsk_buffs); + + return err; +} + +static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) +{ + if (xdp->xsk.pool) + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, + xdp->xsk.queue_id); + else + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); +} + static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { - unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + + unsigned int room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN; struct virtnet_info *vi = netdev_priv(dev); @@ -4037,12 +6019,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (prog) bpf_prog_add(prog, vi->max_queue_pairs - 1); + virtnet_rx_pause_all(vi); + /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - napi_disable(&vi->rq[i].napi); - virtnet_napi_tx_disable(&vi->sq[i].napi); - } + for (i = 0; i < vi->max_queue_pairs; i++) + virtnet_napi_tx_disable(&vi->sq[i]); } if (!prog) { @@ -4054,7 +6036,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, synchronize_net(); } - err = _virtnet_set_queues(vi, curr_qp + xdp_qp); + err = virtnet_set_queues(vi, curr_qp + xdp_qp); if (err) goto err; netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); @@ -4074,14 +6056,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, vi->xdp_enabled = false; } + virtnet_rx_resume_all(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog) bpf_prog_put(old_prog); - if (netif_running(dev)) { - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); - virtnet_napi_tx_enable(vi, vi->sq[i].vq, - &vi->sq[i].napi); - } + if (netif_running(dev)) + virtnet_napi_tx_enable(&vi->sq[i]); } return 0; @@ -4093,12 +6073,10 @@ err: rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog); } + virtnet_rx_resume_all(vi); if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); - virtnet_napi_tx_enable(vi, vi->sq[i].vq, - &vi->sq[i].napi); - } + for (i = 0; i < vi->max_queue_pairs; i++) + virtnet_napi_tx_enable(&vi->sq[i]); } if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); @@ -4110,6 +6088,8 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_SETUP_XSK_POOL: + return virtnet_xsk_pool_setup(dev, xdp); default: return -EINVAL; } @@ -4156,9 +6136,9 @@ static int virtnet_set_features(struct net_device *dev, if ((dev->features ^ features) & NETIF_F_RXHASH) { if (features & NETIF_F_RXHASH) - vi->ctrl->rss.hash_types = vi->rss_hash_types_saved; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); else - vi->ctrl->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE; + vi->rss_hdr->hash_types = cpu_to_le32(VIRTIO_NET_HASH_REPORT_NONE); if (!virtnet_commit_rss_command(vi)) return -EINVAL; @@ -4182,6 +6162,36 @@ static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start))); } +static int virtnet_init_irq_moder(struct virtnet_info *vi) +{ + u8 profile_flags = 0, coal_flags = 0; + int ret, i; + + profile_flags |= DIM_PROFILE_RX; + coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS; + ret = net_dim_init_irq_moder(vi->dev, profile_flags, coal_flags, + DIM_CQ_PERIOD_MODE_START_FROM_EQE, + 0, virtnet_rx_dim_work, NULL); + + if (ret) + return ret; + + for (i = 0; i < vi->max_queue_pairs; i++) + net_dim_setting(vi->dev, &vi->rq[i].dim, false); + + return 0; +} + +static void virtnet_free_irq_moder(struct virtnet_info *vi) +{ + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return; + + rtnl_lock(); + net_dim_free_irq_moder(vi->dev); + rtnl_unlock(); +} + static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, @@ -4194,6 +6204,7 @@ static const struct net_device_ops virtnet_netdev = { .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, + .ndo_xsk_wakeup = virtnet_xsk_wakeup, .ndo_features_check = passthru_features_check, .ndo_get_phys_port_name = virtnet_get_phys_port_name, .ndo_set_features = virtnet_set_features, @@ -4287,7 +6298,7 @@ static void free_receive_page_frags(struct virtnet_info *vi) int i; for (i = 0; i < vi->max_queue_pairs; i++) if (vi->rq[i].alloc_frag.page) { - if (vi->rq[i].do_dma && vi->rq[i].last_dma) + if (vi->rq[i].last_dma) virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0); put_page(vi->rq[i].alloc_frag.page); } @@ -4295,10 +6306,34 @@ static void free_receive_page_frags(struct virtnet_info *vi) static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { - if (!is_xdp_frame(buf)) + struct virtnet_info *vi = vq->vdev->priv; + struct send_queue *sq; + int i = vq2txq(vq); + + sq = &vi->sq[i]; + + switch (virtnet_xmit_ptr_unpack(&buf)) { + case VIRTNET_XMIT_TYPE_SKB: + case VIRTNET_XMIT_TYPE_SKB_ORPHAN: dev_kfree_skb(buf); - else - xdp_return_frame(ptr_to_xdp(buf)); + break; + + case VIRTNET_XMIT_TYPE_XDP: + xdp_return_frame(buf); + break; + + case VIRTNET_XMIT_TYPE_XSK: + xsk_tx_completed(sq->xsk_pool, 1); + break; + } +} + +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq) +{ + struct virtnet_info *vi = vq->vdev->priv; + int i = vq2txq(vq); + + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); } static void free_unused_bufs(struct virtnet_info *vi) @@ -4351,9 +6386,8 @@ static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqu static int virtnet_find_vqs(struct virtnet_info *vi) { - vq_callback_t **callbacks; + struct virtqueue_info *vqs_info; struct virtqueue **vqs; - const char **names; int ret = -ENOMEM; int total_vqs; bool *ctx; @@ -4370,12 +6404,9 @@ static int virtnet_find_vqs(struct virtnet_info *vi) vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL); if (!vqs) goto err_vq; - callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL); - if (!callbacks) - goto err_callback; - names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL); - if (!names) - goto err_names; + vqs_info = kcalloc(total_vqs, sizeof(*vqs_info), GFP_KERNEL); + if (!vqs_info) + goto err_vqs_info; if (!vi->big_packets || vi->mergeable_rx_bufs) { ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -4386,24 +6417,22 @@ static int virtnet_find_vqs(struct virtnet_info *vi) /* Parameters for control virtqueue, if any */ if (vi->has_cvq) { - callbacks[total_vqs - 1] = NULL; - names[total_vqs - 1] = "control"; + vqs_info[total_vqs - 1].name = "control"; } /* Allocate/initialize parameters for send/receive virtqueues */ for (i = 0; i < vi->max_queue_pairs; i++) { - callbacks[rxq2vq(i)] = skb_recv_done; - callbacks[txq2vq(i)] = skb_xmit_done; + vqs_info[rxq2vq(i)].callback = skb_recv_done; + vqs_info[txq2vq(i)].callback = skb_xmit_done; sprintf(vi->rq[i].name, "input.%u", i); sprintf(vi->sq[i].name, "output.%u", i); - names[rxq2vq(i)] = vi->rq[i].name; - names[txq2vq(i)] = vi->sq[i].name; + vqs_info[rxq2vq(i)].name = vi->rq[i].name; + vqs_info[txq2vq(i)].name = vi->sq[i].name; if (ctx) - ctx[rxq2vq(i)] = true; + vqs_info[rxq2vq(i)].ctx = true; } - ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks, - names, ctx, NULL); + ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, vqs_info, NULL); if (ret) goto err_find; @@ -4425,10 +6454,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi) err_find: kfree(ctx); err_ctx: - kfree(names); -err_names: - kfree(callbacks); -err_callback: + kfree(vqs_info); +err_vqs_info: kfree(vqs); err_vq: return ret; @@ -4455,21 +6482,20 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) INIT_DELAYED_WORK(&vi->refill, refill_work); for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].pages = NULL; - netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll, - napi_weight); + netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, + i); + vi->rq[i].napi.weight = napi_weight; netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); - INIT_WORK(&vi->rq[i].dim.work, virtnet_rx_dim_work); - vi->rq[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; - sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); u64_stats_init(&vi->rq[i].stats.syncp); u64_stats_init(&vi->sq[i].stats.syncp); + mutex_init(&vi->rq[i].dim_lock); } return 0; @@ -4495,8 +6521,6 @@ static int init_vqs(struct virtnet_info *vi) if (ret) goto err_free; - virtnet_rq_set_premapped(vi); - cpus_read_lock(); virtnet_set_affinity(vi); cpus_read_unlock(); @@ -4637,6 +6661,48 @@ static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu) } } +#define VIRTIO_NET_HASH_REPORT_MAX_TABLE 10 +static enum xdp_rss_hash_type +virtnet_xdp_rss_type[VIRTIO_NET_HASH_REPORT_MAX_TABLE] = { + [VIRTIO_NET_HASH_REPORT_NONE] = XDP_RSS_TYPE_NONE, + [VIRTIO_NET_HASH_REPORT_IPv4] = XDP_RSS_TYPE_L3_IPV4, + [VIRTIO_NET_HASH_REPORT_TCPv4] = XDP_RSS_TYPE_L4_IPV4_TCP, + [VIRTIO_NET_HASH_REPORT_UDPv4] = XDP_RSS_TYPE_L4_IPV4_UDP, + [VIRTIO_NET_HASH_REPORT_IPv6] = XDP_RSS_TYPE_L3_IPV6, + [VIRTIO_NET_HASH_REPORT_TCPv6] = XDP_RSS_TYPE_L4_IPV6_TCP, + [VIRTIO_NET_HASH_REPORT_UDPv6] = XDP_RSS_TYPE_L4_IPV6_UDP, + [VIRTIO_NET_HASH_REPORT_IPv6_EX] = XDP_RSS_TYPE_L3_IPV6_EX, + [VIRTIO_NET_HASH_REPORT_TCPv6_EX] = XDP_RSS_TYPE_L4_IPV6_TCP_EX, + [VIRTIO_NET_HASH_REPORT_UDPv6_EX] = XDP_RSS_TYPE_L4_IPV6_UDP_EX +}; + +static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + const struct xdp_buff *xdp = (void *)_ctx; + struct virtio_net_hdr_v1_hash *hdr_hash; + struct virtnet_info *vi; + u16 hash_report; + + if (!(xdp->rxq->dev->features & NETIF_F_RXHASH)) + return -ENODATA; + + vi = netdev_priv(xdp->rxq->dev); + hdr_hash = (struct virtio_net_hdr_v1_hash *)(xdp->data - vi->hdr_len); + hash_report = __le16_to_cpu(hdr_hash->hash_report); + + if (hash_report >= VIRTIO_NET_HASH_REPORT_MAX_TABLE) + hash_report = VIRTIO_NET_HASH_REPORT_NONE; + + *rss_type = virtnet_xdp_rss_type[hash_report]; + *hash = __le32_to_cpu(hdr_hash->hash_value); + return 0; +} + +static const struct xdp_metadata_ops virtnet_xdp_metadata_ops = { + .xmo_rx_hash = virtnet_xdp_rx_hash, +}; + static int virtnet_probe(struct virtio_device *vdev) { int i, err = -ENOMEM; @@ -4666,6 +6732,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE | IFF_TX_SKB_NO_LINEAR; dev->netdev_ops = &virtnet_netdev; + dev->stat_ops = &virtnet_stat_ops; dev->features = NETIF_F_HIGHDMA; dev->ethtool_ops = &virtnet_ethtool_ops; @@ -4698,8 +6765,16 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= dev->hw_features & NETIF_F_ALL_TSO; /* (!csum && gso) case will be fixed by register_netdev() */ } - if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) - dev->features |= NETIF_F_RXCSUM; + + /* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't + * need to calculate checksums for partially checksummed packets, + * as they're considered valid by the upper layer. + * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only + * receives fully checksummed packets. The device may assist in + * validating these packets' checksums, so the driver won't have to. + */ + dev->features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_GRO_HW; @@ -4707,7 +6782,8 @@ static int virtnet_probe(struct virtio_device *vdev) dev->hw_features |= NETIF_F_GRO_HW; dev->vlan_features = dev->features; - dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT; + dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_XSK_ZEROCOPY; /* MTU range: 68 - 65535 */ dev->min_mtu = MIN_MTU; @@ -4752,10 +6828,21 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); } + vi->rss_hdr = devm_kzalloc(&vdev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); + if (!vi->rss_hdr) { + err = -ENOMEM; + goto free; + } if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); + if (vi->rss_key_size > VIRTIO_NET_RSS_MAX_KEY_SIZE) { + dev_err(&vdev->dev, "rss_max_key_size=%u exceeds the limit %u.\n", + vi->rss_key_size, VIRTIO_NET_RSS_MAX_KEY_SIZE); + err = -EINVAL; + goto free; + } vi->rss_hash_types_supported = virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types)); @@ -4765,6 +6852,7 @@ static int virtnet_probe(struct virtio_device *vdev) VIRTIO_NET_RSS_HASH_TYPE_UDP_EX); dev->hw_features |= NETIF_F_RXHASH; + dev->xdp_metadata_ops = &virtnet_xdp_metadata_ops; } if (vi->has_rss_hash_report) @@ -4782,6 +6870,8 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) vi->has_cvq = true; + mutex_init(&vi->cvq_lock); + if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { mtu = virtio_cread16(vdev, offsetof(struct virtio_net_config, @@ -4837,6 +6927,10 @@ static int virtnet_probe(struct virtio_device *vdev) for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight) vi->sq[i].intr_coal.max_packets = 1; + + err = virtnet_init_irq_moder(vi); + if (err) + goto free; } #ifdef CONFIG_SYSFS @@ -4871,9 +6965,21 @@ static int virtnet_probe(struct virtio_device *vdev) goto free_failover; } + /* Disable config change notification until ndo_open. */ + virtio_config_driver_disable(vi->vdev); + virtio_device_ready(vdev); - _virtnet_set_queues(vi, vi->curr_queue_pairs); + if (vi->has_rss || vi->has_rss_hash_report) { + if (!virtnet_commit_rss_command(vi)) { + dev_warn(&vdev->dev, "RSS disabled because committing failed.\n"); + dev->hw_features &= ~NETIF_F_RXHASH; + vi->has_rss_hash_report = false; + vi->has_rss = false; + } + } + + virtnet_set_queues(vi, vi->curr_queue_pairs); /* a random MAC address has been assigned, notify the device. * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there @@ -4893,19 +6999,38 @@ static int virtnet_probe(struct virtio_device *vdev) } } - rtnl_unlock(); + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) { + struct virtio_net_stats_capabilities *stats_cap __free(kfree) = NULL; + struct scatterlist sg; + __le64 v; - err = virtnet_cpu_notif_add(vi); - if (err) { - pr_debug("virtio_net: registering cpu notifier failed\n"); - goto free_unregister_netdev; + stats_cap = kzalloc(sizeof(*stats_cap), GFP_KERNEL); + if (!stats_cap) { + rtnl_unlock(); + err = -ENOMEM; + goto free_unregister_netdev; + } + + sg_init_one(&sg, stats_cap, sizeof(*stats_cap)); + + if (!virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, + VIRTIO_NET_CTRL_STATS_QUERY, + NULL, &sg)) { + pr_debug("virtio_net: fail to get stats capability\n"); + rtnl_unlock(); + err = -EINVAL; + goto free_unregister_netdev; + } + + v = stats_cap->supported_stats_types[0]; + vi->device_stats_cap = le64_to_cpu(v); } /* Assume link up if device can't report link status, otherwise get link status from config. */ netif_carrier_off(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { - schedule_work(&vi->config_work); + virtnet_config_changed_work(&vi->config_work); } else { vi->status = VIRTIO_NET_S_LINK_UP; virtnet_update_settings(vi); @@ -4917,6 +7042,14 @@ static int virtnet_probe(struct virtio_device *vdev) set_bit(guest_offloads[i], &vi->guest_offloads); vi->guest_offloads_capable = vi->guest_offloads; + rtnl_unlock(); + + err = virtnet_cpu_notif_add(vi); + if (err) { + pr_debug("virtio_net: registering cpu notifier failed\n"); + goto free_unregister_netdev; + } + pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", dev->name, max_queue_pairs); @@ -4938,11 +7071,20 @@ free: static void remove_vq_common(struct virtnet_info *vi) { + int i; + virtio_reset_device(vi->vdev); /* Free unused buffers in both send and recv, if any. */ free_unused_bufs(vi); + /* + * Rule of thumb is netdev_tx_reset_queue() should follow any + * skb freeing not followed by netdev_tx_completed_queue() + */ + for (i = 0; i < vi->max_queue_pairs; i++) + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); + free_receive_bufs(vi); free_receive_page_frags(vi); @@ -4961,6 +7103,8 @@ static void virtnet_remove(struct virtio_device *vdev) disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); + virtnet_free_irq_moder(vi); + unregister_netdev(vi->dev); net_failover_destroy(vi->failover); @@ -5021,7 +7165,7 @@ static struct virtio_device_id id_table[] = { VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \ VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL, \ VIRTIO_NET_F_VQ_NOTF_COAL, \ - VIRTIO_NET_F_GUEST_HDRLEN + VIRTIO_NET_F_GUEST_HDRLEN, VIRTIO_NET_F_DEVICE_STATS static unsigned int features[] = { VIRTNET_FEATURES, @@ -5039,7 +7183,6 @@ static struct virtio_driver virtio_net_driver = { .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .validate = virtnet_validate, .probe = virtnet_probe, |