Diffstat (limited to 'drivers/net/virtio_net.c')
-rw-r--r--  drivers/net/virtio_net.c | 836
1 file changed, 528 insertions, 308 deletions
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 64c87bb48a41..1bb3aeca66c6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -35,6 +35,23 @@ module_param(csum, bool, 0444); module_param(gso, bool, 0444); module_param(napi_tx, bool, 0644); +#define VIRTIO_OFFLOAD_MAP_MIN 46 +#define VIRTIO_OFFLOAD_MAP_MAX 47 +#define VIRTIO_FEATURES_MAP_MIN 65 +#define VIRTIO_O2F_DELTA (VIRTIO_FEATURES_MAP_MIN - \ + VIRTIO_OFFLOAD_MAP_MIN) + +static bool virtio_is_mapped_offload(unsigned int obit) +{ + return obit >= VIRTIO_OFFLOAD_MAP_MIN && + obit <= VIRTIO_OFFLOAD_MAP_MAX; +} + +static unsigned int virtio_offload_to_feature(unsigned int obit) +{ + return virtio_is_mapped_offload(obit) ? obit + VIRTIO_O2F_DELTA : obit; +} + /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 @@ -62,15 +79,19 @@ static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, - VIRTIO_NET_F_GUEST_HDRLEN + VIRTIO_NET_F_GUEST_HDRLEN, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED, }; #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ - (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ - (1ULL << VIRTIO_NET_F_GUEST_UFO) | \ - (1ULL << VIRTIO_NET_F_GUEST_USO4) | \ - (1ULL << VIRTIO_NET_F_GUEST_USO6)) + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ + (1ULL << VIRTIO_NET_F_GUEST_UFO) | \ + (1ULL << VIRTIO_NET_F_GUEST_USO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_USO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED) | \ + (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED)) struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; @@ -360,24 +381,7 @@ struct receive_queue { struct xdp_buff **xsk_buffs; }; -/* This structure can contain rss message with maximum settings for indirection table and keysize - * Note, that default structure that describes RSS configuration virtio_net_rss_config - * contains same info but can't handle table values. - * In any case, structure would be passed to virtio hw through sg_buf split by parts - * because table sizes may be differ according to the device configuration. - */ #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 -struct virtio_net_ctrl_rss { - u32 hash_types; - u16 indirection_table_mask; - u16 unclassified_queue; - u16 hash_cfg_reserved; /* for HASH_CONFIG (see virtio_net_hash_config for details) */ - u16 max_tx_vq; - u8 hash_key_length; - u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; - - u16 *indirection_table; -}; /* Control VQ buffers: protected by the rtnl lock */ struct control_buf { @@ -421,7 +425,9 @@ struct virtnet_info { u16 rss_indir_table_size; u32 rss_hash_types_supported; u32 rss_hash_types_saved; - struct virtio_net_ctrl_rss rss; + struct virtio_net_rss_config_hdr *rss_hdr; + struct virtio_net_rss_config_trailer rss_trailer; + u8 rss_hash_key_data[VIRTIO_NET_RSS_MAX_KEY_SIZE]; /* Has control virtqueue */ bool has_cvq; @@ -438,6 +444,13 @@ struct virtnet_info { /* Work struct for delayed refilling if we run low on memory. */ struct delayed_work refill; + /* UDP tunnel support */ + bool tx_tnl; + + bool rx_tnl; + + bool rx_tnl_csum; + /* Is delayed refill enabled? 
*/ bool refill_enabled; @@ -497,12 +510,14 @@ struct virtio_net_common_hdr { struct virtio_net_hdr hdr; struct virtio_net_hdr_mrg_rxbuf mrg_hdr; struct virtio_net_hdr_v1_hash hash_v1_hdr; + struct virtio_net_hdr_v1_hash_tunnel tnl_hdr; }; }; static struct virtio_net_common_hdr xsk_hdr; static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq); static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, @@ -522,23 +537,16 @@ enum virtnet_xmit_type { VIRTNET_XMIT_TYPE_XSK, }; -static int rss_indirection_table_alloc(struct virtio_net_ctrl_rss *rss, u16 indir_table_size) +static size_t virtnet_rss_hdr_size(const struct virtnet_info *vi) { - if (!indir_table_size) { - rss->indirection_table = NULL; - return 0; - } + u16 indir_table_size = vi->has_rss ? vi->rss_indir_table_size : 1; - rss->indirection_table = kmalloc_array(indir_table_size, sizeof(u16), GFP_KERNEL); - if (!rss->indirection_table) - return -ENOMEM; - - return 0; + return struct_size(vi->rss_hdr, indirection_table, indir_table_size); } -static void rss_indirection_table_free(struct virtio_net_ctrl_rss *rss) +static size_t virtnet_rss_trailer_size(const struct virtnet_info *vi) { - kfree(rss->indirection_table); + return struct_size(&vi->rss_trailer, hash_key_data, vi->rss_key_size); } /* We use the last two bits of the pointer to distinguish the xmit type. */ @@ -767,10 +775,26 @@ static bool virtqueue_napi_complete(struct napi_struct *napi, return false; } +static void virtnet_tx_wake_queue(struct virtnet_info *vi, + struct send_queue *sq) +{ + unsigned int index = vq2txq(sq->vq); + struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index); + + if (netif_tx_queue_stopped(txq)) { + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.wake); + u64_stats_update_end(&sq->stats.syncp); + netif_tx_wake_queue(txq); + } +} + static void skb_xmit_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; - struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi; + unsigned int index = vq2txq(vq); + struct send_queue *sq = &vi->sq[index]; + struct napi_struct *napi = &sq->napi; /* Suppress further interrupts. */ virtqueue_disable_cb(vq); @@ -778,8 +802,7 @@ static void skb_xmit_done(struct virtqueue *vq) if (napi->weight) virtqueue_napi_schedule(napi, vq); else - /* We were probably waiting for more output buffers. */ - netif_wake_subqueue(vi->dev, vq2txq(vq)); + virtnet_tx_wake_queue(vi, sq); } #define MRG_CTX_HEADER_SHIFT 22 @@ -799,6 +822,26 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1); } +static int check_mergeable_len(struct net_device *dev, void *mrg_ctx, + unsigned int len) +{ + unsigned int headroom, tailroom, room, truesize; + + truesize = mergeable_ctx_to_truesize(mrg_ctx); + headroom = mergeable_ctx_to_headroom(mrg_ctx); + tailroom = headroom ? 
sizeof(struct skb_shared_info) : 0; + room = SKB_DATA_ALIGN(headroom + tailroom); + + if (len > truesize - room) { + pr_debug("%s: rx error: len %u exceeds truesize %lu\n", + dev->name, len, (unsigned long)(truesize - room)); + DEV_STATS_INC(dev, rx_length_errors); + return -1; + } + + return 0; +} + static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, unsigned int headroom, unsigned int len) @@ -882,17 +925,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, goto ok; } - /* - * Verify that we can indeed put this data into a skb. - * This is here to handle cases when the device erroneously - * tries to receive more than is possible. This is usually - * the case of a broken device. - */ - if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) { - net_dbg_ratelimited("%s: too much data\n", skb->dev->name); - dev_kfree_skb(skb); - return NULL; - } BUG_ON(offset >= PAGE_SIZE); while (len) { unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len); @@ -934,7 +966,7 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->need_sync && len) { offset = buf - (head + sizeof(*dma)); - virtqueue_dma_sync_single_range_for_cpu(rq->vq, dma->addr, + virtqueue_map_sync_single_range_for_cpu(rq->vq, dma->addr, offset, len, DMA_FROM_DEVICE); } @@ -942,8 +974,8 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->ref) return; - virtqueue_dma_unmap_single_attrs(rq->vq, dma->addr, dma->len, - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + virtqueue_unmap_single_attrs(rq->vq, dma->addr, dma->len, + DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); put_page(page); } @@ -1010,13 +1042,13 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) dma->len = alloc_frag->size - sizeof(*dma); - addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, - dma->len, DMA_FROM_DEVICE, 0); - if (virtqueue_dma_mapping_error(rq->vq, addr)) + addr = virtqueue_map_single_attrs(rq->vq, dma + 1, + dma->len, DMA_FROM_DEVICE, 0); + if (virtqueue_map_mapping_error(rq->vq, addr)) return NULL; dma->addr = addr; - dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); + dma->need_sync = virtqueue_map_need_sync(rq->vq, addr); /* Add a reference to dma to prevent the entire dma from * being released during error handling. This reference @@ -1087,11 +1119,10 @@ static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) return false; } -static void check_sq_full_and_disable(struct virtnet_info *vi, - struct net_device *dev, - struct send_queue *sq) +static bool tx_may_stop(struct virtnet_info *vi, + struct net_device *dev, + struct send_queue *sq) { - bool use_napi = sq->napi.weight; int qnum; qnum = sq - vi->sq; @@ -1106,20 +1137,39 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, * Since most packets only take 1 or 2 ring slots, stopping the queue * early means 16 slots are typically wasted. 
*/ - if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { + if (sq->vq->num_free < MAX_SKB_FRAGS + 2) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); netif_tx_stop_queue(txq); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.stop); u64_stats_update_end(&sq->stats.syncp); + + return true; + } + + return false; +} + +static void check_sq_full_and_disable(struct virtnet_info *vi, + struct net_device *dev, + struct send_queue *sq) +{ + bool use_napi = sq->napi.weight; + int qnum; + + qnum = sq - vi->sq; + + if (tx_may_stop(vi, dev, sq)) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); + if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ free_old_xmit(sq, txq, false); - if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { + if (sq->vq->num_free >= MAX_SKB_FRAGS + 2) { netif_start_subqueue(dev, qnum); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); @@ -1130,15 +1180,29 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, } } +/* Note that @len is the length of received data without virtio header */ static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, - struct receive_queue *rq, void *buf, u32 len) + struct receive_queue *rq, void *buf, + u32 len, bool first_buf) { struct xdp_buff *xdp; u32 bufsize; xdp = (struct xdp_buff *)buf; - bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; + /* In virtnet_add_recvbuf_xsk, we use part of XDP_PACKET_HEADROOM for + * virtio header and ask the vhost to fill data from + * hard_start + XDP_PACKET_HEADROOM - vi->hdr_len + * The first buffer has virtio header so the remaining region for frame + * data is + * xsk_pool_get_rx_frame_size() + * While other buffers than the first one do not have virtio header, so + * the maximum frame data's length can be + * xsk_pool_get_rx_frame_size() + vi->hdr_len + */ + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool); + if (!first_buf) + bufsize += vi->hdr_len; if (unlikely(len > bufsize)) { pr_debug("%s: rx error: len %u exceeds truesize %u\n", @@ -1148,7 +1212,14 @@ static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, return NULL; } - xsk_buff_set_size(xdp, len); + if (first_buf) { + xsk_buff_set_size(xdp, len); + } else { + xdp_prepare_buff(xdp, xdp->data_hard_start, + XDP_PACKET_HEADROOM - vi->hdr_len, len, 1); + xdp->flags = 0; + } + xsk_buff_dma_sync_for_cpu(xdp); return xdp; @@ -1263,7 +1334,7 @@ static int xsk_append_merge_buffer(struct virtnet_info *vi, u64_stats_add(&stats->bytes, len); - xdp = buf_to_xdp(vi, rq, buf, len); + xdp = buf_to_xdp(vi, rq, buf, len, false); if (!xdp) goto err; @@ -1273,7 +1344,7 @@ static int xsk_append_merge_buffer(struct virtnet_info *vi, goto err; } - memcpy(buf, xdp->data - vi->hdr_len, len); + memcpy(buf, xdp->data, len); xsk_buff_free(xdp); @@ -1312,9 +1383,14 @@ static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct ret = XDP_PASS; rcu_read_lock(); prog = rcu_dereference(rq->xdp_prog); - /* TODO: support multi buffer. */ - if (prog && num_buf == 1) - ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); + if (prog) { + /* TODO: support multi buffer. 
*/ + if (num_buf == 1) + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, + stats); + else + ret = XDP_ABORTED; + } rcu_read_unlock(); switch (ret) { @@ -1361,7 +1437,7 @@ static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queu u64_stats_add(&stats->bytes, len); - xdp = buf_to_xdp(vi, rq, buf, len); + xdp = buf_to_xdp(vi, rq, buf, len, true); if (!xdp) return; @@ -1800,7 +1876,8 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi) * across multiple buffers (num_buf > 1), and we make sure buffers * have enough headroom. */ -static struct page *xdp_linearize_page(struct receive_queue *rq, +static struct page *xdp_linearize_page(struct net_device *dev, + struct receive_queue *rq, int *num_buf, struct page *p, int offset, @@ -1820,18 +1897,27 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, memcpy(page_address(page) + page_off, page_address(p) + offset, *len); page_off += *len; + /* Only mergeable mode can go inside this while loop. In small mode, + * *num_buf == 1, so it cannot go inside. + */ while (--*num_buf) { unsigned int buflen; void *buf; + void *ctx; int off; - buf = virtnet_rq_get_buf(rq, &buflen, NULL); + buf = virtnet_rq_get_buf(rq, &buflen, &ctx); if (unlikely(!buf)) goto err_buf; p = virt_to_head_page(buf); off = buf - page_address(p); + if (check_mergeable_len(dev, ctx, buflen)) { + put_page(p); + goto err_buf; + } + /* guard against a misconfigured or uncooperative backend that * is sending packet larger than the MTU. */ @@ -1920,7 +2006,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev, headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - xdp_page = xdp_linearize_page(rq, &num_buf, page, + xdp_page = xdp_linearize_page(dev, rq, &num_buf, page, offset, header_offset, &tlen); if (!xdp_page) @@ -2030,9 +2116,19 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = - page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); + struct sk_buff *skb; + + /* Make sure that len does not exceed the size allocated in + * add_recvbuf_big. 
+ */ + if (unlikely(len > (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE)) { + pr_debug("%s: rx error: len %u exceeds allocated size %lu\n", + dev->name, len, + (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE); + goto err; + } + skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(!skb)) goto err; @@ -2108,10 +2204,9 @@ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - xdp_frags_truesz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + xdp_frags_truesz, + xdp_buff_get_skb_flags(xdp)); return skb; } @@ -2129,10 +2224,9 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; - unsigned int headroom, tailroom, room; - unsigned int truesize, cur_frag_size; struct skb_shared_info *shinfo; unsigned int xdp_frags_truesz = 0; + unsigned int truesize; struct page *page; skb_frag_t *frag; int offset; @@ -2175,21 +2269,14 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, page = virt_to_head_page(buf); offset = buf - page_address(page); - truesize = mergeable_ctx_to_truesize(ctx); - headroom = mergeable_ctx_to_headroom(ctx); - tailroom = headroom ? sizeof(struct skb_shared_info) : 0; - room = SKB_DATA_ALIGN(headroom + tailroom); - - cur_frag_size = truesize; - xdp_frags_truesz += cur_frag_size; - if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { + if (check_mergeable_len(dev, ctx, len)) { put_page(page); - pr_debug("%s: rx error: len %u exceeds truesize %lu\n", - dev->name, len, (unsigned long)(truesize - room)); - DEV_STATS_INC(dev, rx_length_errors); goto err; } + truesize = mergeable_ctx_to_truesize(ctx); + xdp_frags_truesz += truesize; + frag = &shinfo->frags[shinfo->nr_frags++]; skb_frag_fill_page_desc(frag, page, offset, len); if (page_is_pfmemalloc(page)) @@ -2255,7 +2342,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, */ if (!xdp_prog->aux->xdp_has_frags) { /* linearize data for XDP */ - xdp_page = xdp_linearize_page(rq, num_buf, + xdp_page = xdp_linearize_page(vi->dev, rq, num_buf, *page, offset, XDP_PACKET_HEADROOM, len); @@ -2403,18 +2490,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct sk_buff *head_skb, *curr_skb; unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); - unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; - unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); head_skb = NULL; u64_stats_add(&stats->bytes, len - vi->hdr_len); - if (unlikely(len > truesize - room)) { - pr_debug("%s: rx error: len %u exceeds truesize %lu\n", - dev->name, len, (unsigned long)(truesize - room)); - DEV_STATS_INC(dev, rx_length_errors); + if (check_mergeable_len(dev, ctx, len)) goto err_skb; - } if (unlikely(vi->xdp_enabled)) { struct bpf_prog *xdp_prog; @@ -2449,17 +2530,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); - truesize = mergeable_ctx_to_truesize(ctx); - headroom = mergeable_ctx_to_headroom(ctx); - tailroom = headroom ? 
sizeof(struct skb_shared_info) : 0; - room = SKB_DATA_ALIGN(headroom + tailroom); - if (unlikely(len > truesize - room)) { - pr_debug("%s: rx error: len %u exceeds truesize %lu\n", - dev->name, len, (unsigned long)(truesize - room)); - DEV_STATS_INC(dev, rx_length_errors); + if (check_mergeable_len(dev, ctx, len)) goto err_skb; - } + truesize = mergeable_ctx_to_truesize(ctx); curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, buf, len, truesize); if (!curr_skb) @@ -2479,6 +2553,13 @@ err_buf: return NULL; } +static inline u32 +virtio_net_hash_value(const struct virtio_net_hdr_v1_hash *hdr_hash) +{ + return __le16_to_cpu(hdr_hash->hash_value_lo) | + (__le16_to_cpu(hdr_hash->hash_value_hi) << 16); +} + static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { @@ -2505,7 +2586,7 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, default: rss_hash_type = PKT_HASH_TYPE_NONE; } - skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); + skb_set_hash(skb, virtio_net_hash_value(hdr_hash), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, @@ -2518,14 +2599,21 @@ static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue * if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); - if (flags & VIRTIO_NET_HDR_F_DATA_VALID) - skb->ip_summed = CHECKSUM_UNNECESSARY; + hdr->hdr.flags = flags; + if (virtio_net_handle_csum_offload(skb, &hdr->hdr, vi->rx_tnl_csum)) { + net_warn_ratelimited("%s: bad csum: flags: %x, gso_type: %x rx_tnl_csum %d\n", + dev->name, hdr->hdr.flags, + hdr->hdr.gso_type, vi->rx_tnl_csum); + goto frame_err; + } - if (virtio_net_hdr_to_skb(skb, &hdr->hdr, - virtio_is_little_endian(vi->vdev))) { - net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n", + if (virtio_net_hdr_tnl_to_skb(skb, &hdr->tnl_hdr, vi->rx_tnl, + vi->rx_tnl_csum, + virtio_is_little_endian(vi->vdev))) { + net_warn_ratelimited("%s: bad gso: type: %x, size: %u, flags %x tunnel %d tnl csum %d\n", dev->name, hdr->hdr.gso_type, - hdr->hdr.gso_size); + hdr->hdr.gso_size, hdr->hdr.flags, + vi->rx_tnl, vi->rx_tnl_csum); goto frame_err; } @@ -2558,22 +2646,28 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, return; } - /* 1. Save the flags early, as the XDP program might overwrite them. + /* About the flags below: + * 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. 
*/ - flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; - if (vi->mergeable_rx_bufs) + if (vi->mergeable_rx_bufs) { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); - else if (vi->big_packets) + } else if (vi->big_packets) { + void *p = page_address((struct page *)buf); + + flags = ((struct virtio_net_common_hdr *)p)->hdr.flags; skb = receive_big(dev, vi, rq, buf, len, stats); - else + } else { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); + } if (unlikely(!skb)) return; @@ -2788,7 +2882,8 @@ static void skb_recv_done(struct virtqueue *rvq) virtqueue_napi_schedule(&rq->napi, rvq); } -static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) +static void virtnet_napi_do_enable(struct virtqueue *vq, + struct napi_struct *napi) { napi_enable(napi); @@ -2801,10 +2896,21 @@ static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) local_bh_enable(); } -static void virtnet_napi_tx_enable(struct virtnet_info *vi, - struct virtqueue *vq, - struct napi_struct *napi) +static void virtnet_napi_enable(struct receive_queue *rq) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + int qidx = vq2rxq(rq->vq); + + virtnet_napi_do_enable(rq->vq, &rq->napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi); +} + +static void virtnet_napi_tx_enable(struct send_queue *sq) { + struct virtnet_info *vi = sq->vq->vdev->priv; + struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); + if (!napi->weight) return; @@ -2816,13 +2922,30 @@ static void virtnet_napi_tx_enable(struct virtnet_info *vi, return; } - return virtnet_napi_enable(vq, napi); + virtnet_napi_do_enable(sq->vq, napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi); } -static void virtnet_napi_tx_disable(struct napi_struct *napi) +static void virtnet_napi_tx_disable(struct send_queue *sq) { - if (napi->weight) + struct virtnet_info *vi = sq->vq->vdev->priv; + struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); + + if (napi->weight) { + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL); napi_disable(napi); + } +} + +static void virtnet_napi_disable(struct receive_queue *rq) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + struct napi_struct *napi = &rq->napi; + int qidx = vq2rxq(rq->vq); + + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(napi); } static void refill_work(struct work_struct *work) @@ -2835,9 +2958,23 @@ static void refill_work(struct work_struct *work) for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; + /* + * When queue API support is added in the future and the call + * below becomes napi_disable_locked, this driver will need to + * be refactored. + * + * One possible solution would be to: + * - cancel refill_work with cancel_delayed_work (note: + * non-sync) + * - cancel refill_work with cancel_delayed_work_sync in + * virtnet_remove after the netdev is unregistered + * - wrap all of the work in a lock (perhaps the netdev + * instance lock) + * - check netif_running() and return early to avoid a race + */ napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); - virtnet_napi_enable(rq->vq, &rq->napi); + virtnet_napi_do_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. 
@@ -2958,14 +3095,8 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget) free_old_xmit(sq, txq, !!budget); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { - if (netif_tx_queue_stopped(txq)) { - u64_stats_update_begin(&sq->stats.syncp); - u64_stats_inc(&sq->stats.wake); - u64_stats_update_end(&sq->stats.syncp); - } - netif_tx_wake_queue(txq); - } + if (sq->vq->num_free >= MAX_SKB_FRAGS + 2) + virtnet_tx_wake_queue(vi, sq); __netif_tx_unlock(txq); } @@ -3034,8 +3165,8 @@ static int virtnet_poll(struct napi_struct *napi, int budget) static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) { - virtnet_napi_tx_disable(&vi->sq[qp_index].napi); - napi_disable(&vi->rq[qp_index].napi); + virtnet_napi_tx_disable(&vi->sq[qp_index]); + virtnet_napi_disable(&vi->rq[qp_index]); xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); } @@ -3054,9 +3185,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) if (err < 0) goto err_xdp_reg_mem_model; - netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); - virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); + virtnet_napi_enable(&vi->rq[qp_index]); + virtnet_napi_tx_enable(&vi->sq[qp_index]); return 0; @@ -3156,14 +3286,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) else free_old_xmit(sq, txq, !!budget); - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { - if (netif_tx_queue_stopped(txq)) { - u64_stats_update_begin(&sq->stats.syncp); - u64_stats_inc(&sq->stats.wake); - u64_stats_update_end(&sq->stats.syncp); - } - netif_tx_wake_queue(txq); - } + if (sq->vq->num_free >= MAX_SKB_FRAGS + 2) + virtnet_tx_wake_queue(vi, sq); if (xsk_done >= budget) { __netif_tx_unlock(txq); @@ -3195,32 +3319,37 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) { - struct virtio_net_hdr_mrg_rxbuf *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; struct virtnet_info *vi = sq->vq->vdev->priv; + struct virtio_net_hdr_v1_hash_tunnel *hdr; int num_sg; unsigned hdr_len = vi->hdr_len; bool can_push; pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + /* Make sure it's safe to cast between formats */ + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr)); + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr.hdr)); + can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; /* Even if we can, don't push here yet as this would skew * csum_start offset below. */ if (can_push) - hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len); + hdr = (struct virtio_net_hdr_v1_hash_tunnel *)(skb->data - + hdr_len); else - hdr = &skb_vnet_common_hdr(skb)->mrg_hdr; + hdr = &skb_vnet_common_hdr(skb)->tnl_hdr; - if (virtio_net_hdr_from_skb(skb, &hdr->hdr, - virtio_is_little_endian(vi->vdev), false, - 0)) + if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl, + virtio_is_little_endian(vi->vdev), 0, + false)) return -EPROTO; if (vi->mergeable_rx_bufs) - hdr->num_buffers = 0; + hdr->hash_hdr.hdr.num_buffers = 0; sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 
1 : 2)); if (can_push) { @@ -3253,15 +3382,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) bool use_napi = sq->napi.weight; bool kick; - /* Free up any pending old buffers before queueing new ones. */ - do { - if (use_napi) - virtqueue_disable_cb(sq->vq); - + if (!use_napi) free_old_xmit(sq, txq, false); - - } while (use_napi && !xmit_more && - unlikely(!virtqueue_enable_cb_delayed(sq->vq))); + else + virtqueue_disable_cb(sq->vq); /* timestamp packet in software */ skb_tx_timestamp(skb); @@ -3287,7 +3411,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); } - check_sq_full_and_disable(vi, dev, sq); + if (use_napi) + tx_may_stop(vi, dev, sq); + else + check_sq_full_and_disable(vi, dev,sq); kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : !xmit_more || netif_xmit_stopped(txq); @@ -3299,28 +3426,81 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) } } + if (use_napi && kick && unlikely(!virtqueue_enable_cb_delayed(sq->vq))) + virtqueue_napi_schedule(&sq->napi, sq->vq); + return NETDEV_TX_OK; } -static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) +static void __virtnet_rx_pause(struct virtnet_info *vi, + struct receive_queue *rq) { bool running = netif_running(vi->dev); if (running) { - napi_disable(&rq->napi); + virtnet_napi_disable(rq); virtnet_cancel_dim(vi, &rq->dim); } } -static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) +static void virtnet_rx_pause_all(struct virtnet_info *vi) +{ + int i; + + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock. + */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + for (i = 0; i < vi->max_queue_pairs; i++) + __virtnet_rx_pause(vi, &vi->rq[i]); +} + +static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) +{ + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock. 
+ */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + __virtnet_rx_pause(vi, rq); +} + +static void __virtnet_rx_resume(struct virtnet_info *vi, + struct receive_queue *rq, + bool refill) { bool running = netif_running(vi->dev); + bool schedule_refill = false; + + if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) + schedule_refill = true; + if (running) + virtnet_napi_enable(rq); - if (!try_fill_recv(vi, rq, GFP_KERNEL)) + if (schedule_refill) schedule_delayed_work(&vi->refill, 0); +} - if (running) - virtnet_napi_enable(rq->vq, &rq->napi); +static void virtnet_rx_resume_all(struct virtnet_info *vi) +{ + int i; + + enable_delayed_refill(vi); + for (i = 0; i < vi->max_queue_pairs; i++) { + if (i < vi->curr_queue_pairs) + __virtnet_rx_resume(vi, &vi->rq[i], true); + else + __virtnet_rx_resume(vi, &vi->rq[i], false); + } +} + +static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) +{ + enable_delayed_refill(vi); + __virtnet_rx_resume(vi, rq, true); } static int virtnet_rx_resize(struct virtnet_info *vi, @@ -3332,7 +3512,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, virtnet_rx_pause(vi, rq); - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -3349,7 +3529,7 @@ static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) qindex = sq - vi->sq; if (running) - virtnet_napi_tx_disable(&sq->napi); + virtnet_napi_tx_disable(sq); txq = netdev_get_tx_queue(vi->dev, qindex); @@ -3363,6 +3543,9 @@ static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) /* Prevent the upper layer from trying to send packets. 
*/ netif_stop_subqueue(vi->dev, qindex); + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_inc(&sq->stats.stop); + u64_stats_update_end(&sq->stats.syncp); __netif_tx_unlock_bh(txq); } @@ -3379,11 +3562,11 @@ static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq) __netif_tx_lock_bh(txq); sq->reset = false; - netif_tx_wake_queue(txq); + virtnet_tx_wake_queue(vi, sq); __netif_tx_unlock_bh(txq); if (running) - virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); + virtnet_napi_tx_enable(sq); } static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, @@ -3391,11 +3574,18 @@ static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, { int qindex, err; + if (ring_num <= MAX_SKB_FRAGS + 2) { + netdev_err(vi->dev, "tx size (%d) cannot be smaller than %d\n", + ring_num, MAX_SKB_FRAGS + 2); + return -EINVAL; + } + qindex = sq - vi->sq; virtnet_tx_pause(vi, sq); - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); @@ -3575,15 +3765,16 @@ static void virtnet_rss_update_by_qpairs(struct virtnet_info *vi, u16 queue_pair for (; i < vi->rss_indir_table_size; ++i) { indir_val = ethtool_rxfh_indir_default(i, queue_pairs); - vi->rss.indirection_table[i] = indir_val; + vi->rss_hdr->indirection_table[i] = cpu_to_le16(indir_val); } - vi->rss.max_tx_vq = queue_pairs; + vi->rss_trailer.max_tx_vq = cpu_to_le16(queue_pairs); } static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) { struct virtio_net_ctrl_mq *mq __free(kfree) = NULL; - struct virtio_net_ctrl_rss old_rss; + struct virtio_net_rss_config_hdr *old_rss_hdr; + struct virtio_net_rss_config_trailer old_rss_trailer; struct net_device *dev = vi->dev; struct scatterlist sg; @@ -3594,28 +3785,32 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) * (2) no user configuration. * * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, - * the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs + * the device updates queue_pairs together with rss, so we can skip the separate queue_pairs * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. 
*/ if (vi->has_rss && !netif_is_rxfh_configured(dev)) { - memcpy(&old_rss, &vi->rss, sizeof(old_rss)); - if (rss_indirection_table_alloc(&vi->rss, vi->rss_indir_table_size)) { - vi->rss.indirection_table = old_rss.indirection_table; + old_rss_hdr = vi->rss_hdr; + old_rss_trailer = vi->rss_trailer; + vi->rss_hdr = devm_kzalloc(&dev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); + if (!vi->rss_hdr) { + vi->rss_hdr = old_rss_hdr; return -ENOMEM; } + *vi->rss_hdr = *old_rss_hdr; virtnet_rss_update_by_qpairs(vi, queue_pairs); if (!virtnet_commit_rss_command(vi)) { /* restore ctrl_rss if commit_rss_command failed */ - rss_indirection_table_free(&vi->rss); - memcpy(&vi->rss, &old_rss, sizeof(old_rss)); + devm_kfree(&dev->dev, vi->rss_hdr); + vi->rss_hdr = old_rss_hdr; + vi->rss_trailer = old_rss_trailer; dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n", queue_pairs); return -EINVAL; } - rss_indirection_table_free(&old_rss); + devm_kfree(&dev->dev, old_rss_hdr); goto succ; } @@ -3635,8 +3830,10 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) succ: vi->curr_queue_pairs = queue_pairs; /* virtnet_open() will refill when device is going to up. */ - if (dev->flags & IFF_UP) + spin_lock_bh(&vi->refill_lock); + if (dev->flags & IFF_UP && vi->refill_enabled) schedule_delayed_work(&vi->refill, 0); + spin_unlock_bh(&vi->refill_lock); return 0; } @@ -3825,7 +4022,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) cpumask_var_t mask; int stragglers; int group_size; - int i, j, cpu; + int i, start = 0, cpu; int num_cpu; int stride; @@ -3839,16 +4036,18 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); - for (j = 0; j < group_size; j++) { + for_each_online_cpu_wrap(cpu, start) { + if (!group_size--) { + start = cpu; + break; + } cpumask_set_cpu(cpu, mask); - cpu = cpumask_next_wrap(cpu, cpu_online_mask, - nr_cpu_ids, false); } + virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); @@ -4058,28 +4257,12 @@ static int virtnet_set_ringparam(struct net_device *dev, static bool virtnet_commit_rss_command(struct virtnet_info *vi) { struct net_device *dev = vi->dev; - struct scatterlist sgs[4]; - unsigned int sg_buf_size; + struct scatterlist sgs[2]; /* prepare sgs */ - sg_init_table(sgs, 4); - - sg_buf_size = offsetof(struct virtio_net_ctrl_rss, hash_cfg_reserved); - sg_set_buf(&sgs[0], &vi->rss, sg_buf_size); - - if (vi->has_rss) { - sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; - sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); - } else { - sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, sizeof(uint16_t)); - } - - sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key) - - offsetof(struct virtio_net_ctrl_rss, max_tx_vq); - sg_set_buf(&sgs[2], &vi->rss.max_tx_vq, sg_buf_size); - - sg_buf_size = vi->rss_key_size; - sg_set_buf(&sgs[3], vi->rss.key, sg_buf_size); + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], vi->rss_hdr, virtnet_rss_hdr_size(vi)); + sg_set_buf(&sgs[1], &vi->rss_trailer, virtnet_rss_trailer_size(vi)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, vi->has_rss ? 
VIRTIO_NET_CTRL_MQ_RSS_CONFIG @@ -4096,21 +4279,24 @@ err: static void virtnet_init_default_rss(struct virtnet_info *vi) { - vi->rss.hash_types = vi->rss_hash_types_supported; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_supported); vi->rss_hash_types_saved = vi->rss_hash_types_supported; - vi->rss.indirection_table_mask = vi->rss_indir_table_size - ? vi->rss_indir_table_size - 1 : 0; - vi->rss.unclassified_queue = 0; + vi->rss_hdr->indirection_table_mask = vi->rss_indir_table_size + ? cpu_to_le16(vi->rss_indir_table_size - 1) : 0; + vi->rss_hdr->unclassified_queue = 0; virtnet_rss_update_by_qpairs(vi, vi->curr_queue_pairs); - vi->rss.hash_key_length = vi->rss_key_size; + vi->rss_trailer.hash_key_length = vi->rss_key_size; - netdev_rss_key_fill(vi->rss.key, vi->rss_key_size); + netdev_rss_key_fill(vi->rss_hash_key_data, vi->rss_key_size); } -static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info) +static int virtnet_get_hashflow(struct net_device *dev, + struct ethtool_rxfh_fields *info) { + struct virtnet_info *vi = netdev_priv(dev); + info->data = 0; switch (info->flow_type) { case TCP_V4_FLOW: @@ -4159,17 +4345,22 @@ static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_r info->data = 0; break; } + + return 0; } -static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info) +static int virtnet_set_hashflow(struct net_device *dev, + const struct ethtool_rxfh_fields *info, + struct netlink_ext_ack *extack) { + struct virtnet_info *vi = netdev_priv(dev); u32 new_hashtypes = vi->rss_hash_types_saved; bool is_disable = info->data & RXH_DISCARD; bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3); /* supports only 'sd', 'sdfn' and 'r' */ if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable)) - return false; + return -EINVAL; switch (info->flow_type) { case TCP_V4_FLOW: @@ -4208,21 +4399,22 @@ static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc * break; default: /* unsupported flow */ - return false; + return -EINVAL; } /* if unsupported hashtype was set */ if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported)) - return false; + return -EINVAL; if (new_hashtypes != vi->rss_hash_types_saved) { vi->rss_hash_types_saved = new_hashtypes; - vi->rss.hash_types = vi->rss_hash_types_saved; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); if (vi->dev->features & NETIF_F_RXHASH) - return virtnet_commit_rss_command(vi); + if (!virtnet_commit_rss_command(vi)) + return -EINVAL; } - return true; + return 0; } static void virtnet_get_drvinfo(struct net_device *dev, @@ -5397,11 +5589,11 @@ static int virtnet_get_rxfh(struct net_device *dev, if (rxfh->indir) { for (i = 0; i < vi->rss_indir_table_size; ++i) - rxfh->indir[i] = vi->rss.indirection_table[i]; + rxfh->indir[i] = le16_to_cpu(vi->rss_hdr->indirection_table[i]); } if (rxfh->key) - memcpy(rxfh->key, vi->rss.key, vi->rss_key_size); + memcpy(rxfh->key, vi->rss_hash_key_data, vi->rss_key_size); rxfh->hfunc = ETH_RSS_HASH_TOP; @@ -5425,7 +5617,7 @@ static int virtnet_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; for (i = 0; i < vi->rss_indir_table_size; ++i) - vi->rss.indirection_table[i] = rxfh->indir[i]; + vi->rss_hdr->indirection_table[i] = cpu_to_le16(rxfh->indir[i]); update = true; } @@ -5437,7 +5629,7 @@ static int virtnet_set_rxfh(struct net_device *dev, if (!vi->has_rss && !vi->has_rss_hash_report) return -EOPNOTSUPP; - memcpy(vi->rss.key, 
rxfh->key, vi->rss_key_size); + memcpy(vi->rss_hash_key_data, rxfh->key, vi->rss_key_size); update = true; } @@ -5447,41 +5639,11 @@ static int virtnet_set_rxfh(struct net_device *dev, return 0; } -static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs) -{ - struct virtnet_info *vi = netdev_priv(dev); - int rc = 0; - - switch (info->cmd) { - case ETHTOOL_GRXRINGS: - info->data = vi->curr_queue_pairs; - break; - case ETHTOOL_GRXFH: - virtnet_get_hashflow(vi, info); - break; - default: - rc = -EOPNOTSUPP; - } - - return rc; -} - -static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info) +static u32 virtnet_get_rx_ring_count(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - int rc = 0; - - switch (info->cmd) { - case ETHTOOL_SRXFH: - if (!virtnet_set_hashflow(vi, info)) - rc = -EINVAL; - break; - default: - rc = -EOPNOTSUPP; - } - - return rc; + return vi->curr_queue_pairs; } static const struct ethtool_ops virtnet_ethtool_ops = { @@ -5507,8 +5669,9 @@ static const struct ethtool_ops virtnet_ethtool_ops = { .get_rxfh_indir_size = virtnet_get_rxfh_indir_size, .get_rxfh = virtnet_get_rxfh, .set_rxfh = virtnet_set_rxfh, - .get_rxnfc = virtnet_get_rxnfc, - .set_rxnfc = virtnet_set_rxnfc, + .get_rxfh_fields = virtnet_get_hashflow, + .set_rxfh_fields = virtnet_set_hashflow, + .get_rx_ring_count = virtnet_get_rx_ring_count, }; static void virtnet_get_queue_stats_rx(struct net_device *dev, int i, @@ -5594,6 +5757,10 @@ static void virtnet_get_base_stats(struct net_device *dev, if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) tx->hw_drop_ratelimits = 0; + + netdev_stat_queue_sum(dev, + dev->real_num_rx_queues, vi->max_queue_pairs, rx, + dev->real_num_tx_queues, vi->max_queue_pairs, tx); } static const struct netdev_stat_ops virtnet_stat_ops = { @@ -5611,11 +5778,15 @@ static void virtnet_freeze_down(struct virtio_device *vdev) disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); + if (netif_running(vi->dev)) { + rtnl_lock(); + virtnet_close(vi->dev); + rtnl_unlock(); + } + netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); - if (netif_running(vi->dev)) - virtnet_close(vi->dev); } static int init_vqs(struct virtnet_info *vi); @@ -5635,7 +5806,9 @@ static int virtnet_restore_up(struct virtio_device *vdev) enable_rx_mode_work(vi); if (netif_running(vi->dev)) { + rtnl_lock(); err = virtnet_open(vi->dev); + rtnl_unlock(); if (err) return err; } @@ -5710,7 +5883,7 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu virtnet_rx_pause(vi, rq); - err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); if (err) { netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -5739,7 +5912,8 @@ static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, virtnet_tx_pause(vi, sq); - err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, + virtnet_sq_free_unused_buf_done); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; @@ -5798,10 +5972,12 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, if (!rq->xsk_buffs) return -ENOMEM; - hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, - DMA_TO_DEVICE, 0); - if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) - return -ENOMEM; + hdr_dma = virtqueue_map_single_attrs(sq->vq, 
&xsk_hdr, vi->hdr_len, + DMA_TO_DEVICE, 0); + if (virtqueue_map_mapping_error(sq->vq, hdr_dma)) { + err = -ENOMEM; + goto err_free_buffs; + } err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) @@ -5827,8 +6003,10 @@ err_sq: err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: - virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, - DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, + DMA_TO_DEVICE, 0); +err_free_buffs: + kvfree(rq->xsk_buffs); return err; } @@ -5853,8 +6031,8 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) xsk_pool_dma_unmap(pool, 0); - virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, - vi->hdr_len, DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, + vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; @@ -5921,12 +6099,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (prog) bpf_prog_add(prog, vi->max_queue_pairs - 1); + virtnet_rx_pause_all(vi); + /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - napi_disable(&vi->rq[i].napi); - virtnet_napi_tx_disable(&vi->sq[i].napi); - } + for (i = 0; i < vi->max_queue_pairs; i++) + virtnet_napi_tx_disable(&vi->sq[i]); } if (!prog) { @@ -5958,14 +6136,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, vi->xdp_enabled = false; } + virtnet_rx_resume_all(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog) bpf_prog_put(old_prog); - if (netif_running(dev)) { - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); - virtnet_napi_tx_enable(vi, vi->sq[i].vq, - &vi->sq[i].napi); - } + if (netif_running(dev)) + virtnet_napi_tx_enable(&vi->sq[i]); } return 0; @@ -5977,12 +6153,10 @@ err: rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog); } + virtnet_rx_resume_all(vi); if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); - virtnet_napi_tx_enable(vi, vi->sq[i].vq, - &vi->sq[i].napi); - } + for (i = 0; i < vi->max_queue_pairs; i++) + virtnet_napi_tx_enable(&vi->sq[i]); } if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); @@ -6042,9 +6216,9 @@ static int virtnet_set_features(struct net_device *dev, if ((dev->features ^ features) & NETIF_F_RXHASH) { if (features & NETIF_F_RXHASH) - vi->rss.hash_types = vi->rss_hash_types_saved; + vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); else - vi->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE; + vi->rss_hdr->hash_types = cpu_to_le32(VIRTIO_NET_HASH_REPORT_NONE); if (!virtnet_commit_rss_command(vi)) return -EINVAL; @@ -6214,7 +6388,7 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; struct send_queue *sq; - int i = vq2rxq(vq); + int i = vq2txq(vq); sq = &vi->sq[i]; @@ -6234,6 +6408,14 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) } } +static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq) +{ + struct virtnet_info *vi = vq->vdev->priv; + int i = vq2txq(vq); + + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); +} + static void free_unused_bufs(struct virtnet_info *vi) { void *buf; @@ -6380,8 +6562,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) INIT_DELAYED_WORK(&vi->refill, refill_work); for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].pages = NULL; - netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll, - 
napi_weight); + netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, + i); + vi->rq[i].napi.weight = napi_weight; netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); @@ -6592,7 +6775,7 @@ static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; - *hash = __le32_to_cpu(hdr_hash->hash_value); + *hash = virtio_net_hash_value(hdr_hash); return 0; } @@ -6656,10 +6839,20 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO)) dev->hw_features |= NETIF_F_GSO_UDP_L4; + if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) { + dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + dev->hw_enc_features = dev->hw_features; + } + if (dev->hw_features & NETIF_F_GSO_UDP_TUNNEL && + virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM)) { + dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; + dev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; + } + dev->features |= NETIF_F_GSO_ROBUST; if (gso) - dev->features |= dev->hw_features & NETIF_F_ALL_TSO; + dev->features |= dev->hw_features; /* (!csum && gso) case will be fixed by register_netdev() */ } @@ -6725,9 +6918,11 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); } - err = rss_indirection_table_alloc(&vi->rss, vi->rss_indir_table_size); - if (err) + vi->rss_hdr = devm_kzalloc(&vdev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); + if (!vi->rss_hdr) { + err = -ENOMEM; goto free; + } if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = @@ -6750,7 +6945,10 @@ static int virtnet_probe(struct virtio_device *vdev) dev->xdp_metadata_ops = &virtnet_xdp_metadata_ops; } - if (vi->has_rss_hash_report) + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) || + virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) + vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash_tunnel); + else if (vi->has_rss_hash_report) vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash); else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) @@ -6758,6 +6956,13 @@ static int virtnet_probe(struct virtio_device *vdev) else vi->hdr_len = sizeof(struct virtio_net_hdr); + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM)) + vi->rx_tnl_csum = true; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO)) + vi->rx_tnl = true; + if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) + vi->tx_tnl = true; + if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) vi->any_header_sg = true; @@ -6925,16 +7130,20 @@ static int virtnet_probe(struct virtio_device *vdev) otherwise get link status from config. 
*/ netif_carrier_off(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { - virtnet_config_changed_work(&vi->config_work); + virtio_config_changed(vi->vdev); } else { vi->status = VIRTIO_NET_S_LINK_UP; virtnet_update_settings(vi); netif_carrier_on(dev); } - for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) - if (virtio_has_feature(vi->vdev, guest_offloads[i])) + for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) { + unsigned int fbit; + + fbit = virtio_offload_to_feature(guest_offloads[i]); + if (virtio_has_feature(vi->vdev, fbit)) set_bit(guest_offloads[i], &vi->guest_offloads); + } vi->guest_offloads_capable = vi->guest_offloads; rtnl_unlock(); @@ -6966,11 +7175,20 @@ free: static void remove_vq_common(struct virtnet_info *vi) { + int i; + virtio_reset_device(vi->vdev); /* Free unused buffers in both send and recv, if any. */ free_unused_bufs(vi); + /* + * Rule of thumb is netdev_tx_reset_queue() should follow any + * skb freeing not followed by netdev_tx_completed_queue() + */ + for (i = 0; i < vi->max_queue_pairs; i++) + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); + free_receive_bufs(vi); free_receive_page_frags(vi); @@ -6997,8 +7215,6 @@ static void virtnet_remove(struct virtio_device *vdev) remove_vq_common(vi); - rss_indirection_table_free(&vi->rss); - free_netdev(vi->dev); } @@ -7057,6 +7273,10 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTNET_FEATURES, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, }; static unsigned int features_legacy[] = { |
