summaryrefslogtreecommitdiff
path: root/drivers/net/hyperv/netvsc.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/hyperv/netvsc.c')
-rw-r--r--drivers/net/hyperv/netvsc.c643
1 files changed, 543 insertions, 100 deletions
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 922054c1d544..60a4629fe6ba 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2009, Microsoft Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Authors:
* Haiyang Zhang <haiyangz@microsoft.com>
* Hank Janssen <hjanssen@microsoft.com>
@@ -31,8 +20,10 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
+#include <linux/filter.h>
#include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
#include "hyperv_net.h"
#include "netvsc_trace.h"
@@ -41,12 +32,17 @@
* Switch the data path from the synthetic interface to the VF
* interface.
*/
-void netvsc_switch_datapath(struct net_device *ndev, bool vf)
+int netvsc_switch_datapath(struct net_device *ndev, bool vf)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
struct hv_device *dev = net_device_ctx->device_ctx;
struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
+ int ret, retry = 0;
+
+ /* Block sending traffic to VF if it's about to be gone */
+ if (!vf)
+ net_device_ctx->data_path_is_vf = vf;
memset(init_pkt, 0, sizeof(struct nvsp_message));
init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
@@ -57,12 +53,41 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
init_pkt->msg.v4_msg.active_dp.active_datapath =
NVSP_DATAPATH_SYNTHETIC;
+again:
trace_nvsp_send(ndev, init_pkt);
- vmbus_sendpacket(dev->channel, init_pkt,
+ ret = vmbus_sendpacket(dev->channel, init_pkt,
sizeof(struct nvsp_message),
- (unsigned long)init_pkt,
- VM_PKT_DATA_INBAND, 0);
+ (unsigned long)init_pkt, VM_PKT_DATA_INBAND,
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+
+ /* If failed to switch to/from VF, let data_path_is_vf stay false,
+ * so we use synthetic path to send data.
+ */
+ if (ret) {
+ if (ret != -EAGAIN) {
+ netdev_err(ndev,
+ "Unable to send sw datapath msg, err: %d\n",
+ ret);
+ return ret;
+ }
+
+ if (retry++ < RETRY_MAX) {
+ usleep_range(RETRY_US_LO, RETRY_US_HI);
+ goto again;
+ } else {
+ netdev_err(
+ ndev,
+ "Retry failed to send sw datapath msg, err: %d\n",
+ ret);
+ return ret;
+ }
+ }
+
+ wait_for_completion(&nv_dev->channel_init_wait);
+ net_device_ctx->data_path_is_vf = vf;
+
+ return 0;
}
/* Worker to setup sub channels on initial setup
@@ -84,7 +109,7 @@ static void netvsc_subchan_work(struct work_struct *w)
rdev = nvdev->extension;
if (rdev) {
- ret = rndis_set_subchannel(rdev->ndev, nvdev);
+ ret = rndis_set_subchannel(rdev->ndev, nvdev, NULL);
if (ret == 0) {
netif_device_attach(rdev->ndev);
} else {
@@ -110,6 +135,7 @@ static struct netvsc_device *alloc_net_device(void)
init_waitqueue_head(&net_device->wait_drain);
net_device->destroy = false;
+ net_device->tx_disable = true;
net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
@@ -128,12 +154,18 @@ static void free_netvsc_device(struct rcu_head *head)
int i;
kfree(nvdev->extension);
- vfree(nvdev->recv_buf);
- vfree(nvdev->send_buf);
- kfree(nvdev->send_section_map);
- for (i = 0; i < VRSS_CHANNEL_MAX; i++)
+ if (!nvdev->recv_buf_gpadl_handle.decrypted)
+ vfree(nvdev->recv_buf);
+ if (!nvdev->send_buf_gpadl_handle.decrypted)
+ vfree(nvdev->send_buf);
+ bitmap_free(nvdev->send_section_map);
+
+ for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
+ xdp_rxq_info_unreg(&nvdev->chan_table[i].xdp_rxq);
+ kfree(nvdev->chan_table[i].recv_buf);
vfree(nvdev->chan_table[i].mrc.slots);
+ }
kfree(nvdev);
}
@@ -171,7 +203,7 @@ static void netvsc_revoke_recv_buf(struct hv_device *device,
ret = vmbus_sendpacket(device->channel,
revoke_packet,
sizeof(struct nvsp_message),
- (unsigned long)revoke_packet,
+ VMBUS_RQST_ID_NO_RESPONSE,
VM_PKT_DATA_INBAND, 0);
/* If the failure is because the channel is rescinded;
* ignore the failure since we cannot send on a rescinded
@@ -221,7 +253,7 @@ static void netvsc_revoke_send_buf(struct hv_device *device,
ret = vmbus_sendpacket(device->channel,
revoke_packet,
sizeof(struct nvsp_message),
- (unsigned long)revoke_packet,
+ VMBUS_RQST_ID_NO_RESPONSE,
VM_PKT_DATA_INBAND, 0);
/* If the failure is because the channel is rescinded;
@@ -250,9 +282,9 @@ static void netvsc_teardown_recv_gpadl(struct hv_device *device,
{
int ret;
- if (net_device->recv_buf_gpadl_handle) {
+ if (net_device->recv_buf_gpadl_handle.gpadl_handle) {
ret = vmbus_teardown_gpadl(device->channel,
- net_device->recv_buf_gpadl_handle);
+ &net_device->recv_buf_gpadl_handle);
/* If we failed here, we might as well return and have a leak
* rather than continue and a bugchk
@@ -262,7 +294,6 @@ static void netvsc_teardown_recv_gpadl(struct hv_device *device,
"unable to teardown receive buffer's gpadl\n");
return;
}
- net_device->recv_buf_gpadl_handle = 0;
}
}
@@ -272,9 +303,9 @@ static void netvsc_teardown_send_gpadl(struct hv_device *device,
{
int ret;
- if (net_device->send_buf_gpadl_handle) {
+ if (net_device->send_buf_gpadl_handle.gpadl_handle) {
ret = vmbus_teardown_gpadl(device->channel,
- net_device->send_buf_gpadl_handle);
+ &net_device->send_buf_gpadl_handle);
/* If we failed here, we might as well return and have a leak
* rather than continue and a bugchk
@@ -284,7 +315,6 @@ static void netvsc_teardown_send_gpadl(struct hv_device *device,
"unable to teardown send buffer's gpadl\n");
return;
}
- net_device->send_buf_gpadl_handle = 0;
}
}
@@ -310,8 +340,7 @@ static int netvsc_init_buf(struct hv_device *device,
struct net_device *ndev = hv_get_drvdata(device);
struct nvsp_message *init_packet;
unsigned int buf_size;
- size_t map_words;
- int ret = 0;
+ int i, ret = 0;
/* Get receive buffer area. */
buf_size = device_info->recv_sections * device_info->recv_section_size;
@@ -352,7 +381,7 @@ static int netvsc_init_buf(struct hv_device *device,
memset(init_packet, 0, sizeof(struct nvsp_message));
init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_RECV_BUF;
init_packet->msg.v1_msg.send_recv_buf.
- gpadl_handle = net_device->recv_buf_gpadl_handle;
+ gpadl_handle = net_device->recv_buf_gpadl_handle.gpadl_handle;
init_packet->msg.v1_msg.
send_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;
@@ -396,10 +425,30 @@ static int netvsc_init_buf(struct hv_device *device,
net_device->recv_section_size = resp->sections[0].sub_alloc_size;
net_device->recv_section_cnt = resp->sections[0].num_sub_allocs;
- /* Setup receive completion ring */
- net_device->recv_completion_cnt
- = round_up(net_device->recv_section_cnt + 1,
- PAGE_SIZE / sizeof(u64));
+ /* Ensure buffer will not overflow */
+ if (net_device->recv_section_size < NETVSC_MTU_MIN || (u64)net_device->recv_section_size *
+ (u64)net_device->recv_section_cnt > (u64)buf_size) {
+ netdev_err(ndev, "invalid recv_section_size %u\n",
+ net_device->recv_section_size);
+ ret = -EINVAL;
+ goto cleanup;
+ }
+
+ for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
+ struct netvsc_channel *nvchan = &net_device->chan_table[i];
+
+ nvchan->recv_buf = kzalloc(net_device->recv_section_size, GFP_KERNEL);
+ if (nvchan->recv_buf == NULL) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ }
+
+ /* Setup receive completion ring.
+ * Add 1 to the recv_section_cnt because at least one entry in a
+ * ring buffer has to be empty.
+ */
+ net_device->recv_completion_cnt = net_device->recv_section_cnt + 1;
ret = netvsc_alloc_recv_comp_ring(net_device, 0);
if (ret)
goto cleanup;
@@ -415,6 +464,7 @@ static int netvsc_init_buf(struct hv_device *device,
ret = -ENOMEM;
goto cleanup;
}
+ net_device->send_buf_size = buf_size;
/* Establish the gpadl handle for this buffer on this
* channel. Note: This call uses the vmbus connection rather
@@ -434,7 +484,7 @@ static int netvsc_init_buf(struct hv_device *device,
memset(init_packet, 0, sizeof(struct nvsp_message));
init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_SEND_BUF;
init_packet->msg.v1_msg.send_send_buf.gpadl_handle =
- net_device->send_buf_gpadl_handle;
+ net_device->send_buf_gpadl_handle.gpadl_handle;
init_packet->msg.v1_msg.send_send_buf.id = NETVSC_SEND_BUFFER_ID;
trace_nvsp_send(ndev, init_packet);
@@ -467,6 +517,12 @@ static int netvsc_init_buf(struct hv_device *device,
/* Parse the response */
net_device->send_section_size = init_packet->msg.
v1_msg.send_send_buf_complete.section_size;
+ if (net_device->send_section_size < NETVSC_MTU_MIN) {
+ netdev_err(ndev, "invalid send_section_size %u\n",
+ net_device->send_section_size);
+ ret = -EINVAL;
+ goto cleanup;
+ }
/* Section count is simply the size divided by the section size. */
net_device->send_section_cnt = buf_size / net_device->send_section_size;
@@ -475,10 +531,9 @@ static int netvsc_init_buf(struct hv_device *device,
net_device->send_section_size, net_device->send_section_cnt);
/* Setup state for managing the send buffer. */
- map_words = DIV_ROUND_UP(net_device->send_section_cnt, BITS_PER_LONG);
-
- net_device->send_section_map = kcalloc(map_words, sizeof(ulong), GFP_KERNEL);
- if (net_device->send_section_map == NULL) {
+ net_device->send_section_map = bitmap_zalloc(net_device->send_section_cnt,
+ GFP_KERNEL);
+ if (!net_device->send_section_map) {
ret = -ENOMEM;
goto cleanup;
}
@@ -536,7 +591,10 @@ static int negotiate_nvsp_ver(struct hv_device *device,
init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
- init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
+ if (hv_is_isolation_supported())
+ netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n");
+ else
+ init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
/* Teaming bit is needed to receive link speed updates */
init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
@@ -549,7 +607,7 @@ static int negotiate_nvsp_ver(struct hv_device *device,
ret = vmbus_sendpacket(device->channel, init_packet,
sizeof(struct nvsp_message),
- (unsigned long)init_packet,
+ VMBUS_RQST_ID_NO_RESPONSE,
VM_PKT_DATA_INBAND, 0);
return ret;
@@ -583,6 +641,13 @@ static int netvsc_connect_vsp(struct hv_device *device,
goto cleanup;
}
+ if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
+ netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
+ net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
+ ret = -EPROTO;
+ goto cleanup;
+ }
+
pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);
/* Send the ndis version */
@@ -606,7 +671,7 @@ static int netvsc_connect_vsp(struct hv_device *device,
/* Send the init request */
ret = vmbus_sendpacket(device->channel, init_packet,
sizeof(struct nvsp_message),
- (unsigned long)init_packet,
+ VMBUS_RQST_ID_NO_RESPONSE,
VM_PKT_DATA_INBAND, 0);
if (ret != 0)
goto cleanup;
@@ -643,9 +708,20 @@ void netvsc_device_remove(struct hv_device *device)
RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
- /* And disassociate NAPI context from device */
- for (i = 0; i < net_device->num_chn; i++)
+ /* Disable NAPI and disassociate its context from the device. */
+ for (i = 0; i < net_device->num_chn; i++) {
+ /* See also vmbus_reset_channel_cb(). */
+ /* only disable enabled NAPI channel */
+ if (i < ndev->real_num_rx_queues) {
+ netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_TX,
+ NULL);
+ netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_RX,
+ NULL);
+ napi_disable(&net_device->chan_table[i].napi);
+ }
+
netif_napi_del(&net_device->chan_table[i].napi);
+ }
/*
* At this point, no one should be accessing net_device
@@ -684,17 +760,26 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
const struct vmpacket_descriptor *desc,
int budget)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id;
struct net_device_context *ndev_ctx = netdev_priv(ndev);
+ struct sk_buff *skb;
u16 q_idx = 0;
int queue_sends;
+ u64 cmd_rqst;
+
+ cmd_rqst = channel->request_addr_callback(channel, desc->trans_id);
+ if (cmd_rqst == VMBUS_RQST_ERROR) {
+ netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
+ return;
+ }
+
+ skb = (struct sk_buff *)(unsigned long)cmd_rqst;
/* Notify the layer above us */
if (likely(skb)) {
- const struct hv_netvsc_packet *packet
+ struct hv_netvsc_packet *packet
= (struct hv_netvsc_packet *)skb->cb;
u32 send_index = packet->send_buf_index;
- struct netvsc_stats *tx_stats;
+ struct netvsc_stats_tx *tx_stats;
if (send_index != NETVSC_INVALID_INDEX)
netvsc_free_send_slot(net_device, send_index);
@@ -707,6 +792,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
tx_stats->bytes += packet->total_bytes;
u64_stats_update_end(&tx_stats->syncp);
+ netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
napi_consume_skb(skb, budget);
}
@@ -719,7 +805,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
} else {
struct netdev_queue *txq = netdev_get_tx_queue(ndev, q_idx);
- if (netif_tx_queue_stopped(txq) &&
+ if (netif_tx_queue_stopped(txq) && !net_device->tx_disable &&
(hv_get_avail_to_write_percent(&channel->outbound) >
RING_AVAIL_PERCENT_HIWATER || queue_sends < 1)) {
netif_tx_wake_queue(txq);
@@ -734,29 +820,111 @@ static void netvsc_send_completion(struct net_device *ndev,
const struct vmpacket_descriptor *desc,
int budget)
{
- const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
+ const struct nvsp_message *nvsp_packet;
+ u32 msglen = hv_pkt_datalen(desc);
+ struct nvsp_message *pkt_rqst;
+ u64 cmd_rqst;
+ u32 status;
+
+ /* First check if this is a VMBUS completion without data payload */
+ if (!msglen) {
+ cmd_rqst = incoming_channel->request_addr_callback(incoming_channel,
+ desc->trans_id);
+ if (cmd_rqst == VMBUS_RQST_ERROR) {
+ netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
+ return;
+ }
+
+ pkt_rqst = (struct nvsp_message *)(uintptr_t)cmd_rqst;
+ switch (pkt_rqst->hdr.msg_type) {
+ case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
+ complete(&net_device->channel_init_wait);
+ break;
+ default:
+ netdev_err(ndev, "Unexpected VMBUS completion!!\n");
+ }
+ return;
+ }
+
+ /* Ensure packet is big enough to read header fields */
+ if (msglen < sizeof(struct nvsp_message_header)) {
+ netdev_err(ndev, "nvsp_message length too small: %u\n", msglen);
+ return;
+ }
+
+ nvsp_packet = hv_pkt_data(desc);
switch (nvsp_packet->hdr.msg_type) {
case NVSP_MSG_TYPE_INIT_COMPLETE:
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_message_init_complete)) {
+ netdev_err(ndev, "nvsp_msg length too small: %u\n",
+ msglen);
+ return;
+ }
+ break;
+
case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE:
+ if (msglen < sizeof(struct nvsp_message_header) +
+ struct_size_t(struct nvsp_1_message_send_receive_buffer_complete,
+ sections, 1)) {
+ netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
+ msglen);
+ return;
+ }
+ break;
+
case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE:
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_1_message_send_send_buffer_complete)) {
+ netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
+ msglen);
+ return;
+ }
+ break;
+
case NVSP_MSG5_TYPE_SUBCHANNEL:
- /* Copy the response back */
- memcpy(&net_device->channel_init_pkt, nvsp_packet,
- sizeof(struct nvsp_message));
- complete(&net_device->channel_init_wait);
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_5_subchannel_complete)) {
+ netdev_err(ndev, "nvsp_msg5 length too small: %u\n",
+ msglen);
+ return;
+ }
break;
case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE:
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_1_message_send_rndis_packet_complete)) {
+ if (net_ratelimit())
+ netdev_err(ndev, "nvsp_rndis_pkt_complete length too small: %u\n",
+ msglen);
+ return;
+ }
+
+ /* If status indicates an error, output a message so we know
+ * there's a problem. But process the completion anyway so the
+ * resources are released.
+ */
+ status = nvsp_packet->msg.v1_msg.send_rndis_pkt_complete.status;
+ if (status != NVSP_STAT_SUCCESS && net_ratelimit())
+ netdev_err(ndev, "nvsp_rndis_pkt_complete error status: %x\n",
+ status);
+
netvsc_send_tx_complete(ndev, net_device, incoming_channel,
desc, budget);
- break;
+ return;
default:
netdev_err(ndev,
"Unknown send completion type %d received!!\n",
nvsp_packet->hdr.msg_type);
+ return;
}
+
+ /* Copy the response back */
+ memcpy(&net_device->channel_init_pkt, nvsp_packet,
+ sizeof(struct nvsp_message));
+ complete(&net_device->channel_init_wait);
}
static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
@@ -785,8 +953,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
+ pend_size;
int i;
u32 padding = 0;
- u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
- packet->page_buf_cnt;
+ u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt;
u32 remain;
/* Add padding */
@@ -798,7 +965,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
}
for (i = 0; i < page_count; i++) {
- char *src = phys_to_virt(pb[i].pfn << PAGE_SHIFT);
+ char *src = phys_to_virt(pb[i].pfn << HV_HYP_PAGE_SHIFT);
u32 offset = pb[i].offset;
u32 len = pb[i].len;
@@ -810,6 +977,119 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
memset(dest, 0, padding);
}
+void netvsc_dma_unmap(struct hv_device *hv_dev,
+ struct hv_netvsc_packet *packet)
+{
+ int i;
+
+ if (!hv_is_isolation_supported())
+ return;
+
+ if (!packet->dma_range)
+ return;
+
+ for (i = 0; i < packet->page_buf_cnt; i++)
+ dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
+ packet->dma_range[i].mapping_size,
+ DMA_TO_DEVICE);
+
+ kfree(packet->dma_range);
+}
+
+/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
+ * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
+ * VM.
+ *
+ * In isolation VM, netvsc send buffer has been marked visible to
+ * host and so the data copied to send buffer doesn't need to use
+ * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
+ * may not be copied to send buffer and so these pages need to be
+ * mapped with swiotlb bounce buffer. netvsc_dma_map() is to do
+ * that. The pfns in the struct hv_page_buffer need to be converted
+ * to bounce buffer's pfn. The loop here is necessary because the
+ * entries in the page buffer array are not necessarily full
+ * pages of data. Each entry in the array has a separate offset and
+ * len that may be non-zero, even for entries in the middle of the
+ * array. And the entries are not physically contiguous. So each
+ * entry must be individually mapped rather than as a contiguous unit.
+ * So not use dma_map_sg() here.
+ */
+static int netvsc_dma_map(struct hv_device *hv_dev,
+ struct hv_netvsc_packet *packet,
+ struct hv_page_buffer *pb)
+{
+ u32 page_count = packet->page_buf_cnt;
+ dma_addr_t dma;
+ int i;
+
+ if (!hv_is_isolation_supported())
+ return 0;
+
+ packet->dma_range = kcalloc(page_count,
+ sizeof(*packet->dma_range),
+ GFP_ATOMIC);
+ if (!packet->dma_range)
+ return -ENOMEM;
+
+ for (i = 0; i < page_count; i++) {
+ char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
+ + pb[i].offset);
+ u32 len = pb[i].len;
+
+ dma = dma_map_single(&hv_dev->device, src, len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&hv_dev->device, dma)) {
+ kfree(packet->dma_range);
+ return -ENOMEM;
+ }
+
+ /* pb[].offset and pb[].len are not changed during dma mapping
+ * and so not reassign.
+ */
+ packet->dma_range[i].dma = dma;
+ packet->dma_range[i].mapping_size = len;
+ pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
+ }
+
+ return 0;
+}
+
+/* Build an "array" of mpb entries describing the data to be transferred
+ * over VMBus. After the desc header fields, each "array" entry is variable
+ * size, and each entry starts after the end of the previous entry. The
+ * "offset" and "len" fields for each entry imply the size of the entry.
+ *
+ * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V
+ * uses that granularity, even if the system page size of the guest is larger.
+ * Each entry in the input "pb" array must describe a contiguous range of
+ * guest physical memory so that the pfns are sequential if the range crosses
+ * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE.
+ */
+static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb,
+ u32 page_buffer_count,
+ struct vmbus_packet_mpb_array *desc,
+ u32 *desc_size)
+{
+ struct hv_mpb_array *mpb_entry = &desc->range;
+ int i, j;
+
+ for (i = 0; i < page_buffer_count; i++) {
+ u32 offset = pb[i].offset;
+ u32 len = pb[i].len;
+
+ mpb_entry->offset = offset;
+ mpb_entry->len = len;
+
+ for (j = 0; j < HVPFN_UP(offset + len); j++)
+ mpb_entry->pfn_array[j] = pb[i].pfn + j;
+
+ mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j];
+ }
+
+ desc->rangecount = page_buffer_count;
+ *desc_size = (char *)mpb_entry - (char *)desc;
+}
+
static inline int netvsc_send_pkt(
struct hv_device *device,
struct hv_netvsc_packet *packet,
@@ -830,6 +1110,7 @@ static inline int netvsc_send_pkt(
int ret;
u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);
+ memset(&nvmsg, 0, sizeof(struct nvsp_message));
nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
if (skb)
rpkt->channel_type = 0; /* 0 is RMC_DATA */
@@ -849,14 +1130,28 @@ static inline int netvsc_send_pkt(
trace_nvsp_send_pkt(ndev, out_channel, rpkt);
+ packet->dma_range = NULL;
if (packet->page_buf_cnt) {
+ struct vmbus_channel_packet_page_buffer desc;
+ u32 desc_size;
+
if (packet->cp_partial)
- pb += packet->rmsg_pgcnt;
+ pb++;
- ret = vmbus_sendpacket_pagebuffer(out_channel,
- pb, packet->page_buf_cnt,
- &nvmsg, sizeof(nvmsg),
- req_id);
+ ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
+ if (ret) {
+ ret = -EAGAIN;
+ goto exit;
+ }
+
+ netvsc_build_mpb_array(pb, packet->page_buf_cnt,
+ (struct vmbus_packet_mpb_array *)&desc,
+ &desc_size);
+ ret = vmbus_sendpacket_mpb_desc(out_channel,
+ (struct vmbus_packet_mpb_array *)&desc,
+ desc_size, &nvmsg, sizeof(nvmsg), req_id);
+ if (ret)
+ netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
} else {
ret = vmbus_sendpacket(out_channel,
&nvmsg, sizeof(nvmsg),
@@ -864,6 +1159,7 @@ static inline int netvsc_send_pkt(
VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
}
+exit:
if (ret == 0) {
atomic_inc_return(&nvchan->queue_sends);
@@ -874,11 +1170,6 @@ static inline int netvsc_send_pkt(
} else if (ret == -EAGAIN) {
netif_tx_stop_queue(txq);
ndev_ctx->eth_stats.stop_queue++;
- if (atomic_read(&nvchan->queue_sends) < 1) {
- netif_tx_wake_queue(txq);
- ndev_ctx->eth_stats.wake_queue++;
- ret = -ENOSPC;
- }
} else {
netdev_err(ndev,
"Unable to send packet pages %u len %u, ret %d\n",
@@ -886,6 +1177,15 @@ static inline int netvsc_send_pkt(
ret);
}
+ if (netif_tx_queue_stopped(txq) &&
+ atomic_read(&nvchan->queue_sends) < 1 &&
+ !net_device->tx_disable) {
+ netif_tx_wake_queue(txq);
+ ndev_ctx->eth_stats.wake_queue++;
+ if (ret == -EAGAIN)
+ ret = -ENOSPC;
+ }
+
return ret;
}
@@ -902,11 +1202,32 @@ static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
}
/* RCU already held by caller */
+/* Batching/bouncing logic is designed to attempt to optimize
+ * performance.
+ *
+ * For small, non-LSO packets we copy the packet to a send buffer
+ * which is pre-registered with the Hyper-V side. This enables the
+ * hypervisor to avoid remapping the aperture to access the packet
+ * descriptor and data.
+ *
+ * If we already started using a buffer and the netdev is transmitting
+ * a burst of packets, keep on copying into the buffer until it is
+ * full or we are done collecting a burst. If there is an existing
+ * buffer with space for the RNDIS descriptor but not the packet, copy
+ * the RNDIS descriptor to the buffer, keeping the packet in place.
+ *
+ * If we do batching and send more than one packet using a single
+ * NetVSC message, free the SKBs of the packets copied, except for the
+ * last packet. This is done to streamline the handling of the case
+ * where the last packet only had the RNDIS descriptor copied to the
+ * send buffer, with the data pointers included in the NetVSC message.
+ */
int netvsc_send(struct net_device *ndev,
struct hv_netvsc_packet *packet,
struct rndis_message *rndis_msg,
struct hv_page_buffer *pb,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ bool xdp_tx)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
struct netvsc_device *net_device
@@ -929,10 +1250,11 @@ int netvsc_send(struct net_device *ndev,
packet->send_buf_index = NETVSC_INVALID_INDEX;
packet->cp_partial = false;
- /* Send control message directly without accessing msd (Multi-Send
- * Data) field which may be changed during data packet processing.
+ /* Send a control message or XDP packet directly without accessing
+ * msd (Multi-Send Data) field which may be changed during data packet
+ * processing.
*/
- if (!skb)
+ if (!skb || xdp_tx)
return netvsc_send_pkt(device, packet, net_device, pb, skb);
/* batch packets in send buffer if possible */
@@ -964,7 +1286,7 @@ int netvsc_send(struct net_device *ndev,
/* Keep aggregating only if stack says more data is coming
* and not doing mixed modes send and not flow blocked
*/
- xmit_more = skb->xmit_more &&
+ xmit_more = netdev_xmit_more() &&
!packet->cp_partial &&
!netif_xmit_stopped(netdev_get_tx_queue(ndev, packet->q_idx));
@@ -976,7 +1298,7 @@ int netvsc_send(struct net_device *ndev,
packet->send_buf_index = section_index;
if (packet->cp_partial) {
- packet->page_buf_cnt -= packet->rmsg_pgcnt;
+ packet->page_buf_cnt--;
packet->total_data_buflen = msd_len + packet->rmsg_size;
} else {
packet->page_buf_cnt = 0;
@@ -1115,19 +1437,28 @@ static void enq_receive_complete(struct net_device *ndev,
static int netvsc_receive(struct net_device *ndev,
struct netvsc_device *net_device,
struct netvsc_channel *nvchan,
- const struct vmpacket_descriptor *desc,
- const struct nvsp_message *nvsp)
+ const struct vmpacket_descriptor *desc)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
struct vmbus_channel *channel = nvchan->channel;
const struct vmtransfer_page_packet_header *vmxferpage_packet
= container_of(desc, const struct vmtransfer_page_packet_header, d);
+ const struct nvsp_message *nvsp = hv_pkt_data(desc);
+ u32 msglen = hv_pkt_datalen(desc);
u16 q_idx = channel->offermsg.offer.sub_channel_index;
char *recv_buf = net_device->recv_buf;
u32 status = NVSP_STAT_SUCCESS;
int i;
int count = 0;
+ /* Ensure packet is big enough to read header fields */
+ if (msglen < sizeof(struct nvsp_message_header)) {
+ netif_err(net_device_ctx, rx_err, ndev,
+ "invalid nvsp header, length too small: %u\n",
+ msglen);
+ return 0;
+ }
+
/* Make sure this is a valid nvsp packet */
if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
netif_err(net_device_ctx, rx_err, ndev,
@@ -1136,6 +1467,14 @@ static int netvsc_receive(struct net_device *ndev,
return 0;
}
+ /* Validate xfer page pkt header */
+ if ((desc->offset8 << 3) < sizeof(struct vmtransfer_page_packet_header)) {
+ netif_err(net_device_ctx, rx_err, ndev,
+ "Invalid xfer page pkt, offset too small: %u\n",
+ desc->offset8 << 3);
+ return 0;
+ }
+
if (unlikely(vmxferpage_packet->xfer_pageset_id != NETVSC_RECEIVE_BUFFER_ID)) {
netif_err(net_device_ctx, rx_err, ndev,
"Invalid xfer page set id - expecting %x got %x\n",
@@ -1146,6 +1485,14 @@ static int netvsc_receive(struct net_device *ndev,
count = vmxferpage_packet->range_cnt;
+ /* Check count for a valid value */
+ if (NETVSC_XFER_HEADER_SIZE(count) > desc->offset8 << 3) {
+ netif_err(net_device_ctx, rx_err, ndev,
+ "Range count is not valid: %d\n",
+ count);
+ return 0;
+ }
+
/* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
for (i = 0; i < count; i++) {
u32 offset = vmxferpage_packet->ranges[i].byte_offset;
@@ -1153,7 +1500,8 @@ static int netvsc_receive(struct net_device *ndev,
void *data;
int ret;
- if (unlikely(offset + buflen > net_device->recv_buf_size)) {
+ if (unlikely(offset > net_device->recv_buf_size ||
+ buflen > net_device->recv_buf_size - offset)) {
nvchan->rsc.cnt = 0;
status = NVSP_STAT_FAIL;
netif_err(net_device_ctx, rx_err, ndev,
@@ -1163,6 +1511,19 @@ static int netvsc_receive(struct net_device *ndev,
continue;
}
+ /* We're going to copy (sections of) the packet into nvchan->recv_buf;
+ * make sure that nvchan->recv_buf is large enough to hold the packet.
+ */
+ if (unlikely(buflen > net_device->recv_section_size)) {
+ nvchan->rsc.cnt = 0;
+ status = NVSP_STAT_FAIL;
+ netif_err(net_device_ctx, rx_err, ndev,
+ "Packet too big: buflen=%u recv_section_size=%u\n",
+ buflen, net_device->recv_section_size);
+
+ continue;
+ }
+
data = recv_buf + offset;
nvchan->rsc.is_last = (i == count - 1);
@@ -1173,8 +1534,11 @@ static int netvsc_receive(struct net_device *ndev,
ret = rndis_filter_receive(ndev, net_device,
nvchan, data, buflen);
- if (unlikely(ret != NVSP_STAT_SUCCESS))
+ if (unlikely(ret != NVSP_STAT_SUCCESS)) {
+ /* Drop incomplete packet */
+ nvchan->rsc.cnt = 0;
status = NVSP_STAT_FAIL;
+ }
}
enq_receive_complete(ndev, net_device, q_idx,
@@ -1184,47 +1548,98 @@ static int netvsc_receive(struct net_device *ndev,
}
static void netvsc_send_table(struct net_device *ndev,
- const struct nvsp_message *nvmsg)
+ struct netvsc_device *nvscdev,
+ const struct nvsp_message *nvmsg,
+ u32 msglen)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
- u32 count, *tab;
+ u32 count, offset, *tab;
int i;
+ /* Ensure packet is big enough to read send_table fields */
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_5_send_indirect_table)) {
+ netdev_err(ndev, "nvsp_v5_msg length too small: %u\n", msglen);
+ return;
+ }
+
count = nvmsg->msg.v5_msg.send_table.count;
+ offset = nvmsg->msg.v5_msg.send_table.offset;
+
if (count != VRSS_SEND_TAB_SIZE) {
netdev_err(ndev, "Received wrong send-table size:%u\n", count);
return;
}
- tab = (u32 *)((unsigned long)&nvmsg->msg.v5_msg.send_table +
- nvmsg->msg.v5_msg.send_table.offset);
+ /* If negotiated version <= NVSP_PROTOCOL_VERSION_6, the offset may be
+ * wrong due to a host bug. So fix the offset here.
+ */
+ if (nvscdev->nvsp_version <= NVSP_PROTOCOL_VERSION_6 &&
+ msglen >= sizeof(struct nvsp_message_header) +
+ sizeof(union nvsp_6_message_uber) + count * sizeof(u32))
+ offset = sizeof(struct nvsp_message_header) +
+ sizeof(union nvsp_6_message_uber);
+
+ /* Boundary check for all versions */
+ if (msglen < count * sizeof(u32) || offset > msglen - count * sizeof(u32)) {
+ netdev_err(ndev, "Received send-table offset too big:%u\n",
+ offset);
+ return;
+ }
+
+ tab = (void *)nvmsg + offset;
for (i = 0; i < count; i++)
net_device_ctx->tx_table[i] = tab[i];
}
static void netvsc_send_vf(struct net_device *ndev,
- const struct nvsp_message *nvmsg)
+ const struct nvsp_message *nvmsg,
+ u32 msglen)
{
struct net_device_context *net_device_ctx = netdev_priv(ndev);
+ /* Ensure packet is big enough to read its fields */
+ if (msglen < sizeof(struct nvsp_message_header) +
+ sizeof(struct nvsp_4_send_vf_association)) {
+ netdev_err(ndev, "nvsp_v4_msg length too small: %u\n", msglen);
+ return;
+ }
+
net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
+
+ if (net_device_ctx->vf_alloc)
+ complete(&net_device_ctx->vf_add);
+
netdev_info(ndev, "VF slot %u %s\n",
net_device_ctx->vf_serial,
net_device_ctx->vf_alloc ? "added" : "removed");
}
-static void netvsc_receive_inband(struct net_device *ndev,
- const struct nvsp_message *nvmsg)
+static void netvsc_receive_inband(struct net_device *ndev,
+ struct netvsc_device *nvscdev,
+ const struct vmpacket_descriptor *desc)
{
+ const struct nvsp_message *nvmsg = hv_pkt_data(desc);
+ u32 msglen = hv_pkt_datalen(desc);
+
+ /* Ensure packet is big enough to read header fields */
+ if (msglen < sizeof(struct nvsp_message_header)) {
+ netdev_err(ndev, "inband nvsp_message length too small: %u\n", msglen);
+ return;
+ }
+
switch (nvmsg->hdr.msg_type) {
case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE:
- netvsc_send_table(ndev, nvmsg);
+ netvsc_send_table(ndev, nvscdev, nvmsg, msglen);
break;
case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
- netvsc_send_vf(ndev, nvmsg);
+ if (hv_is_isolation_supported())
+ netdev_err(ndev, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
+ else
+ netvsc_send_vf(ndev, nvmsg, msglen);
break;
}
}
@@ -1243,17 +1658,14 @@ static int netvsc_process_raw_pkt(struct hv_device *device,
switch (desc->type) {
case VM_PKT_COMP:
- netvsc_send_completion(ndev, net_device, channel,
- desc, budget);
+ netvsc_send_completion(ndev, net_device, channel, desc, budget);
break;
case VM_PKT_DATA_USING_XFER_PAGES:
- return netvsc_receive(ndev, net_device, nvchan,
- desc, nvmsg);
- break;
+ return netvsc_receive(ndev, net_device, nvchan, desc);
case VM_PKT_DATA_INBAND:
- netvsc_receive_inband(ndev, nvmsg);
+ netvsc_receive_inband(ndev, net_device, desc);
break;
default:
@@ -1291,12 +1703,17 @@ int netvsc_poll(struct napi_struct *napi, int budget)
if (!nvchan->desc)
nvchan->desc = hv_pkt_iter_first(channel);
+ nvchan->xdp_flush = false;
+
while (nvchan->desc && work_done < budget) {
work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
ndev, nvchan->desc, budget);
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
+ if (nvchan->xdp_flush)
+ xdp_do_flush();
+
/* Send any pending receive completions */
ret = send_recv_completions(ndev, net_device, nvchan);
@@ -1331,7 +1748,7 @@ void netvsc_channel_cb(void *context)
prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
if (napi_schedule_prep(&nvchan->napi)) {
- /* disable interupts from host */
+ /* disable interrupts from host */
hv_begin_read(rbi);
__napi_schedule_irqoff(&nvchan->napi);
@@ -1376,13 +1793,37 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
nvchan->net_device = net_device;
u64_stats_init(&nvchan->tx_stats.syncp);
u64_stats_init(&nvchan->rx_stats.syncp);
+
+ ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0);
+
+ if (ret) {
+ netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);
+ goto cleanup2;
+ }
+
+ ret = xdp_rxq_info_reg_mem_model(&nvchan->xdp_rxq,
+ MEM_TYPE_PAGE_SHARED, NULL);
+
+ if (ret) {
+ netdev_err(ndev, "xdp reg_mem_model fail: %d\n", ret);
+ goto cleanup2;
+ }
}
/* Enable NAPI handler before init callbacks */
- netif_napi_add(ndev, &net_device->chan_table[0].napi,
- netvsc_poll, NAPI_POLL_WEIGHT);
+ netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll);
+ napi_enable(&net_device->chan_table[0].napi);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX,
+ &net_device->chan_table[0].napi);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX,
+ &net_device->chan_table[0].napi);
/* Open the channel */
+ device->channel->next_request_id_callback = vmbus_next_request_id;
+ device->channel->request_addr_callback = vmbus_request_addr;
+ device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
+ device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+
ret = vmbus_open(device->channel, netvsc_ring_bytes,
netvsc_ring_bytes, NULL, 0,
netvsc_channel_cb, net_device->chan_table);
@@ -1395,8 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
/* Channel is opened */
netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");
- napi_enable(&net_device->chan_table[0].napi);
-
/* Connect with the NetVsp */
ret = netvsc_connect_vsp(device, net_device, device_info);
if (ret != 0) {
@@ -1414,13 +1853,17 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
close:
RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
- napi_disable(&net_device->chan_table[0].napi);
/* Now, we can close the channel safely */
vmbus_close(device->channel);
cleanup:
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL);
+ napi_disable(&net_device->chan_table[0].napi);
netif_napi_del(&net_device->chan_table[0].napi);
+
+cleanup2:
free_netvsc_device(&net_device->rcu);
return ERR_PTR(ret);