diff options
Diffstat (limited to 'drivers/net/hyperv')
| -rw-r--r-- | drivers/net/hyperv/Kconfig | 3 | ||||
| -rw-r--r-- | drivers/net/hyperv/hyperv_net.h | 23 | ||||
| -rw-r--r-- | drivers/net/hyperv/netvsc.c | 106 | ||||
| -rw-r--r-- | drivers/net/hyperv/netvsc_bpf.c | 2 | ||||
| -rw-r--r-- | drivers/net/hyperv/netvsc_drv.c | 384 | ||||
| -rw-r--r-- | drivers/net/hyperv/netvsc_trace.h | 8 | ||||
| -rw-r--r-- | drivers/net/hyperv/rndis_filter.c | 58 |
7 files changed, 378 insertions, 206 deletions
diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig index ca7bf7f897d3..982964c1a9fb 100644 --- a/drivers/net/hyperv/Kconfig +++ b/drivers/net/hyperv/Kconfig @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only config HYPERV_NET tristate "Microsoft Hyper-V virtual network driver" - depends on HYPERV + depends on HYPERV_VMBUS select UCS2_STRING + select NLS help Select this option to enable the Hyper-V virtual network driver. diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index c9dd69dbe1b8..7397c693f984 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -16,6 +16,7 @@ #include <linux/hyperv.h> #include <linux/rndis.h> #include <linux/jhash.h> +#include <net/xdp.h> /* RSS related */ #define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203 /* query only */ @@ -157,7 +158,6 @@ struct hv_netvsc_packet { u8 cp_partial; /* partial copy into send buffer */ u8 rmsg_size; /* RNDIS header and PPI size */ - u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */ u8 page_buf_cnt; u16 q_idx; @@ -463,7 +463,7 @@ struct nvsp_1_message_send_receive_buffer_complete { * LargeOffset SmallOffset */ - struct nvsp_1_receive_buffer_section sections[1]; + struct nvsp_1_receive_buffer_section sections[]; } __packed; /* @@ -881,7 +881,7 @@ struct nvsp_message { #define VRSS_SEND_TAB_SIZE 16 /* must be power of 2 */ #define VRSS_CHANNEL_MAX 64 -#define VRSS_CHANNEL_DEFAULT 8 +#define VRSS_CHANNEL_DEFAULT 16 #define RNDIS_MAX_PKT_DEFAULT 8 #define RNDIS_PKT_ALIGN_DEFAULT 8 @@ -892,6 +892,18 @@ struct nvsp_message { sizeof(struct nvsp_message)) #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) +/* Maximum # of contiguous data ranges that can make up a trasmitted packet. + * Typically it's the max SKB fragments plus 2 for the rndis packet and the + * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may + * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries + * in a GPA direct packet sent to netvsp over VMBus. + */ +#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT +#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2) +#else +#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT +#endif + /* Estimated requestor size: * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size */ @@ -1049,6 +1061,7 @@ struct net_device_context { struct net_device __rcu *vf_netdev; struct netvsc_vf_pcpu_stats __percpu *vf_stats; struct delayed_work vf_takeover; + struct delayed_work vfns_work; /* 1: allocated, serial number is valid. 0: not allocated */ u32 vf_alloc; @@ -1063,6 +1076,8 @@ struct net_device_context { struct netvsc_device_info *saved_netvsc_dev_info; }; +void netvsc_vfns_work(struct work_struct *w); + /* Azure hosts don't support non-TCP port numbers in hashing for fragmented * packets. We can use ethtool to change UDP hash level when necessary. */ @@ -1165,6 +1180,8 @@ struct netvsc_device { u32 max_chn; u32 num_chn; + u32 netvsc_gso_max_size; + atomic_t open_chn; struct work_struct subchan_work; wait_queue_head_t subchan_open; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 82e9796c8f5e..60a4629fe6ba 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -154,8 +154,11 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - vfree(nvdev->recv_buf); - vfree(nvdev->send_buf); + + if (!nvdev->recv_buf_gpadl_handle.decrypted) + vfree(nvdev->recv_buf); + if (!nvdev->send_buf_gpadl_handle.decrypted) + vfree(nvdev->send_buf); bitmap_free(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { @@ -708,7 +711,15 @@ void netvsc_device_remove(struct hv_device *device) /* Disable NAPI and disassociate its context from the device. */ for (i = 0; i < net_device->num_chn; i++) { /* See also vmbus_reset_channel_cb(). */ - napi_disable(&net_device->chan_table[i].napi); + /* only disable enabled NAPI channel */ + if (i < ndev->real_num_rx_queues) { + netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_TX, + NULL); + netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_RX, + NULL); + napi_disable(&net_device->chan_table[i].napi); + } + netif_napi_del(&net_device->chan_table[i].napi); } @@ -851,16 +862,17 @@ static void netvsc_send_completion(struct net_device *ndev, msglen); return; } - fallthrough; + break; case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE: if (msglen < sizeof(struct nvsp_message_header) + - sizeof(struct nvsp_1_message_send_receive_buffer_complete)) { + struct_size_t(struct nvsp_1_message_send_receive_buffer_complete, + sections, 1)) { netdev_err(ndev, "nvsp_msg1 length too small: %u\n", msglen); return; } - fallthrough; + break; case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE: if (msglen < sizeof(struct nvsp_message_header) + @@ -869,7 +881,7 @@ static void netvsc_send_completion(struct net_device *ndev, msglen); return; } - fallthrough; + break; case NVSP_MSG5_TYPE_SUBCHANNEL: if (msglen < sizeof(struct nvsp_message_header) + @@ -878,10 +890,6 @@ static void netvsc_send_completion(struct net_device *ndev, msglen); return; } - /* Copy the response back */ - memcpy(&net_device->channel_init_pkt, nvsp_packet, - sizeof(struct nvsp_message)); - complete(&net_device->channel_init_wait); break; case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE: @@ -904,13 +912,19 @@ static void netvsc_send_completion(struct net_device *ndev, netvsc_send_tx_complete(ndev, net_device, incoming_channel, desc, budget); - break; + return; default: netdev_err(ndev, "Unknown send completion type %d received!!\n", nvsp_packet->hdr.msg_type); + return; } + + /* Copy the response back */ + memcpy(&net_device->channel_init_pkt, nvsp_packet, + sizeof(struct nvsp_message)); + complete(&net_device->channel_init_wait); } static u32 netvsc_get_next_send_section(struct netvsc_device *net_device) @@ -939,8 +953,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, + pend_size; int i; u32 padding = 0; - u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt; u32 remain; /* Add padding */ @@ -1041,6 +1054,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev, return 0; } +/* Build an "array" of mpb entries describing the data to be transferred + * over VMBus. After the desc header fields, each "array" entry is variable + * size, and each entry starts after the end of the previous entry. The + * "offset" and "len" fields for each entry imply the size of the entry. + * + * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V + * uses that granularity, even if the system page size of the guest is larger. + * Each entry in the input "pb" array must describe a contiguous range of + * guest physical memory so that the pfns are sequential if the range crosses + * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE. + */ +static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb, + u32 page_buffer_count, + struct vmbus_packet_mpb_array *desc, + u32 *desc_size) +{ + struct hv_mpb_array *mpb_entry = &desc->range; + int i, j; + + for (i = 0; i < page_buffer_count; i++) { + u32 offset = pb[i].offset; + u32 len = pb[i].len; + + mpb_entry->offset = offset; + mpb_entry->len = len; + + for (j = 0; j < HVPFN_UP(offset + len); j++) + mpb_entry->pfn_array[j] = pb[i].pfn + j; + + mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j]; + } + + desc->rangecount = page_buffer_count; + *desc_size = (char *)mpb_entry - (char *)desc; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -1083,8 +1132,11 @@ static inline int netvsc_send_pkt( packet->dma_range = NULL; if (packet->page_buf_cnt) { + struct vmbus_channel_packet_page_buffer desc; + u32 desc_size; + if (packet->cp_partial) - pb += packet->rmsg_pgcnt; + pb++; ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); if (ret) { @@ -1092,11 +1144,12 @@ static inline int netvsc_send_pkt( goto exit; } - ret = vmbus_sendpacket_pagebuffer(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, sizeof(nvmsg), - req_id); - + netvsc_build_mpb_array(pb, packet->page_buf_cnt, + (struct vmbus_packet_mpb_array *)&desc, + &desc_size); + ret = vmbus_sendpacket_mpb_desc(out_channel, + (struct vmbus_packet_mpb_array *)&desc, + desc_size, &nvmsg, sizeof(nvmsg), req_id); if (ret) netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { @@ -1245,7 +1298,7 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = section_index; if (packet->cp_partial) { - packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->page_buf_cnt--; packet->total_data_buflen = msd_len + packet->rmsg_size; } else { packet->page_buf_cnt = 0; @@ -1759,6 +1812,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Enable NAPI handler before init callbacks */ netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll); + napi_enable(&net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, + &net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, + &net_device->chan_table[0].napi); /* Open the channel */ device->channel->next_request_id_callback = vmbus_next_request_id; @@ -1778,8 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - napi_enable(&net_device->chan_table[0].napi); - /* Connect with the NetVsp */ ret = netvsc_connect_vsp(device, net_device, device_info); if (ret != 0) { @@ -1797,12 +1853,14 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, close: RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - napi_disable(&net_device->chan_table[0].napi); /* Now, we can close the channel safely */ vmbus_close(device->channel); cleanup: + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(&net_device->chan_table[0].napi); netif_napi_del(&net_device->chan_table[0].napi); cleanup2: diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 4a9522689fa4..1dd3755d9e6d 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) xdp.command = XDP_SETUP_PROG; xdp.prog = prog; - ret = vf_netdev->netdev_ops->ndo_bpf(vf_netdev, &xdp); + ret = netif_xdp_propagate(vf_netdev, &xdp); if (ret && prog) bpf_prog_put(prog); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3ba3c8fb28a5..3d47d749ef9f 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -29,6 +29,7 @@ #include <linux/bpf.h> #include <net/arp.h> +#include <net/netdev_lock.h> #include <net/route.h> #include <net/sock.h> #include <net/pkt_sched.h> @@ -42,9 +43,13 @@ #define LINKCHANGE_INT (2 * HZ) #define VF_TAKEOVER_INT (HZ / 10) +/* Macros to define the context of vf registration */ +#define VF_REG_IN_PROBE 1 +#define VF_REG_IN_NOTIFIER 2 + static unsigned int ring_size __ro_after_init = 128; module_param(ring_size, uint, 0444); -MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)"); +MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)"); unsigned int netvsc_ring_bytes __ro_after_init; static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | @@ -321,43 +326,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len, - struct hv_page_buffer *pb) -{ - int j = 0; - - hvpfn += offset >> HV_HYP_PAGE_SHIFT; - offset = offset & ~HV_HYP_PAGE_MASK; - - while (len > 0) { - unsigned long bytes; - - bytes = HV_HYP_PAGE_SIZE - offset; - if (bytes > len) - bytes = len; - pb[j].pfn = hvpfn; - pb[j].offset = offset; - pb[j].len = bytes; - - offset += bytes; - len -= bytes; - - if (offset == HV_HYP_PAGE_SIZE && len) { - hvpfn++; - offset = 0; - j++; - } - } - - return j + 1; -} - static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 slots_used = 0; - char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; @@ -366,28 +338,27 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - slots_used += fill_pg_buf(virt_to_hvpfn(hdr), - offset_in_hvpage(hdr), - len, - &pb[slots_used]); + pb[0].offset = offset_in_hvpage(hdr); + pb[0].len = len; + pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = slots_used; - slots_used += fill_pg_buf(virt_to_hvpfn(data), - offset_in_hvpage(data), - skb_headlen(skb), - &pb[slots_used]); + pb[1].offset = offset_in_hvpage(skb->data); + pb[1].len = skb_headlen(skb); + pb[1].pfn = virt_to_hvpfn(skb->data); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; + struct hv_page_buffer *cur_pb = &pb[i + 2]; + u64 pfn = page_to_hvpfn(skb_frag_page(frag)); + u32 offset = skb_frag_off(frag); - slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), - skb_frag_off(frag), - skb_frag_size(frag), - &pb[slots_used]); + cur_pb->offset = offset_in_hvpage(offset); + cur_pb->len = skb_frag_size(frag); + cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT); } - return slots_used; + return frags + 2; } static int count_skb_frag_slots(struct sk_buff *skb) @@ -478,7 +449,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) struct net_device *vf_netdev; u32 rndis_msg_size; u32 hash; - struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT]; + struct hv_page_buffer pb[MAX_DATA_RANGES]; /* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. @@ -983,7 +954,8 @@ struct netvsc_device_info *netvsc_devinfo_get(struct netvsc_device *nvdev) dev_info->bprog = prog; } } else { - dev_info->num_chn = VRSS_CHANNEL_DEFAULT; + dev_info->num_chn = max(VRSS_CHANNEL_DEFAULT, + netif_get_num_default_rss_queues()); dev_info->send_sections = NETVSC_DEFAULT_TX; dev_info->send_section_size = NETVSC_SEND_SECTION_SIZE; dev_info->recv_sections = NETVSC_DEFAULT_RX; @@ -1229,14 +1201,14 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) if (ret) goto rollback_vf; - ndev->mtu = mtu; + WRITE_ONCE(ndev->mtu, mtu); ret = netvsc_attach(ndev, device_info); if (!ret) goto out; /* Attempt rollback to original MTU */ - ndev->mtu = orig_mtu; + WRITE_ONCE(ndev->mtu, orig_mtu); if (netvsc_attach(ndev, device_info)) netdev_err(ndev, "restoring mtu failed\n"); @@ -1399,7 +1371,7 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) struct net_device_context *ndc = netdev_priv(ndev); struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev); struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev); - struct sockaddr *addr = p; + struct sockaddr_storage *addr = p; int err; err = eth_prepare_mac_addr_change(ndev, p); @@ -1415,12 +1387,12 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) return err; } - err = rndis_filter_set_device_mac(nvdev, addr->sa_data); + err = rndis_filter_set_device_mac(nvdev, addr->__data); if (!err) { eth_commit_mac_addr_change(ndev, p); } else if (vf_netdev) { /* rollback change on VF */ - memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN); + memcpy(addr->__data, ndev->dev_addr, ETH_ALEN); dev_set_mac_address(vf_netdev, addr, NULL); } @@ -1552,7 +1524,7 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, data[i++] = xdp_tx; } - pcpu_sum = kvmalloc_array(num_possible_cpus(), + pcpu_sum = kvmalloc_array(nr_cpu_ids, sizeof(struct netvsc_ethtool_pcpu_stats), GFP_KERNEL); if (!pcpu_sum) @@ -1582,10 +1554,10 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) switch (stringset) { case ETH_SS_STATS: for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) - ethtool_sprintf(&p, netvsc_stats[i].name); + ethtool_puts(&p, netvsc_stats[i].name); for (i = 0; i < ARRAY_SIZE(vf_stats); i++) - ethtool_sprintf(&p, vf_stats[i].name); + ethtool_puts(&p, vf_stats[i].name); for (i = 0; i < nvdev->num_chn; i++) { ethtool_sprintf(&p, "tx_queue_%u_packets", i); @@ -1608,9 +1580,10 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) } static int -netvsc_get_rss_hash_opts(struct net_device_context *ndc, - struct ethtool_rxnfc *info) +netvsc_get_rxfh_fields(struct net_device *ndev, + struct ethtool_rxfh_fields *info) { + struct net_device_context *ndc = netdev_priv(ndev); const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3; info->data = RXH_IP_SRC | RXH_IP_DST; @@ -1651,30 +1624,24 @@ netvsc_get_rss_hash_opts(struct net_device_context *ndc, return 0; } -static int -netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, - u32 *rules) +static u32 netvsc_get_rx_ring_count(struct net_device *dev) { struct net_device_context *ndc = netdev_priv(dev); struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev); if (!nvdev) - return -ENODEV; - - switch (info->cmd) { - case ETHTOOL_GRXRINGS: - info->data = nvdev->num_chn; return 0; - case ETHTOOL_GRXFH: - return netvsc_get_rss_hash_opts(ndc, info); - } - return -EOPNOTSUPP; + return nvdev->num_chn; } -static int netvsc_set_rss_hash_opts(struct net_device_context *ndc, - struct ethtool_rxnfc *info) +static int +netvsc_set_rxfh_fields(struct net_device *dev, + const struct ethtool_rxfh_fields *info, + struct netlink_ext_ack *extack) { + struct net_device_context *ndc = netdev_priv(dev); + if (info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) { switch (info->flow_type) { @@ -1729,17 +1696,6 @@ static int netvsc_set_rss_hash_opts(struct net_device_context *ndc, return -EOPNOTSUPP; } -static int -netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info) -{ - struct net_device_context *ndc = netdev_priv(ndev); - - if (info->cmd == ETHTOOL_SRXFH) - return netvsc_set_rss_hash_opts(ndc, info); - - return -EOPNOTSUPP; -} - static u32 netvsc_get_rxfh_key_size(struct net_device *dev) { return NETVSC_HASH_KEYLEN; @@ -1752,8 +1708,8 @@ static u32 netvsc_rss_indir_size(struct net_device *dev) return ndc->rx_table_sz; } -static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, - u8 *hfunc) +static int netvsc_get_rxfh(struct net_device *dev, + struct ethtool_rxfh_param *rxfh) { struct net_device_context *ndc = netdev_priv(dev); struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev); @@ -1763,47 +1719,49 @@ static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, if (!ndev) return -ENODEV; - if (hfunc) - *hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */ + rxfh->hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */ rndis_dev = ndev->extension; - if (indir) { + if (rxfh->indir) { for (i = 0; i < ndc->rx_table_sz; i++) - indir[i] = ndc->rx_table[i]; + rxfh->indir[i] = ndc->rx_table[i]; } - if (key) - memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN); + if (rxfh->key) + memcpy(rxfh->key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN); return 0; } -static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir, - const u8 *key, const u8 hfunc) +static int netvsc_set_rxfh(struct net_device *dev, + struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) { struct net_device_context *ndc = netdev_priv(dev); struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev); struct rndis_device *rndis_dev; + u8 *key = rxfh->key; int i; if (!ndev) return -ENODEV; - if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && + rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP; rndis_dev = ndev->extension; - if (indir) { + if (rxfh->indir) { for (i = 0; i < ndc->rx_table_sz; i++) - if (indir[i] >= ndev->num_chn) + if (rxfh->indir[i] >= ndev->num_chn) return -EINVAL; for (i = 0; i < ndc->rx_table_sz; i++) - ndc->rx_table[i] = indir[i]; + ndc->rx_table[i] = rxfh->indir[i]; } if (!key) { - if (!indir) + if (!rxfh->indir) return 0; key = rndis_dev->rss_key; @@ -2004,12 +1962,13 @@ static const struct ethtool_ops ethtool_ops = { .get_channels = netvsc_get_channels, .set_channels = netvsc_set_channels, .get_ts_info = ethtool_op_get_ts_info, - .get_rxnfc = netvsc_get_rxnfc, - .set_rxnfc = netvsc_set_rxnfc, + .get_rx_ring_count = netvsc_get_rx_ring_count, .get_rxfh_key_size = netvsc_get_rxfh_key_size, .get_rxfh_indir_size = netvsc_rss_indir_size, .get_rxfh = netvsc_get_rxfh, .set_rxfh = netvsc_set_rxfh, + .get_rxfh_fields = netvsc_get_rxfh_fields, + .set_rxfh_fields = netvsc_set_rxfh_fields, .get_link_ksettings = netvsc_get_link_ksettings, .set_link_ksettings = netvsc_set_link_ksettings, .get_ringparam = netvsc_get_ringparam, @@ -2183,7 +2142,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb) } static int netvsc_vf_join(struct net_device *vf_netdev, - struct net_device *ndev) + struct net_device *ndev, int context) { struct net_device_context *ndev_ctx = netdev_priv(ndev); int ret; @@ -2206,10 +2165,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto upper_link_failed; } - /* set slave flag before open to prevent IPv6 addrconf */ - vf_netdev->flags |= IFF_SLAVE; - - schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); + /* If this registration is called from probe context vf_takeover + * is taken care of later in probe itself. + */ + if (context == VF_REG_IN_NOTIFIER) + schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); @@ -2315,16 +2275,18 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) } - /* Fallback path to check synthetic vf with - * help of mac addr + /* Fallback path to check synthetic vf with help of mac addr. + * Because this function can be called before vf_netdev is + * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied + * from dev_addr, also try to match to its dev_addr. + * Note: On Hyper-V and Azure, it's not possible to set a MAC address + * on a VF that matches to the MAC of a unrelated NETVSC device. */ list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { ndev = hv_get_drvdata(ndev_ctx->device_ctx); - if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) { - netdev_notice(vf_netdev, - "falling back to mac addr based matching\n"); + if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) || + ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) return ndev; - } } netdev_notice(vf_netdev, @@ -2332,7 +2294,23 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) return NULL; } -static int netvsc_register_vf(struct net_device *vf_netdev) +static int netvsc_prepare_bonding(struct net_device *vf_netdev) +{ + struct net_device *ndev; + + ndev = get_netvsc_byslot(vf_netdev); + if (!ndev) + return NOTIFY_DONE; + + /* Set slave flag and no addrconf flag before open + * to prevent IPv6 addrconf. + */ + vf_netdev->flags |= IFF_SLAVE; + vf_netdev->priv_flags |= IFF_NO_ADDRCONF; + return NOTIFY_DONE; +} + +static int netvsc_register_vf(struct net_device *vf_netdev, int context) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; @@ -2372,7 +2350,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); - if (netvsc_vf_join(vf_netdev, ndev) != 0) + if (netvsc_vf_join(vf_netdev, ndev, context) != 0) return NOTIFY_DONE; dev_hold(vf_netdev); @@ -2438,6 +2416,21 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event) } else { netdev_info(ndev, "Data path switched %s VF: %s\n", vf_is_up ? "to" : "from", vf_netdev->name); + + /* In Azure, when accelerated networking in enabled, other NICs + * like MANA, MLX, are configured as a bonded nic with + * Netvsc(failover) NIC. For bonded NICs, the min of the max + * pkt aggregate size of the members is propagated in the stack. + * In order to allow these NICs (MANA/MLX) to use up to + * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to + * also support this in the guest. + * This value is only increased for netvsc NIC when datapath is + * switched over to the VF + */ + if (vf_is_up) + netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); + else + netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size); } return NOTIFY_OK; @@ -2457,8 +2450,6 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); - netvsc_vf_setxdp(vf_netdev, NULL); - reinit_completion(&net_device_ctx->vf_add); netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); @@ -2470,10 +2461,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) return NOTIFY_OK; } +static int check_dev_is_matching_vf(struct net_device *event_ndev) +{ + /* Skip NetVSC interfaces */ + if (event_ndev->netdev_ops == &device_ops) + return -ENODEV; + + /* Avoid non-Ethernet type devices */ + if (event_ndev->type != ARPHRD_ETHER) + return -ENODEV; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(event_ndev)) + return -ENODEV; + + /* Avoid Bonding master dev with same MAC registering as VF */ + if (netif_is_bond_master(event_ndev)) + return -ENODEV; + + return 0; +} + static int netvsc_probe(struct hv_device *dev, const struct hv_vmbus_device_id *dev_id) { - struct net_device *net = NULL; + struct net_device *net = NULL, *vf_netdev; struct net_device_context *net_device_ctx; struct netvsc_device_info *device_info = NULL; struct netvsc_device *nvdev; @@ -2503,6 +2515,7 @@ static int netvsc_probe(struct hv_device *dev, spin_lock_init(&net_device_ctx->lock); INIT_LIST_HEAD(&net_device_ctx->reconfig_events); INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup); + INIT_DELAYED_WORK(&net_device_ctx->vfns_work, netvsc_vfns_work); net_device_ctx->vf_stats = netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats); @@ -2531,15 +2544,6 @@ static int netvsc_probe(struct hv_device *dev, goto devinfo_failed; } - nvdev = rndis_filter_device_add(dev, device_info); - if (IS_ERR(nvdev)) { - ret = PTR_ERR(nvdev); - netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); - goto rndis_failed; - } - - eth_hw_addr_set(net, device_info->mac_adr); - /* We must get rtnl lock before scheduling nvdev->subchan_work, * otherwise netvsc_subchan_work() can get rtnl lock first and wait * all subchannels to show up, but that may not happen because @@ -2547,9 +2551,23 @@ static int netvsc_probe(struct hv_device *dev, * -> ... -> device_add() -> ... -> __device_attach() can't get * the device lock, so all the subchannels can't be processed -- * finally netvsc_subchan_work() hangs forever. + * + * The rtnl lock also needs to be held before rndis_filter_device_add() + * which advertises nvsp_2_vsc_capability / sriov bit, and triggers + * VF NIC offering and registering. If VF NIC finished register_netdev() + * earlier it may cause name based config failure. */ rtnl_lock(); + nvdev = rndis_filter_device_add(dev, device_info); + if (IS_ERR(nvdev)) { + ret = PTR_ERR(nvdev); + netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); + goto rndis_failed; + } + + eth_hw_addr_set(net, device_info->mac_adr); + if (nvdev->num_chn > 1) schedule_work(&nvdev->subchan_work); @@ -2580,15 +2598,41 @@ static int netvsc_probe(struct hv_device *dev, } list_add(&net_device_ctx->list, &netvsc_dev_list); + + /* When the hv_netvsc driver is unloaded and reloaded, the + * NET_DEVICE_REGISTER for the vf device is replayed before probe + * is complete. This is because register_netdevice_notifier() gets + * registered before vmbus_driver_register() so that callback func + * is set before probe and we don't miss events like NETDEV_POST_INIT + * So, in this section we try to register the matching vf device that + * is present as a netdevice, knowing that its register call is not + * processed in the netvsc_netdev_notifier(as probing is progress and + * get_netvsc_byslot fails). + */ + for_each_netdev(dev_net(net), vf_netdev) { + ret = check_dev_is_matching_vf(vf_netdev); + if (ret != 0) + continue; + + if (net != get_netvsc_byslot(vf_netdev)) + continue; + + netvsc_prepare_bonding(vf_netdev); + netdev_lock_ops(vf_netdev); + netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE); + netdev_unlock_ops(vf_netdev); + __netvsc_vf_setup(net, vf_netdev); + break; + } rtnl_unlock(); netvsc_devinfo_put(device_info); return 0; register_failed: - rtnl_unlock(); rndis_filter_device_remove(dev, nvdev); rndis_failed: + rtnl_unlock(); netvsc_devinfo_put(device_info); devinfo_failed: free_percpu(net_device_ctx->vf_stats); @@ -2616,6 +2660,8 @@ static void netvsc_remove(struct hv_device *dev) cancel_delayed_work_sync(&ndev_ctx->dwork); rtnl_lock(); + cancel_delayed_work_sync(&ndev_ctx->vfns_work); + nvdev = rtnl_dereference(ndev_ctx->nvdev); if (nvdev) { cancel_work_sync(&nvdev->subchan_work); @@ -2657,6 +2703,7 @@ static int netvsc_suspend(struct hv_device *dev) cancel_delayed_work_sync(&ndev_ctx->dwork); rtnl_lock(); + cancel_delayed_work_sync(&ndev_ctx->vfns_work); nvdev = rtnl_dereference(ndev_ctx->nvdev); if (nvdev == NULL) { @@ -2725,6 +2772,52 @@ static struct hv_driver netvsc_drv = { }, }; +/* Set VF's namespace same as the synthetic NIC */ +static void netvsc_event_set_vf_ns(struct net_device *ndev) +{ + struct net_device_context *ndev_ctx = netdev_priv(ndev); + struct net_device *vf_netdev; + int ret; + + vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); + if (!vf_netdev) + return; + + if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { + ret = dev_change_net_namespace(vf_netdev, dev_net(ndev), + "eth%d"); + if (ret) + netdev_err(vf_netdev, + "Cannot move to same namespace as %s: %d\n", + ndev->name, ret); + else + netdev_info(vf_netdev, + "Moved VF to namespace with: %s\n", + ndev->name); + } +} + +void netvsc_vfns_work(struct work_struct *w) +{ + struct net_device_context *ndev_ctx = + container_of(w, struct net_device_context, vfns_work.work); + struct net_device *ndev; + + if (!rtnl_trylock()) { + schedule_delayed_work(&ndev_ctx->vfns_work, 1); + return; + } + + ndev = hv_get_drvdata(ndev_ctx->device_ctx); + if (!ndev) + goto out; + + netvsc_event_set_vf_ns(ndev); + +out: + rtnl_unlock(); +} + /* * On Hyper-V, every VF interface is matched with a corresponding * synthetic interface. The synthetic interface is presented first @@ -2735,26 +2828,24 @@ static int netvsc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct net_device_context *ndev_ctx; + int ret = 0; - /* Skip our own events */ - if (event_dev->netdev_ops == &device_ops) - return NOTIFY_DONE; - - /* Avoid non-Ethernet type devices */ - if (event_dev->type != ARPHRD_ETHER) - return NOTIFY_DONE; - - /* Avoid Vlan dev with same MAC registering as VF */ - if (is_vlan_dev(event_dev)) + if (event_dev->netdev_ops == &device_ops && event == NETDEV_REGISTER) { + ndev_ctx = netdev_priv(event_dev); + schedule_delayed_work(&ndev_ctx->vfns_work, 0); return NOTIFY_DONE; + } - /* Avoid Bonding master dev with same MAC registering as VF */ - if (netif_is_bond_master(event_dev)) + ret = check_dev_is_matching_vf(event_dev); + if (ret != 0) return NOTIFY_DONE; switch (event) { + case NETDEV_POST_INIT: + return netvsc_prepare_bonding(event_dev); case NETDEV_REGISTER: - return netvsc_register_vf(event_dev); + return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER); case NETDEV_UNREGISTER: return netvsc_unregister_vf(event_dev); case NETDEV_UP: @@ -2786,14 +2877,19 @@ static int __init netvsc_drv_init(void) pr_info("Increased ring_size to %u (min allowed)\n", ring_size); } - netvsc_ring_bytes = ring_size * PAGE_SIZE; + netvsc_ring_bytes = VMBUS_RING_SIZE(ring_size * 4096); + + register_netdevice_notifier(&netvsc_netdev_notifier); ret = vmbus_driver_register(&netvsc_drv); if (ret) - return ret; + goto err_vmbus_reg; - register_netdevice_notifier(&netvsc_netdev_notifier); return 0; + +err_vmbus_reg: + unregister_netdevice_notifier(&netvsc_netdev_notifier); + return ret; } MODULE_LICENSE("GPL"); diff --git a/drivers/net/hyperv/netvsc_trace.h b/drivers/net/hyperv/netvsc_trace.h index f7585563dea5..05e620cbdd29 100644 --- a/drivers/net/hyperv/netvsc_trace.h +++ b/drivers/net/hyperv/netvsc_trace.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(rndis_msg_class, __field( u32, msg_len ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->queue = q; __entry->req_id = msg->msg.init_req.req_id; __entry->msg_type = msg->ndis_msg_type; @@ -121,7 +121,7 @@ TRACE_EVENT(nvsp_send, __field( u32, msg_type ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->msg_type = msg->hdr.msg_type; ), TP_printk("dev=%s type=%s", @@ -142,7 +142,7 @@ TRACE_EVENT(nvsp_send_pkt, __field( u32, section_size ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->qid = chan->offermsg.offer.sub_channel_index; __entry->channel_type = rpkt->channel_type; __entry->section_index = rpkt->send_buf_section_index; @@ -165,7 +165,7 @@ TRACE_EVENT(nvsp_recv, __field( u32, msg_type ) ), TP_fast_assign( - __assign_str(name, ndev->name); + __assign_str(name); __entry->qid = chan->offermsg.offer.sub_channel_index; __entry->msg_type = msg->hdr.msg_type; ), diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index af95947a87c5..c35f9685b6bf 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -21,7 +21,6 @@ #include <linux/rtnetlink.h> #include <linux/ucs2_string.h> #include <linux/string.h> -#include <linux/slab.h> #include "hyperv_net.h" #include "netvsc_trace.h" @@ -226,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { struct hv_netvsc_packet *packet; - struct hv_page_buffer page_buf[2]; - struct hv_page_buffer *pb = page_buf; + struct hv_page_buffer pb; int ret; /* Setup the packet to send it */ @@ -236,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->total_data_buflen = req->request_msg.msg_len; packet->page_buf_cnt = 1; - pb[0].pfn = virt_to_phys(&req->request_msg) >> - HV_HYP_PAGE_SHIFT; - pb[0].len = req->request_msg.msg_len; - pb[0].offset = offset_in_hvpage(&req->request_msg); - - /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { - packet->page_buf_cnt++; - pb[0].len = HV_HYP_PAGE_SIZE - - pb[0].offset; - pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> HV_HYP_PAGE_SHIFT; - pb[1].offset = 0; - pb[1].len = req->request_msg.msg_len - - pb[0].len; - } + pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT; + pb.len = req->request_msg.msg_len; + pb.offset = offset_in_hvpage(&req->request_msg); trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); + ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false); rcu_read_unlock_bh(); return ret; @@ -1267,13 +1252,27 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; + /* Enable napi before opening the vmbus channel to avoid races + * as the host placing data on the host->guest ring may be left + * out if napi was not enabled. + */ + napi_enable(&nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + &nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + &nvchan->napi); + ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); - if (ret == 0) - napi_enable(&nvchan->napi); - else + if (ret != 0) { netdev_notice(ndev, "sub channel open failed: %d\n", ret); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + NULL); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + NULL); + napi_disable(&nvchan->napi); + } if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) wake_up(&nvscdev->subchan_open); @@ -1352,9 +1351,10 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; + nvdev->netvsc_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* Find HW offload capabilities */ ret = rndis_query_hwcaps(rndis_device, nvdev, &hwcaps); if (ret != 0) @@ -1386,8 +1386,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv4 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO; - if (hwcaps.lsov2.ip4_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip4_maxsz; + if (hwcaps.lsov2.ip4_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip4_maxsz; } if (hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { @@ -1407,8 +1407,8 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, offloads.lso_v2_ipv6 = NDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; net->hw_features |= NETIF_F_TSO6; - if (hwcaps.lsov2.ip6_maxsz < gso_max_size) - gso_max_size = hwcaps.lsov2.ip6_maxsz; + if (hwcaps.lsov2.ip6_maxsz < nvdev->netvsc_gso_max_size) + nvdev->netvsc_gso_max_size = hwcaps.lsov2.ip6_maxsz; } if (hwcaps.csum.ip6_txcsum & NDIS_TXCSUM_CAP_UDP6) { @@ -1434,7 +1434,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, */ net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features; - netif_set_tso_max_size(net, gso_max_size); + netif_set_tso_max_size(net, nvdev->netvsc_gso_max_size); ret = rndis_filter_set_offload_params(net, nvdev, &offloads); |
