diff options
Diffstat (limited to 'drivers/vdpa')
40 files changed, 9212 insertions, 1303 deletions
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 50f45d037611..857cf288c876 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -11,8 +11,7 @@ if VDPA config VDPA_SIM tristate "vDPA device simulator core" - depends on RUNTIME_TESTING_MENU && HAS_DMA - select DMA_OPS + depends on RUNTIME_TESTING_MENU select VHOST_RING select IOMMU_IOVA help @@ -35,8 +34,7 @@ config VDPA_SIM_BLOCK config VDPA_USER tristate "VDUSE (vDPA Device in Userspace) support" - depends on EVENTFD && MMU && HAS_DMA - select DMA_OPS + depends on EVENTFD && MMU select VHOST_IOTLB select IOMMU_IOVA help @@ -71,6 +69,18 @@ config MLX5_VDPA_NET be executed by the hardware. It also supports a variety of stateless offloads depending on the actual device used and firmware version. +config MLX5_VDPA_STEERING_DEBUG + bool "expose steering counters on debugfs" + select MLX5_VDPA + help + Expose RX steering counters in debugfs to aid in debugging. For each VLAN + or non VLAN interface, two hardware counters are added to the RX flow + table: one for unicast and one for multicast. + The counters counts the number of packets and bytes and exposes them in + debugfs. Once can read the counters using, e.g.: + cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/ucast/packets + cat /sys/kernel/debug/mlx5/mlx5_core.sf.1/vdpa-0/rx/untagged/mcast/bytes + config VP_VDPA tristate "Virtio PCI bridge vDPA driver" select VIRTIO_PCI_LIB @@ -86,4 +96,43 @@ config ALIBABA_ENI_VDPA VDPA driver for Alibaba ENI (Elastic Network Interface) which is built upon virtio 0.9.5 specification. + config SNET_VDPA + tristate "SolidRun's vDPA driver for SolidNET" + depends on PCI_MSI && PCI_IOV && (HWMON || HWMON=n) + + # This driver MAY create a HWMON device. + # Depending on (HWMON || HWMON=n) ensures that: + # If HWMON=n the driver can be compiled either as a module or built-in. + # If HWMON=y the driver can be compiled either as a module or built-in. + # If HWMON=m the driver is forced to be compiled as a module. + # By doing so, IS_ENABLED can be used instead of IS_REACHABLE + + help + vDPA driver for SolidNET DPU. + With this driver, the VirtIO dataplane can be + offloaded to a SolidNET DPU. + This driver includes a HW monitor device that + reads health values from the DPU. + +config PDS_VDPA + tristate "vDPA driver for AMD/Pensando DSC devices" + select VIRTIO_PCI_LIB + depends on PCI_MSI + depends on PDS_CORE + help + vDPA network driver for AMD/Pensando's PDS Core devices. + With this driver, the VirtIO dataplane can be + offloaded to an AMD/Pensando DSC device. + +config OCTEONEP_VDPA + tristate "vDPA driver for Octeon DPU devices" + depends on m + depends on PCI_MSI + help + This is a vDPA driver designed for Marvell's Octeon DPU devices. + This driver enables the offloading of the VirtIO dataplane to an + Octeon DPU device. + Please note that this driver must be built as a module and it + cannot be loaded until the Octeon emulation software is running. + endif # VDPA diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index 15665563a7f4..5654d36707af 100644 --- a/drivers/vdpa/Makefile +++ b/drivers/vdpa/Makefile @@ -6,3 +6,6 @@ obj-$(CONFIG_IFCVF) += ifcvf/ obj-$(CONFIG_MLX5_VDPA) += mlx5/ obj-$(CONFIG_VP_VDPA) += virtio_pci/ obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/ +obj-$(CONFIG_SNET_VDPA) += solidrun/ +obj-$(CONFIG_PDS_VDPA) += pds/ +obj-$(CONFIG_OCTEONEP_VDPA) += octeon_ep/ diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index 5a09a09cca70..e476504db0c8 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -254,6 +254,13 @@ static u16 eni_vdpa_get_vq_num_min(struct vdpa_device *vdpa) return vp_legacy_get_queue_size(ldev, 0); } +static u16 eni_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 qid) +{ + struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa); + + return vp_legacy_get_queue_size(ldev, qid); +} + static int eni_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid, struct vdpa_vq_state *state) { @@ -416,6 +423,7 @@ static const struct vdpa_config_ops eni_vdpa_ops = { .reset = eni_vdpa_reset, .get_vq_num_max = eni_vdpa_get_vq_num_max, .get_vq_num_min = eni_vdpa_get_vq_num_min, + .get_vq_size = eni_vdpa_get_vq_size, .get_vq_state = eni_vdpa_get_vq_state, .set_vq_state = eni_vdpa_set_vq_state, .set_vq_cb = eni_vdpa_set_vq_cb, @@ -470,7 +478,8 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, - dev, &eni_vdpa_ops, 1, 1, NULL, false); + dev, &eni_vdpa_ops, NULL, + 1, 1, NULL, false); if (IS_ERR(eni_vdpa)) { ENI_ERR(pdev, "failed to allocate vDPA structure\n"); return PTR_ERR(eni_vdpa); @@ -488,7 +497,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); pci_set_drvdata(pdev, eni_vdpa); - eni_vdpa->vdpa.dma_dev = &pdev->dev; + eni_vdpa->vdpa.vmap.dma_dev = &pdev->dev; eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa); eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues, @@ -497,7 +506,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!eni_vdpa->vring) { ret = -ENOMEM; ENI_ERR(pdev, "failed to allocate virtqueues\n"); - goto err; + goto err_remove_vp_legacy; } for (i = 0; i < eni_vdpa->queues; i++) { @@ -509,11 +518,13 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vdpa_register_device(&eni_vdpa->vdpa, eni_vdpa->queues); if (ret) { ENI_ERR(pdev, "failed to register to vdpa bus\n"); - goto err; + goto err_remove_vp_legacy; } return 0; +err_remove_vp_legacy: + vp_legacy_remove(&eni_vdpa->ldev); err: put_device(&eni_vdpa->vdpa.dev); return ret; diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index 3e4486bfa0b7..d5507b63b6cd 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -10,11 +10,6 @@ #include "ifcvf_base.h" -struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw) -{ - return container_of(hw, struct ifcvf_adapter, vf); -} - u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector) { struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; @@ -37,8 +32,6 @@ u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector) static void __iomem *get_cap_addr(struct ifcvf_hw *hw, struct virtio_pci_cap *cap) { - struct ifcvf_adapter *ifcvf; - struct pci_dev *pdev; u32 length, offset; u8 bar; @@ -46,17 +39,14 @@ static void __iomem *get_cap_addr(struct ifcvf_hw *hw, offset = le32_to_cpu(cap->offset); bar = cap->bar; - ifcvf= vf_to_adapter(hw); - pdev = ifcvf->pdev; - if (bar >= IFCVF_PCI_MAX_RESOURCE) { - IFCVF_DBG(pdev, + IFCVF_DBG(hw->pdev, "Invalid bar number %u to get capabilities\n", bar); return NULL; } - if (offset + length > pci_resource_len(pdev, bar)) { - IFCVF_DBG(pdev, + if (offset + length > pci_resource_len(hw->pdev, bar)) { + IFCVF_DBG(hw->pdev, "offset(%u) + len(%u) overflows bar%u's capability\n", offset, length, bar); return NULL; @@ -79,6 +69,36 @@ static int ifcvf_read_config_range(struct pci_dev *dev, return 0; } +u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid) +{ + u16 queue_size; + + if (qid >= hw->nr_vring) + return 0; + + vp_iowrite16(qid, &hw->common_cfg->queue_select); + queue_size = vp_ioread16(&hw->common_cfg->queue_size); + + return queue_size; +} + +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw) +{ + u16 queue_size, max_size, qid; + + max_size = ifcvf_get_vq_size(hw, 0); + for (qid = 1; qid < hw->nr_vring; qid++) { + queue_size = ifcvf_get_vq_size(hw, qid); + /* 0 means the queue is unavailable */ + if (!queue_size) + continue; + + max_size = max(queue_size, max_size); + } + + return max_size; +} + int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) { struct virtio_pci_cap cap; @@ -88,10 +108,11 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) u32 i; ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); - if (ret < 0) { + if (ret) { IFCVF_ERR(pdev, "Failed to read PCI capability list\n"); return -EIO; } + hw->pdev = pdev; while (pos) { ret = ifcvf_read_config_range(pdev, (u32 *)&cap, @@ -143,6 +164,9 @@ next: } hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues); + hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, GFP_KERNEL); + if (!hw->vring) + return -ENOMEM; for (i = 0; i < hw->nr_vring; i++) { vp_iowrite16(i, &hw->common_cfg->queue_select); @@ -179,21 +203,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status) void ifcvf_reset(struct ifcvf_hw *hw) { - hw->config_cb.callback = NULL; - hw->config_cb.private = NULL; - ifcvf_set_status(hw, 0); - /* flush set_status, make sure VF is stopped, reset */ - ifcvf_get_status(hw); -} - -static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status) -{ - if (status != 0) - status |= ifcvf_get_status(hw); - - ifcvf_set_status(hw, status); - ifcvf_get_status(hw); + while (ifcvf_get_status(hw)) + msleep(1); } u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) @@ -213,17 +225,33 @@ u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) return features; } -u64 ifcvf_get_features(struct ifcvf_hw *hw) +/* return provisioned vDPA dev features */ +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw) { - return hw->hw_features; + return hw->dev_features; } -int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw) { - struct ifcvf_adapter *ifcvf = vf_to_adapter(hw); + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + u32 features_lo, features_hi; + u64 features; + + vp_iowrite32(0, &cfg->device_feature_select); + features_lo = vp_ioread32(&cfg->guest_feature); + + vp_iowrite32(1, &cfg->device_feature_select); + features_hi = vp_ioread32(&cfg->guest_feature); + features = ((u64)features_hi << 32) | features_lo; + + return features; +} + +int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) +{ if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { - IFCVF_ERR(ifcvf->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); + IFCVF_ERR(hw->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); return -EINVAL; } @@ -232,13 +260,11 @@ int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) u32 ifcvf_get_config_size(struct ifcvf_hw *hw) { - struct ifcvf_adapter *adapter; u32 net_config_size = sizeof(struct virtio_net_config); u32 blk_config_size = sizeof(struct virtio_blk_config); u32 cap_size = hw->cap_dev_config_size; u32 config_size; - adapter = vf_to_adapter(hw); /* If the onboard device config space size is greater than * the size of struct virtio_net/blk_config, only the spec * implementing contents size is returned, this is very @@ -253,7 +279,7 @@ u32 ifcvf_get_config_size(struct ifcvf_hw *hw) break; default: config_size = 0; - IFCVF_ERR(adapter->pdev, "VIRTIO ID %u not supported\n", hw->dev_type); + IFCVF_ERR(hw->pdev, "VIRTIO ID %u not supported\n", hw->dev_type); } return config_size; @@ -288,7 +314,7 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset, vp_iowrite8(*p++, hw->dev_cfg + offset + i); } -static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features) +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features) { struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; @@ -299,108 +325,104 @@ static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features) vp_iowrite32(features >> 32, &cfg->guest_feature); } -static int ifcvf_config_features(struct ifcvf_hw *hw) -{ - struct ifcvf_adapter *ifcvf; - - ifcvf = vf_to_adapter(hw); - ifcvf_set_features(hw, hw->req_features); - ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK); - - if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) { - IFCVF_ERR(ifcvf->pdev, "Failed to set FEATURES_OK status\n"); - return -EIO; - } - - return 0; -} - u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid) { - struct ifcvf_lm_cfg __iomem *ifcvf_lm; - void __iomem *avail_idx_addr; + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; u16 last_avail_idx; - u32 q_pair_id; - ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; - q_pair_id = qid / 2; - avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; - last_avail_idx = vp_ioread16(avail_idx_addr); + last_avail_idx = vp_ioread16(&lm_cfg->vq_state_region + qid * 2); return last_avail_idx; } int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num) { - struct ifcvf_lm_cfg __iomem *ifcvf_lm; - void __iomem *avail_idx_addr; - u32 q_pair_id; + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; - ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; - q_pair_id = qid / 2; - avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; - hw->vring[qid].last_avail_idx = num; - vp_iowrite16(num, avail_idx_addr); + vp_iowrite16(num, &lm_cfg->vq_state_region + qid * 2); return 0; } -static int ifcvf_hw_enable(struct ifcvf_hw *hw) +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num) { - struct virtio_pci_common_cfg __iomem *cfg; - u32 i; + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; - cfg = hw->common_cfg; - for (i = 0; i < hw->nr_vring; i++) { - if (!hw->vring[i].ready) - break; + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite16(num, &cfg->queue_size); +} - vp_iowrite16(i, &cfg->queue_select); - vp_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo, - &cfg->queue_desc_hi); - vp_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo, - &cfg->queue_avail_hi); - vp_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo, - &cfg->queue_used_hi); - vp_iowrite16(hw->vring[i].size, &cfg->queue_size); - ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx); - vp_iowrite16(1, &cfg->queue_enable); - } +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_area, &cfg->queue_used_lo, + &cfg->queue_used_hi); return 0; } -static void ifcvf_hw_disable(struct ifcvf_hw *hw) +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid) { - u32 i; + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; + u16 queue_enable; - ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); - for (i = 0; i < hw->nr_vring; i++) { - ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR); - } + vp_iowrite16(qid, &cfg->queue_select); + queue_enable = vp_ioread16(&cfg->queue_enable); + + return (bool)queue_enable; } -int ifcvf_start_hw(struct ifcvf_hw *hw) +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready) { - ifcvf_reset(hw); - ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE); - ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER); + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; - if (ifcvf_config_features(hw) < 0) - return -EINVAL; + vp_iowrite16(qid, &cfg->queue_select); + vp_iowrite16(ready, &cfg->queue_enable); +} - if (ifcvf_hw_enable(hw) < 0) - return -EINVAL; +static void ifcvf_reset_vring(struct ifcvf_hw *hw) +{ + u16 qid; + + for (qid = 0; qid < hw->nr_vring; qid++) { + hw->vring[qid].cb.callback = NULL; + hw->vring[qid].cb.private = NULL; + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); + } +} - ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK); +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) +{ + hw->config_cb.callback = NULL; + hw->config_cb.private = NULL; + ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); +} - return 0; +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) +{ + u32 nvectors = hw->num_msix_vectors; + struct pci_dev *pdev = hw->pdev; + int i, irq; + + for (i = 0; i < nvectors; i++) { + irq = pci_irq_vector(pdev, i); + if (irq >= 0) + synchronize_irq(irq); + } } -void ifcvf_stop_hw(struct ifcvf_hw *hw) +void ifcvf_stop(struct ifcvf_hw *hw) { - ifcvf_hw_disable(hw); - ifcvf_reset(hw); + ifcvf_synchronize_irq(hw); + ifcvf_reset_vring(hw); + ifcvf_reset_config_handler(hw); } void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index f5563f665cc6..aa36de361c10 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -19,28 +19,21 @@ #include <uapi/linux/virtio_blk.h> #include <uapi/linux/virtio_config.h> #include <uapi/linux/virtio_pci.h> +#include <uapi/linux/vdpa.h> #define N3000_DEVICE_ID 0x1041 #define N3000_SUBSYS_DEVICE_ID 0x001A -/* Max 8 data queue pairs(16 queues) and one control vq for now. */ -#define IFCVF_MAX_QUEUES 17 - #define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE -#define IFCVF_QUEUE_MAX 32768 #define IFCVF_PCI_MAX_RESOURCE 6 -#define IFCVF_LM_CFG_SIZE 0x40 -#define IFCVF_LM_RING_STATE_OFFSET 0x20 #define IFCVF_LM_BAR 4 +#define IFCVF_MIN_VQ_SIZE 64 #define IFCVF_ERR(pdev, fmt, ...) dev_err(&pdev->dev, fmt, ##__VA_ARGS__) #define IFCVF_DBG(pdev, fmt, ...) dev_dbg(&pdev->dev, fmt, ##__VA_ARGS__) #define IFCVF_INFO(pdev, fmt, ...) dev_info(&pdev->dev, fmt, ##__VA_ARGS__) -#define ifcvf_private_to_vf(adapter) \ - (&((struct ifcvf_adapter *)adapter)->vf) - /* all vqs and config interrupt has its own vector */ #define MSIX_VECTOR_PER_VQ_AND_CONFIG 1 /* all vqs share a vector, and config interrupt has a separate vector */ @@ -49,12 +42,7 @@ #define MSIX_VECTOR_DEV_SHARED 3 struct vring_info { - u64 desc; - u64 avail; - u64 used; - u16 size; u16 last_avail_idx; - bool ready; void __iomem *notify_addr; phys_addr_t notify_pa; u32 irq; @@ -62,10 +50,18 @@ struct vring_info { char msix_name[256]; }; +struct ifcvf_lm_cfg { + __le64 control; + __le64 status; + __le64 lm_mem_log_start_addr; + __le64 lm_mem_log_end_addr; + __le16 vq_state_region; +}; + struct ifcvf_hw { u8 __iomem *isr; /* Live migration */ - u8 __iomem *lm_cfg; + struct ifcvf_lm_cfg __iomem *lm_cfg; /* Notification bar number */ u8 notify_bar; u8 msix_vector_status; @@ -76,11 +72,12 @@ struct ifcvf_hw { phys_addr_t notify_base_pa; u32 notify_off_multiplier; u32 dev_type; - u64 req_features; u64 hw_features; + /* provisioned device features */ + u64 dev_features; struct virtio_pci_common_cfg __iomem *common_cfg; void __iomem *dev_cfg; - struct vring_info vring[IFCVF_MAX_QUEUES]; + struct vring_info *vring; void __iomem * const *base; char config_msix_name[256]; struct vdpa_callback config_cb; @@ -88,34 +85,26 @@ struct ifcvf_hw { int vqs_reused_irq; u16 nr_vring; /* VIRTIO_PCI_CAP_DEVICE_CFG size */ + u32 num_msix_vectors; u32 cap_dev_config_size; + struct pci_dev *pdev; }; struct ifcvf_adapter { struct vdpa_device vdpa; struct pci_dev *pdev; - struct ifcvf_hw vf; -}; - -struct ifcvf_vring_lm_cfg { - u32 idx_addr[2]; - u8 reserved[IFCVF_LM_CFG_SIZE - 8]; -}; - -struct ifcvf_lm_cfg { - u8 reserved[IFCVF_LM_RING_STATE_OFFSET]; - struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES]; + struct ifcvf_hw *vf; }; struct ifcvf_vdpa_mgmt_dev { struct vdpa_mgmt_dev mdev; + struct ifcvf_hw vf; struct ifcvf_adapter *adapter; struct pci_dev *pdev; }; int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev); -int ifcvf_start_hw(struct ifcvf_hw *hw); -void ifcvf_stop_hw(struct ifcvf_hw *hw); +void ifcvf_stop(struct ifcvf_hw *hw); void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid); void ifcvf_read_dev_config(struct ifcvf_hw *hw, u64 offset, void *dst, int length); @@ -123,16 +112,22 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset, const void *src, int length); u8 ifcvf_get_status(struct ifcvf_hw *hw); void ifcvf_set_status(struct ifcvf_hw *hw, u8 status); -void io_write64_twopart(u64 val, u32 *lo, u32 *hi); void ifcvf_reset(struct ifcvf_hw *hw); -u64 ifcvf_get_features(struct ifcvf_hw *hw); +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw); u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); -struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); -int ifcvf_probed_virtio_net(struct ifcvf_hw *hw); u32 ifcvf_get_config_size(struct ifcvf_hw *hw); u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector); u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector); +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num); +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area, + u64 driver_area, u64 device_area); +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid); +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready); +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features); +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw); +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw); +u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid); #endif /* _IFCVF_H_ */ diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index f9c0044c6442..6658dc74d915 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -69,10 +69,9 @@ static void ifcvf_free_irq_vectors(void *data) pci_free_irq_vectors(data); } -static void ifcvf_free_per_vq_irq(struct ifcvf_adapter *adapter) +static void ifcvf_free_per_vq_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int i; for (i = 0; i < vf->nr_vring; i++) { @@ -83,10 +82,9 @@ static void ifcvf_free_per_vq_irq(struct ifcvf_adapter *adapter) } } -static void ifcvf_free_vqs_reused_irq(struct ifcvf_adapter *adapter) +static void ifcvf_free_vqs_reused_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; if (vf->vqs_reused_irq != -EINVAL) { devm_free_irq(&pdev->dev, vf->vqs_reused_irq, vf); @@ -95,20 +93,17 @@ static void ifcvf_free_vqs_reused_irq(struct ifcvf_adapter *adapter) } -static void ifcvf_free_vq_irq(struct ifcvf_adapter *adapter) +static void ifcvf_free_vq_irq(struct ifcvf_hw *vf) { - struct ifcvf_hw *vf = &adapter->vf; - if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) - ifcvf_free_per_vq_irq(adapter); + ifcvf_free_per_vq_irq(vf); else - ifcvf_free_vqs_reused_irq(adapter); + ifcvf_free_vqs_reused_irq(vf); } -static void ifcvf_free_config_irq(struct ifcvf_adapter *adapter) +static void ifcvf_free_config_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; if (vf->config_irq == -EINVAL) return; @@ -123,13 +118,14 @@ static void ifcvf_free_config_irq(struct ifcvf_adapter *adapter) } } -static void ifcvf_free_irq(struct ifcvf_adapter *adapter) +static void ifcvf_free_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; + struct pci_dev *pdev = vf->pdev; - ifcvf_free_vq_irq(adapter); - ifcvf_free_config_irq(adapter); + ifcvf_free_vq_irq(vf); + ifcvf_free_config_irq(vf); ifcvf_free_irq_vectors(pdev); + vf->num_msix_vectors = 0; } /* ifcvf MSIX vectors allocator, this helper tries to allocate @@ -137,10 +133,9 @@ static void ifcvf_free_irq(struct ifcvf_adapter *adapter) * It returns the number of allocated vectors, negative * return value when fails. */ -static int ifcvf_alloc_vectors(struct ifcvf_adapter *adapter) +static int ifcvf_alloc_vectors(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int max_intr, ret; /* all queues and config interrupt */ @@ -160,10 +155,9 @@ static int ifcvf_alloc_vectors(struct ifcvf_adapter *adapter) return ret; } -static int ifcvf_request_per_vq_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_per_vq_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int i, vector, ret, irq; vf->vqs_reused_irq = -EINVAL; @@ -190,15 +184,14 @@ static int ifcvf_request_per_vq_irq(struct ifcvf_adapter *adapter) return 0; err: - ifcvf_free_irq(adapter); + ifcvf_free_irq(vf); return -EFAULT; } -static int ifcvf_request_vqs_reused_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_vqs_reused_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int i, vector, ret, irq; vector = 0; @@ -224,15 +217,14 @@ static int ifcvf_request_vqs_reused_irq(struct ifcvf_adapter *adapter) return 0; err: - ifcvf_free_irq(adapter); + ifcvf_free_irq(vf); return -EFAULT; } -static int ifcvf_request_dev_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_dev_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int i, vector, ret, irq; vector = 0; @@ -265,29 +257,27 @@ static int ifcvf_request_dev_irq(struct ifcvf_adapter *adapter) return 0; err: - ifcvf_free_irq(adapter); + ifcvf_free_irq(vf); return -EFAULT; } -static int ifcvf_request_vq_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_vq_irq(struct ifcvf_hw *vf) { - struct ifcvf_hw *vf = &adapter->vf; int ret; if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) - ret = ifcvf_request_per_vq_irq(adapter); + ret = ifcvf_request_per_vq_irq(vf); else - ret = ifcvf_request_vqs_reused_irq(adapter); + ret = ifcvf_request_vqs_reused_irq(vf); return ret; } -static int ifcvf_request_config_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_config_irq(struct ifcvf_hw *vf) { - struct pci_dev *pdev = adapter->pdev; - struct ifcvf_hw *vf = &adapter->vf; + struct pci_dev *pdev = vf->pdev; int config_vector, ret; if (vf->msix_vector_status == MSIX_VECTOR_PER_VQ_AND_CONFIG) @@ -320,17 +310,16 @@ static int ifcvf_request_config_irq(struct ifcvf_adapter *adapter) return 0; err: - ifcvf_free_irq(adapter); + ifcvf_free_irq(vf); return -EFAULT; } -static int ifcvf_request_irq(struct ifcvf_adapter *adapter) +static int ifcvf_request_irq(struct ifcvf_hw *vf) { - struct ifcvf_hw *vf = &adapter->vf; int nvectors, ret, max_intr; - nvectors = ifcvf_alloc_vectors(adapter); + nvectors = ifcvf_alloc_vectors(vf); if (nvectors <= 0) return -EFAULT; @@ -341,70 +330,25 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter) if (nvectors == 1) { vf->msix_vector_status = MSIX_VECTOR_DEV_SHARED; - ret = ifcvf_request_dev_irq(adapter); + ret = ifcvf_request_dev_irq(vf); return ret; } - ret = ifcvf_request_vq_irq(adapter); + ret = ifcvf_request_vq_irq(vf); if (ret) return ret; - ret = ifcvf_request_config_irq(adapter); + ret = ifcvf_request_config_irq(vf); if (ret) return ret; - return 0; -} - -static int ifcvf_start_datapath(void *private) -{ - struct ifcvf_hw *vf = ifcvf_private_to_vf(private); - u8 status; - int ret; - - ret = ifcvf_start_hw(vf); - if (ret < 0) { - status = ifcvf_get_status(vf); - status |= VIRTIO_CONFIG_S_FAILED; - ifcvf_set_status(vf, status); - } - - return ret; -} - -static int ifcvf_stop_datapath(void *private) -{ - struct ifcvf_hw *vf = ifcvf_private_to_vf(private); - int i; - - for (i = 0; i < vf->nr_vring; i++) - vf->vring[i].cb.callback = NULL; - - ifcvf_stop_hw(vf); + vf->num_msix_vectors = nvectors; return 0; } -static void ifcvf_reset_vring(struct ifcvf_adapter *adapter) -{ - struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter); - int i; - - for (i = 0; i < vf->nr_vring; i++) { - vf->vring[i].last_avail_idx = 0; - vf->vring[i].desc = 0; - vf->vring[i].avail = 0; - vf->vring[i].used = 0; - vf->vring[i].ready = 0; - vf->vring[i].cb.callback = NULL; - vf->vring[i].cb.private = NULL; - } - - ifcvf_reset(vf); -} - static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev) { return container_of(vdpa_dev, struct ifcvf_adapter, vdpa); @@ -414,7 +358,7 @@ static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev) { struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); - return &adapter->vf; + return adapter->vf; } static u64 ifcvf_vdpa_get_device_features(struct vdpa_device *vdpa_dev) @@ -426,7 +370,7 @@ static u64 ifcvf_vdpa_get_device_features(struct vdpa_device *vdpa_dev) u64 features; if (type == VIRTIO_ID_NET || type == VIRTIO_ID_BLOCK) - features = ifcvf_get_features(vf); + features = ifcvf_get_dev_features(vf); else { features = 0; IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type); @@ -444,7 +388,7 @@ static int ifcvf_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 feat if (ret) return ret; - vf->req_features = features; + ifcvf_set_driver_features(vf, features); return 0; } @@ -452,8 +396,11 @@ static int ifcvf_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 feat static u64 ifcvf_vdpa_get_driver_features(struct vdpa_device *vdpa_dev) { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + u64 features; + + features = ifcvf_get_driver_features(vf); - return vf->req_features; + return features; } static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev) @@ -465,13 +412,11 @@ static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev) static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) { - struct ifcvf_adapter *adapter; struct ifcvf_hw *vf; u8 status_old; int ret; vf = vdpa_to_vf(vdpa_dev); - adapter = vdpa_to_adapter(vdpa_dev); status_old = ifcvf_get_status(vf); if (status_old == status) @@ -479,18 +424,11 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) { - ret = ifcvf_request_irq(adapter); + ret = ifcvf_request_irq(vf); if (ret) { - status = ifcvf_get_status(vf); - status |= VIRTIO_CONFIG_S_FAILED; - ifcvf_set_status(vf, status); + IFCVF_ERR(vf->pdev, "failed to request irq with error %d\n", ret); return; } - - if (ifcvf_start_datapath(adapter) < 0) - IFCVF_ERR(adapter->pdev, - "Failed to set ifcvf vdpa status %u\n", - status); } ifcvf_set_status(vf, status); @@ -498,30 +436,29 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev) { - struct ifcvf_adapter *adapter; - struct ifcvf_hw *vf; - u8 status_old; - - vf = vdpa_to_vf(vdpa_dev); - adapter = vdpa_to_adapter(vdpa_dev); - status_old = ifcvf_get_status(vf); + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + u8 status = ifcvf_get_status(vf); - if (status_old == 0) - return 0; + ifcvf_stop(vf); - if (status_old & VIRTIO_CONFIG_S_DRIVER_OK) { - ifcvf_stop_datapath(adapter); - ifcvf_free_irq(adapter); - } + if (status & VIRTIO_CONFIG_S_DRIVER_OK) + ifcvf_free_irq(vf); - ifcvf_reset_vring(adapter); + ifcvf_reset(vf); return 0; } static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) { - return IFCVF_QUEUE_MAX; + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_get_max_vq_size(vf); +} + +static u16 ifcvf_vdpa_get_vq_num_min(struct vdpa_device *vdpa_dev) +{ + return IFCVF_MIN_VQ_SIZE; } static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, @@ -554,14 +491,14 @@ static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - vf->vring[qid].ready = ready; + ifcvf_set_vq_ready(vf, qid, ready); } static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid) { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - return vf->vring[qid].ready; + return ifcvf_get_vq_ready(vf, qid); } static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, @@ -569,7 +506,7 @@ static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - vf->vring[qid].size = num; + ifcvf_set_vq_num(vf, qid, num); } static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, @@ -578,11 +515,7 @@ static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - vf->vring[qid].desc = desc_area; - vf->vring[qid].avail = driver_area; - vf->vring[qid].used = device_area; - - return 0; + return ifcvf_set_vq_address(vf, qid, desc_area, driver_area, device_area); } static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid) @@ -669,6 +602,14 @@ static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, return -EINVAL; } +static u16 ifcvf_vdpa_get_vq_size(struct vdpa_device *vdpa_dev, + u16 qid) +{ + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + + return ifcvf_get_vq_size(vf, qid); +} + static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev, u16 idx) { @@ -696,6 +637,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .set_status = ifcvf_vdpa_set_status, .reset = ifcvf_vdpa_reset, .get_vq_num_max = ifcvf_vdpa_get_vq_num_max, + .get_vq_num_min = ifcvf_vdpa_get_vq_num_min, .get_vq_state = ifcvf_vdpa_get_vq_state, .set_vq_state = ifcvf_vdpa_set_vq_state, .set_vq_cb = ifcvf_vdpa_set_vq_cb, @@ -704,6 +646,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .set_vq_num = ifcvf_vdpa_set_vq_num, .set_vq_address = ifcvf_vdpa_set_vq_address, .get_vq_irq = ifcvf_vdpa_get_vq_irq, + .get_vq_size = ifcvf_vdpa_get_vq_size, .kick_vq = ifcvf_vdpa_kick_vq, .get_generation = ifcvf_vdpa_get_generation, .get_device_id = ifcvf_vdpa_get_device_id, @@ -755,17 +698,38 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, struct vdpa_device *vdpa_dev; struct pci_dev *pdev; struct ifcvf_hw *vf; + u64 device_features; int ret; ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); - if (!ifcvf_mgmt_dev->adapter) - return -EOPNOTSUPP; + vf = &ifcvf_mgmt_dev->vf; + pdev = vf->pdev; + adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, + &pdev->dev, &ifc_vdpa_ops, + NULL, 1, 1, NULL, false); + if (IS_ERR(adapter)) { + IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); + return PTR_ERR(adapter); + } - adapter = ifcvf_mgmt_dev->adapter; - vf = &adapter->vf; - pdev = adapter->pdev; + ifcvf_mgmt_dev->adapter = adapter; + adapter->pdev = pdev; + adapter->vdpa.vmap.dma_dev = &pdev->dev; + adapter->vdpa.mdev = mdev; + adapter->vf = vf; vdpa_dev = &adapter->vdpa; + device_features = vf->hw_features; + if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (config->device_features & ~device_features) { + IFCVF_ERR(pdev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n", + config->device_features, device_features); + return -EINVAL; + } + device_features &= config->device_features; + } + vf->dev_features = device_features; + if (name) ret = dev_set_name(&vdpa_dev->dev, "%s", name); else @@ -781,7 +745,6 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return 0; } - static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) { struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; @@ -800,7 +763,6 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; struct device *dev = &pdev->dev; - struct ifcvf_adapter *adapter; struct ifcvf_hw *vf; u32 dev_type; int ret, i; @@ -831,25 +793,21 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) } pci_set_master(pdev); - - adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, 1, 1, NULL, false); - if (IS_ERR(adapter)) { - IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); - return PTR_ERR(adapter); + ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); + if (!ifcvf_mgmt_dev) { + IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); + return -ENOMEM; } - vf = &adapter->vf; + vf = &ifcvf_mgmt_dev->vf; vf->dev_type = get_dev_type(pdev); vf->base = pcim_iomap_table(pdev); - - adapter->pdev = pdev; - adapter->vdpa.dma_dev = &pdev->dev; + vf->pdev = pdev; ret = ifcvf_init_hw(vf, pdev); if (ret) { IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); - return ret; + goto err; } for (i = 0; i < vf->nr_vring; i++) @@ -858,16 +816,6 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) vf->hw_features = ifcvf_get_hw_features(vf); vf->config_size = ifcvf_get_config_size(vf); - ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); - if (!ifcvf_mgmt_dev) { - IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); - return -ENOMEM; - } - - ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; - ifcvf_mgmt_dev->mdev.device = dev; - ifcvf_mgmt_dev->adapter = adapter; - dev_type = get_dev_type(pdev); switch (dev_type) { case VIRTIO_ID_NET: @@ -882,11 +830,11 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err; } + ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; + ifcvf_mgmt_dev->mdev.device = dev; ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring; ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features; - - adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; - + ifcvf_mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES); ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev); if (ret) { @@ -900,6 +848,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; err: + kfree(ifcvf_mgmt_dev->vf.vring); kfree(ifcvf_mgmt_dev); return ret; } @@ -910,6 +859,7 @@ static void ifcvf_remove(struct pci_dev *pdev) ifcvf_mgmt_dev = pci_get_drvdata(pdev); vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev); + kfree(ifcvf_mgmt_dev->vf.vring); kfree(ifcvf_mgmt_dev); } @@ -919,7 +869,9 @@ static struct pci_device_id ifcvf_pci_ids[] = { N3000_DEVICE_ID, PCI_VENDOR_ID_INTEL, N3000_SUBSYS_DEVICE_ID) }, - /* C5000X-PL network device */ + /* C5000X-PL network device + * F2000X-PL network device + */ { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, VIRTIO_TRANS_ID_NET, PCI_VENDOR_ID_INTEL, @@ -943,4 +895,5 @@ static struct pci_driver ifcvf_driver = { module_pci_driver(ifcvf_driver); +MODULE_DESCRIPTION("Intel IFC VF NIC driver for virtio dataplane offloading"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile index f717978c83bf..e791394c33e3 100644 --- a/drivers/vdpa/mlx5/Makefile +++ b/drivers/vdpa/mlx5/Makefile @@ -1,4 +1,4 @@ subdir-ccflags-y += -I$(srctree)/drivers/vdpa/mlx5/core obj-$(CONFIG_MLX5_VDPA_NET) += mlx5_vdpa.o -mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/mlx5_vnet.o core/resources.o core/mr.o +mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/mlx5_vnet.o core/resources.o core/mr.o net/debug.o diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 058fbe28107e..2cedf7e2dbc4 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -31,11 +31,13 @@ struct mlx5_vdpa_mr { struct list_head head; unsigned long num_directs; unsigned long num_klms; - bool initialized; - /* serialize mkey creation and destruction */ - struct mutex mkey_mtx; + struct vhost_iotlb *iotlb; + bool user_mr; + + refcount_t refcount; + struct list_head mr_list; }; struct mlx5_vdpa_resources { @@ -73,17 +75,36 @@ struct mlx5_vdpa_wq_ent { enum { MLX5_VDPA_DATAVQ_GROUP, MLX5_VDPA_CVQ_GROUP, + MLX5_VDPA_DATAVQ_DESC_GROUP, MLX5_VDPA_NUMVQ_GROUPS }; enum { - MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS + MLX5_VDPA_NUM_AS = 2 +}; + +struct mlx5_vdpa_mr_resources { + struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; + unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; + + /* Pre-deletion mr list */ + struct list_head mr_list_head; + + /* Deferred mr list */ + struct list_head mr_gc_list_head; + struct workqueue_struct *wq_gc; + struct delayed_work gc_dwork_ent; + + struct mutex lock; + + atomic_t shutdown; }; struct mlx5_vdpa_dev { struct vdpa_device vdev; struct mlx5_core_dev *mdev; struct mlx5_vdpa_resources res; + struct mlx5_vdpa_mr_resources mres; u64 mlx_features; u64 actual_features; @@ -92,15 +113,25 @@ struct mlx5_vdpa_dev { u16 max_idx; u32 generation; - struct mlx5_vdpa_mr mr; struct mlx5_control_vq cvq; struct workqueue_struct *wq; - unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; + bool suspended; + + struct mlx5_async_ctx async_ctx; +}; + +struct mlx5_vdpa_async_cmd { + int err; + struct mlx5_async_work cb_work; + struct completion cmd_done; + + void *in; + size_t inlen; + + void *out; + size_t outlen; }; -int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid); -int mlx5_vdpa_dealloc_pd(struct mlx5_vdpa_dev *dev, u32 pdn, u16 uid); -int mlx5_vdpa_get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey); int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn); void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn); int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn); @@ -115,11 +146,31 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev); int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, int inlen); int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); -int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - bool *change_map, unsigned int asid); -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - unsigned int asid); -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev); +struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb); +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr); +void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr); +void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr, + unsigned int asid); +int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, + unsigned int asid); +int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev); +int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid); +int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int num_cmds); + +#define mlx5_vdpa_err(__dev, format, ...) \ + dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, __func__, __LINE__, \ + current->pid, ##__VA_ARGS__) + #define mlx5_vdpa_warn(__dev, format, ...) \ dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \ diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 0a1e0b0dc37e..8870a7169267 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -49,17 +49,23 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt) } } -static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) +struct mlx5_create_mkey_mem { + u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)]; + u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; + __be64 mtt[]; +}; + +struct mlx5_destroy_mkey_mem { + u8 out[MLX5_ST_SZ_BYTES(destroy_mkey_out)]; + u8 in[MLX5_ST_SZ_BYTES(destroy_mkey_in)]; +}; + +static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_create_mkey_mem *mem) { - int inlen; + void *in = &mem->in; void *mkc; - void *in; - int err; - - inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -76,18 +82,36 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct MLX5_SET(create_mkey_in, in, translations_octword_actual_size, get_octo_len(mr->end - mr->start, mr->log_size)); populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt)); - err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen); - kvfree(in); - if (err) { - mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n"); - return err; - } - return 0; + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); +} + +static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_create_mkey_mem *mem) +{ + u32 mkey_index = MLX5_GET(create_mkey_out, mem->out, mkey_index); + + mr->mr = mlx5_idx_to_mkey(mkey_index); +} + +static void fill_destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_destroy_mkey_mem *mem) +{ + void *in = &mem->in; + + MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mr->mr)); } static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { + if (!mr->mr) + return; + mlx5_vdpa_destroy_mkey(mvdev, mr->mr); } @@ -166,9 +190,12 @@ again: klm->bcount = cpu_to_be32(klm_bcount(dmr->end - dmr->start)); preve = dmr->end; } else { + u64 bcount = min_t(u64, dmr->start - preve, MAX_KLM_SIZE); + klm->key = cpu_to_be32(mvdev->res.null_mkey); - klm->bcount = cpu_to_be32(klm_bcount(dmr->start - preve)); - preve = dmr->start; + klm->bcount = cpu_to_be32(klm_bcount(bcount)); + preve += bcount; + goto again; } } @@ -179,6 +206,123 @@ static int klm_byte_size(int nklms) return 16 * ALIGN(nklms, 4); } +#define MLX5_VDPA_MTT_ALIGN 16 + +static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + struct mlx5_vdpa_async_cmd *cmds; + struct mlx5_vdpa_direct_mr *dmr; + int err = 0; + int i = 0; + + cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_create_mkey_mem *cmd_mem; + int mttlen, mttcount; + + mttlen = roundup(MLX5_ST_SZ_BYTES(mtt) * dmr->nsg, MLX5_VDPA_MTT_ALIGN); + mttcount = mttlen / sizeof(cmd_mem->mtt[0]); + cmd_mem = kvcalloc(1, struct_size(cmd_mem, mtt, mttcount), GFP_KERNEL); + if (!cmd_mem) { + err = -ENOMEM; + goto done; + } + + cmds[i].out = cmd_mem->out; + cmds[i].outlen = sizeof(cmd_mem->out); + cmds[i].in = cmd_mem->in; + cmds[i].inlen = struct_size(cmd_mem, mtt, mttcount); + + fill_create_direct_mr(mvdev, dmr, cmd_mem); + + i++; + } + + err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs); + if (err) { + + mlx5_vdpa_err(mvdev, "error issuing MTT mkey creation for direct mrs: %d\n", err); + goto done; + } + + i = 0; + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i++]; + struct mlx5_create_mkey_mem *cmd_mem; + + cmd_mem = container_of(cmd->out, struct mlx5_create_mkey_mem, out); + + if (!cmd->err) { + create_direct_mr_end(mvdev, dmr, cmd_mem); + } else { + err = err ? err : cmd->err; + mlx5_vdpa_err(mvdev, "error creating MTT mkey [0x%llx, 0x%llx]: %d\n", + dmr->start, dmr->end, cmd->err); + } + } + +done: + for (i = i-1; i >= 0; i--) { + struct mlx5_create_mkey_mem *cmd_mem; + + cmd_mem = container_of(cmds[i].out, struct mlx5_create_mkey_mem, out); + kvfree(cmd_mem); + } + + kvfree(cmds); + return err; +} + +DEFINE_FREE(free_cmds, struct mlx5_vdpa_async_cmd *, kvfree(_T)) +DEFINE_FREE(free_cmd_mem, struct mlx5_destroy_mkey_mem *, kvfree(_T)) + +static int destroy_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + struct mlx5_destroy_mkey_mem *cmd_mem __free(free_cmd_mem) = NULL; + struct mlx5_vdpa_async_cmd *cmds __free(free_cmds) = NULL; + struct mlx5_vdpa_direct_mr *dmr; + int err = 0; + int i = 0; + + cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(mr->num_directs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) + return -ENOMEM; + + list_for_each_entry(dmr, &mr->head, list) { + cmds[i].out = cmd_mem[i].out; + cmds[i].outlen = sizeof(cmd_mem[i].out); + cmds[i].in = cmd_mem[i].in; + cmds[i].inlen = sizeof(cmd_mem[i].in); + fill_destroy_direct_mr(mvdev, dmr, &cmd_mem[i]); + i++; + } + + err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs); + if (err) { + + mlx5_vdpa_err(mvdev, "error issuing MTT mkey deletion for direct mrs: %d\n", err); + return err; + } + + i = 0; + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i++]; + + dmr->mr = 0; + if (cmd->err) { + err = err ? err : cmd->err; + mlx5_vdpa_err(mvdev, "error deleting MTT mkey [0x%llx, 0x%llx]: %d\n", + dmr->start, dmr->end, cmd->err); + } + } + + return err; +} + static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { int inlen; @@ -227,21 +371,19 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr unsigned long lgcd = 0; int log_entity_size; unsigned long size; - u64 start = 0; int err; struct page *pg; unsigned int nsg; int sglen; - u64 pa; + u64 pa, offset; u64 paend; struct scatterlist *sg; - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); - map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) { + map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { size = maplen(map, mr); lgcd = gcd(lgcd, size); - start += size; } log_entity_size = ilog2(lgcd); @@ -255,8 +397,10 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr sg = mr->sg_head.sgl; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { - paend = map->addr + maplen(map, mr); - for (pa = map->addr; pa < paend; pa += sglen) { + offset = mr->start > map->start ? mr->start - map->start : 0; + pa = map->addr + offset; + paend = map->addr + offset + maplen(map, mr); + for (; pa < paend; pa += sglen) { pg = pfn_to_page(__phys_to_pfn(pa)); if (!sg) { mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n", @@ -279,14 +423,8 @@ done: goto err_map; } - err = create_direct_mr(mvdev, mr); - if (err) - goto err_direct; - return 0; -err_direct: - dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); err_map: sg_free_table(&mr->sg_head); return err; @@ -294,17 +432,20 @@ err_map: static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; destroy_direct_mr(mvdev, mr); dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); sg_free_table(&mr->sg_head); } -static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 perm, +static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr, + u64 start, + u64 size, + u8 perm, struct vhost_iotlb *iotlb) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; struct mlx5_vdpa_direct_mr *dmr; struct mlx5_vdpa_direct_mr *n; LIST_HEAD(tmp); @@ -354,9 +495,10 @@ err_alloc: * indirect memory key that provides access to the enitre address space given * by iotlb. */ -static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int create_user_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr, + struct vhost_iotlb *iotlb) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; struct mlx5_vdpa_direct_mr *dmr; struct mlx5_vdpa_direct_mr *n; struct vhost_iotlb_map *map; @@ -384,7 +526,7 @@ static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb LOG_MAX_KLM_SIZE); mr->num_klms += nnuls; } - err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb); + err = add_direct_chain(mvdev, mr, ps, pe - ps, pperm, iotlb); if (err) goto err_chain; } @@ -393,7 +535,11 @@ static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb pperm = map->perm; } } - err = add_direct_chain(mvdev, ps, pe - ps, pperm, iotlb); + err = add_direct_chain(mvdev, mr, ps, pe - ps, pperm, iotlb); + if (err) + goto err_chain; + + err = create_direct_keys(mvdev, mr); if (err) goto err_chain; @@ -450,20 +596,23 @@ static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) mlx5_vdpa_destroy_mkey(mvdev, mr->mkey); } -static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src) +static int dup_iotlb(struct vhost_iotlb *dst, struct vhost_iotlb *src) { struct vhost_iotlb_map *map; u64 start = 0, last = ULLONG_MAX; int err; + if (dst == src) + return -EINVAL; + if (!src) { - err = vhost_iotlb_add_range(mvdev->cvq.iotlb, start, last, start, VHOST_ACCESS_RW); + err = vhost_iotlb_add_range(dst, start, last, start, VHOST_ACCESS_RW); return err; } for (map = vhost_iotlb_itree_first(src, start, last); map; map = vhost_iotlb_itree_next(map, start, last)) { - err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, map->last, + err = vhost_iotlb_add_range(dst, map->start, map->last, map->addr, map->perm); if (err) return err; @@ -471,9 +620,9 @@ static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src) return 0; } -static void prune_iotlb(struct mlx5_vdpa_dev *mvdev) +static void prune_iotlb(struct vhost_iotlb *iotlb) { - vhost_iotlb_del_range(mvdev->cvq.iotlb, 0, ULLONG_MAX); + vhost_iotlb_del_range(iotlb, 0, ULLONG_MAX); } static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) @@ -482,6 +631,7 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr struct mlx5_vdpa_direct_mr *n; destroy_indirect_key(mvdev, mr); + destroy_direct_keys(mvdev, mr); list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { list_del_init(&dmr->list); unmap_direct_mr(mvdev, dmr); @@ -489,91 +639,282 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr } } -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) +static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; + if (WARN_ON(!mr)) + return; - mutex_lock(&mr->mkey_mtx); - if (!mr->initialized) - goto out; - - prune_iotlb(mvdev); if (mr->user_mr) destroy_user_mr(mvdev, mr); else destroy_dma_mr(mvdev, mr); - memset(mr, 0, sizeof(*mr)); - mr->initialized = false; -out: - mutex_unlock(&mr->mkey_mtx); + vhost_iotlb_free(mr->iotlb); + + list_del(&mr->mr_list); + + kfree(mr); +} + +/* There can be multiple .set_map() operations in quick succession. + * This large delay is a simple way to prevent the MR cleanup from blocking + * .set_map() MR creation in this scenario. + */ +#define MLX5_VDPA_MR_GC_TRIGGER_MS 2000 + +static void mlx5_vdpa_mr_gc_handler(struct work_struct *work) +{ + struct mlx5_vdpa_mr_resources *mres; + struct mlx5_vdpa_mr *mr, *tmp; + struct mlx5_vdpa_dev *mvdev; + + mres = container_of(work, struct mlx5_vdpa_mr_resources, gc_dwork_ent.work); + + if (atomic_read(&mres->shutdown)) { + mutex_lock(&mres->lock); + } else if (!mutex_trylock(&mres->lock)) { + queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent, + msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS)); + return; + } + + mvdev = container_of(mres, struct mlx5_vdpa_dev, mres); + + list_for_each_entry_safe(mr, tmp, &mres->mr_gc_list_head, mr_list) { + _mlx5_vdpa_destroy_mr(mvdev, mr); + } + + mutex_unlock(&mres->lock); +} + +static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + + if (!mr) + return; + + if (refcount_dec_and_test(&mr->refcount)) { + list_move_tail(&mr->mr_list, &mres->mr_gc_list_head); + queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent, + msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS)); + } +} + +void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + mutex_lock(&mvdev->mres.lock); + _mlx5_vdpa_put_mr(mvdev, mr); + mutex_unlock(&mvdev->mres.lock); +} + +static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + if (!mr) + return; + + refcount_inc(&mr->refcount); +} + +void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + mutex_lock(&mvdev->mres.lock); + _mlx5_vdpa_get_mr(mvdev, mr); + mutex_unlock(&mvdev->mres.lock); +} + +void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *new_mr, + unsigned int asid) +{ + struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid]; + + mutex_lock(&mvdev->mres.lock); + + _mlx5_vdpa_put_mr(mvdev, old_mr); + mvdev->mres.mr[asid] = new_mr; + + mutex_unlock(&mvdev->mres.lock); +} + +static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr *mr; + + mutex_lock(&mvdev->mres.lock); + + list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) { + + mlx5_vdpa_warn(mvdev, "mkey still alive after resource delete: " + "mr: %p, mkey: 0x%x, refcount: %u\n", + mr, mr->mkey, refcount_read(&mr->refcount)); + } + + mutex_unlock(&mvdev->mres.lock); + +} + +void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev) +{ + if (!mvdev->res.valid) + return; + + for (int i = 0; i < MLX5_VDPA_NUM_AS; i++) + mlx5_vdpa_update_mr(mvdev, NULL, i); + + prune_iotlb(mvdev->cvq.iotlb); + + mlx5_vdpa_show_mr_leaks(mvdev); } static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, - struct vhost_iotlb *iotlb, unsigned int asid) + struct mlx5_vdpa_mr *mr, + struct vhost_iotlb *iotlb) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; int err; - if (mr->initialized) - return 0; + if (iotlb) + err = create_user_mr(mvdev, mr, iotlb); + else + err = create_dma_mr(mvdev, mr); - if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { - if (iotlb) - err = create_user_mr(mvdev, iotlb); - else - err = create_dma_mr(mvdev, mr); + if (err) + return err; - if (err) - return err; + mr->iotlb = vhost_iotlb_alloc(0, 0); + if (!mr->iotlb) { + err = -ENOMEM; + goto err_mr; } - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) { - err = dup_iotlb(mvdev, iotlb); - if (err) - goto out_err; - } + err = dup_iotlb(mr->iotlb, iotlb); + if (err) + goto err_iotlb; + + list_add_tail(&mr->mr_list, &mvdev->mres.mr_list_head); - mr->initialized = true; return 0; -out_err: - if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { - if (iotlb) - destroy_user_mr(mvdev, mr); - else - destroy_dma_mr(mvdev, mr); - } +err_iotlb: + vhost_iotlb_free(mr->iotlb); + +err_mr: + if (iotlb) + destroy_user_mr(mvdev, mr); + else + destroy_dma_mr(mvdev, mr); return err; } -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - unsigned int asid) +struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb) { + struct mlx5_vdpa_mr *mr; int err; - mutex_lock(&mvdev->mr.mkey_mtx); - err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); - mutex_unlock(&mvdev->mr.mkey_mtx); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mvdev->mres.lock); + err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb); + mutex_unlock(&mvdev->mres.lock); + + if (err) + goto out_err; + + refcount_set(&mr->refcount, 1); + + return mr; + +out_err: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, + unsigned int asid) +{ + int err; + + if (mvdev->mres.group2asid[MLX5_VDPA_CVQ_GROUP] != asid) + return 0; + + spin_lock(&mvdev->cvq.iommu_lock); + + prune_iotlb(mvdev->cvq.iotlb); + err = dup_iotlb(mvdev->cvq.iotlb, iotlb); + + spin_unlock(&mvdev->cvq.iommu_lock); + return err; } -int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - bool *change_map, unsigned int asid) +int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; - int err = 0; + struct mlx5_vdpa_mr *mr; + + mr = mlx5_vdpa_create_mr(mvdev, NULL); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mlx5_vdpa_update_mr(mvdev, mr, 0); - *change_map = false; - mutex_lock(&mr->mkey_mtx); - if (mr->initialized) { - mlx5_vdpa_info(mvdev, "memory map update\n"); - *change_map = true; + return mlx5_vdpa_update_cvq_iotlb(mvdev, NULL, 0); +} + +int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +{ + if (asid >= MLX5_VDPA_NUM_AS) + return -EINVAL; + + mlx5_vdpa_update_mr(mvdev, NULL, asid); + + if (asid == 0 && MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_dma_mr(mvdev)) + mlx5_vdpa_warn(mvdev, "create DMA MR failed\n"); + } else { + mlx5_vdpa_update_cvq_iotlb(mvdev, NULL, asid); } - if (!*change_map) - err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); - mutex_unlock(&mr->mkey_mtx); - return err; + return 0; +} + +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + + mres->wq_gc = create_singlethread_workqueue("mlx5_vdpa_mr_gc"); + if (!mres->wq_gc) + return -ENOMEM; + + INIT_DELAYED_WORK(&mres->gc_dwork_ent, mlx5_vdpa_mr_gc_handler); + + mutex_init(&mres->lock); + + INIT_LIST_HEAD(&mres->mr_list_head); + INIT_LIST_HEAD(&mres->mr_gc_list_head); + + return 0; +} + +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + + if (!mres->wq_gc) + return; + + atomic_set(&mres->shutdown, 1); + + flush_delayed_work(&mres->gc_dwork_ent); + destroy_workqueue(mres->wq_gc); + mres->wq_gc = NULL; + mutex_destroy(&mres->lock); } diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 9800f9bec225..aeae31d0cefa 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -213,7 +213,7 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, return err; mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); - *mkey |= mlx5_idx_to_mkey(mkey_index); + *mkey = mlx5_idx_to_mkey(mkey_index); return 0; } @@ -233,6 +233,7 @@ static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev) if (!mvdev->cvq.iotlb) return -ENOMEM; + spin_lock_init(&mvdev->cvq.iommu_lock); vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock); return 0; @@ -255,7 +256,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) mlx5_vdpa_warn(mvdev, "resources already allocated\n"); return -EINVAL; } - mutex_init(&mvdev->mr.mkey_mtx); res->uar = mlx5_get_uars_page(mdev); if (IS_ERR(res->uar)) { err = PTR_ERR(res->uar); @@ -300,7 +300,6 @@ err_pd: err_uctx: mlx5_put_uars_page(mdev, res->uar); err_uars: - mutex_destroy(&mvdev->mr.mkey_mtx); return err; } @@ -317,6 +316,78 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) dealloc_pd(mvdev, res->pdn, res->uid); destroy_uctx(mvdev, res->uid); mlx5_put_uars_page(mvdev->mdev, res->uar); - mutex_destroy(&mvdev->mr.mkey_mtx); res->valid = false; } + +static void virtqueue_cmd_callback(int status, struct mlx5_async_work *context) +{ + struct mlx5_vdpa_async_cmd *cmd = + container_of(context, struct mlx5_vdpa_async_cmd, cb_work); + + cmd->err = mlx5_cmd_check(context->ctx->dev, status, cmd->in, cmd->out); + complete(&cmd->cmd_done); +} + +static int issue_async_cmd(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int issued, + int *completed) + +{ + struct mlx5_vdpa_async_cmd *cmd = &cmds[issued]; + int err; + +retry: + err = mlx5_cmd_exec_cb(&mvdev->async_ctx, + cmd->in, cmd->inlen, + cmd->out, cmd->outlen, + virtqueue_cmd_callback, + &cmd->cb_work); + if (err == -EBUSY) { + if (*completed < issued) { + /* Throttled by own commands: wait for oldest completion. */ + wait_for_completion(&cmds[*completed].cmd_done); + (*completed)++; + + goto retry; + } else { + /* Throttled by external commands: switch to sync api. */ + err = mlx5_cmd_exec(mvdev->mdev, + cmd->in, cmd->inlen, + cmd->out, cmd->outlen); + if (!err) + (*completed)++; + } + } + + return err; +} + +int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int num_cmds) +{ + int completed = 0; + int issued = 0; + int err = 0; + + for (int i = 0; i < num_cmds; i++) + init_completion(&cmds[i].cmd_done); + + while (issued < num_cmds) { + + err = issue_async_cmd(mvdev, cmds, issued, &completed); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing command %d of %d: %d\n", + issued, num_cmds, err); + break; + } + + issued++; + } + + while (completed < issued) + wait_for_completion(&cmds[completed++].cmd_done); + + return err; +} diff --git a/drivers/vdpa/mlx5/net/debug.c b/drivers/vdpa/mlx5/net/debug.c new file mode 100644 index 000000000000..9c85162c19fc --- /dev/null +++ b/drivers/vdpa/mlx5/net/debug.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <linux/debugfs.h> +#include <linux/mlx5/fs.h> +#include "mlx5_vnet.h" + +static int tirn_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_net *ndev = file->private; + + seq_printf(file, "0x%x\n", ndev->res.tirn); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(tirn); + +void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev) +{ + if (ndev->debugfs) + debugfs_remove(ndev->res.tirn_dent); +} + +void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev) +{ + ndev->res.tirn_dent = debugfs_create_file("tirn", 0444, ndev->rx_dent, + ndev, &tirn_fops); +} + +static int rx_flow_table_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_net *ndev = file->private; + + seq_printf(file, "0x%x\n", mlx5_flow_table_id(ndev->rxft)); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(rx_flow_table); + +void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev) +{ + if (ndev->debugfs) + debugfs_remove(ndev->rx_table_dent); +} + +void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev) +{ + ndev->rx_table_dent = debugfs_create_file("table_id", 0444, ndev->rx_dent, + ndev, &rx_flow_table_fops); +} + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) +static int packets_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_counter *counter = file->private; + u64 packets; + u64 bytes; + int err; + + err = mlx5_fc_query(counter->mdev, counter->counter, &packets, &bytes); + if (err) + return err; + + seq_printf(file, "0x%llx\n", packets); + return 0; +} + +static int bytes_show(struct seq_file *file, void *priv) +{ + struct mlx5_vdpa_counter *counter = file->private; + u64 packets; + u64 bytes; + int err; + + err = mlx5_fc_query(counter->mdev, counter->counter, &packets, &bytes); + if (err) + return err; + + seq_printf(file, "0x%llx\n", bytes); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(packets); +DEFINE_SHOW_ATTRIBUTE(bytes); + +static void add_counter_node(struct mlx5_vdpa_counter *counter, + struct dentry *parent) +{ + debugfs_create_file("packets", 0444, parent, counter, + &packets_fops); + debugfs_create_file("bytes", 0444, parent, counter, + &bytes_fops); +} + +void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ + static const char *ut = "untagged"; + char vidstr[9]; + u16 vid; + + node->ucast_counter.mdev = ndev->mvdev.mdev; + node->mcast_counter.mdev = ndev->mvdev.mdev; + if (node->tagged) { + vid = key2vid(node->macvlan); + snprintf(vidstr, sizeof(vidstr), "0x%x", vid); + } else { + strcpy(vidstr, ut); + } + + node->dent = debugfs_create_dir(vidstr, ndev->rx_dent); + if (IS_ERR(node->dent)) { + node->dent = NULL; + return; + } + + node->ucast_counter.dent = debugfs_create_dir("ucast", node->dent); + if (IS_ERR(node->ucast_counter.dent)) + return; + + add_counter_node(&node->ucast_counter, node->ucast_counter.dent); + + node->mcast_counter.dent = debugfs_create_dir("mcast", node->dent); + if (IS_ERR(node->mcast_counter.dent)) + return; + + add_counter_node(&node->mcast_counter, node->mcast_counter.dent); +} + +void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ + if (node->dent && ndev->debugfs) + debugfs_remove_recursive(node->dent); +} +#endif + +void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_core_dev *mdev; + + mdev = ndev->mvdev.mdev; + ndev->debugfs = debugfs_create_dir(dev_name(&ndev->mvdev.vdev.dev), + mlx5_debugfs_get_dev_root(mdev)); + if (!IS_ERR(ndev->debugfs)) + ndev->rx_dent = debugfs_create_dir("rx", ndev->debugfs); +} + +void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev) +{ + debugfs_remove_recursive(ndev->debugfs); + ndev->debugfs = NULL; +} diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 3a6dbbc6440d..ddaa1366704b 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -7,6 +7,7 @@ #include <uapi/linux/virtio_net.h> #include <uapi/linux/virtio_ids.h> #include <uapi/linux/vdpa.h> +#include <uapi/linux/vhost_types.h> #include <linux/virtio_config.h> #include <linux/auxiliary_bus.h> #include <linux/mlx5/cq.h> @@ -18,15 +19,12 @@ #include <linux/mlx5/mlx5_ifc_vdpa.h> #include <linux/mlx5/mpfs.h> #include "mlx5_vdpa.h" +#include "mlx5_vnet.h" MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); MODULE_DESCRIPTION("Mellanox VDPA driver"); MODULE_LICENSE("Dual BSD/GPL"); -#define to_mlx5_vdpa_ndev(__mvdev) \ - container_of(__mvdev, struct mlx5_vdpa_net, mvdev) -#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) - #define VALID_FEATURES_MASK \ (BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | \ BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) | \ @@ -50,13 +48,17 @@ MODULE_LICENSE("Dual BSD/GPL"); #define MLX5V_UNTAGGED 0x1000 -struct mlx5_vdpa_net_resources { - u32 tisn; - u32 tdn; - u32 tirn; - u32 rqtn; - bool valid; -}; +/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section + * 5.1.6.5.5 "Device operation in multiqueue mode": + * + * Multiqueue is disabled by default. + * The driver enables multiqueue by sending a command using class + * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue + * operation, as follows: ... + */ +#define MLX5V_DEFAULT_VQ_COUNT 2 + +#define MLX5V_DEFAULT_VQ_SIZE 256 struct mlx5_vdpa_cq_buf { struct mlx5_frag_buf_ctrl fbc; @@ -94,6 +96,7 @@ struct mlx5_vq_restore_info { u64 driver_addr; u16 avail_index; u16 used_index; + struct msi_map map; bool ready; bool restore; }; @@ -130,6 +133,13 @@ struct mlx5_vdpa_virtqueue { u16 used_idx; int fw_state; + u64 modified_fields; + + struct mlx5_vdpa_mr *vq_mr; + struct mlx5_vdpa_mr *desc_mr; + + struct msi_map map; + /* keep last in the struct */ struct mlx5_vq_restore_info ri; }; @@ -146,47 +156,14 @@ static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx) return idx <= mvdev->max_idx; } -#define MLX5V_MACVLAN_SIZE 256 - -struct mlx5_vdpa_net { - struct mlx5_vdpa_dev mvdev; - struct mlx5_vdpa_net_resources res; - struct virtio_net_config config; - struct mlx5_vdpa_virtqueue *vqs; - struct vdpa_callback *event_cbs; - - /* Serialize vq resources creation and destruction. This is required - * since memory map might change and we need to destroy and create - * resources while driver in operational. - */ - struct rw_semaphore reslock; - struct mlx5_flow_table *rxft; - bool setup; - u32 cur_num_vqs; - u32 rqt_size; - bool nb_registered; - struct notifier_block nb; - struct vdpa_callback config_cb; - struct mlx5_vdpa_wq_ent cvq_ent; - struct hlist_head macvlan_hash[MLX5V_MACVLAN_SIZE]; -}; - -struct macvlan_node { - struct hlist_node hlist; - struct mlx5_flow_handle *ucast_rule; - struct mlx5_flow_handle *mcast_rule; - u64 macvlan; -}; - -static void free_resources(struct mlx5_vdpa_net *ndev); -static void init_mvqs(struct mlx5_vdpa_net *ndev); -static int setup_driver(struct mlx5_vdpa_dev *mvdev); -static void teardown_driver(struct mlx5_vdpa_net *ndev); +static void free_fixed_resources(struct mlx5_vdpa_net *ndev); +static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev); +static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled); +static void teardown_vq_resources(struct mlx5_vdpa_net *ndev); +static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq); static bool mlx5_vdpa_debug; -#define MLX5_CVQ_MAX_ENT 16 - #define MLX5_LOG_VIO_FLAG(_feature) \ do { \ if (features & BIT_ULL(_feature)) \ @@ -596,6 +573,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) vcq->mcq.set_ci_db = vcq->db.db; vcq->mcq.arm_db = vcq->db.db + 1; vcq->mcq.cqe_sz = 64; + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); if (err) @@ -621,7 +600,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) /* Use vector 0 by default. Consider adding code to choose least used * vector. */ - err = mlx5_vector2eqn(mdev, 0, &eqn); + err = mlx5_comp_eqn_get(mdev, 0, &eqn); if (err) goto err_vec; @@ -635,10 +614,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) if (err) goto err_vec; - vcq->mcq.comp = mlx5_vdpa_cq_comp; - vcq->cqe = num_ent; - vcq->mcq.set_ci_db = vcq->db.db; - vcq->mcq.arm_db = vcq->db.db + 1; mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); kfree(in); return 0; @@ -666,30 +641,70 @@ static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx) mlx5_db_free(ndev->mvdev.mdev, &vcq->db); } +static int read_umem_params(struct mlx5_vdpa_net *ndev) +{ + u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01); + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; + int out_size; + void *caps; + void *out; + int err; + + out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); + out = kzalloc(out_size, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); + if (err) { + mlx5_vdpa_warn(&ndev->mvdev, + "Failed reading vdpa umem capabilities with err %d\n", err); + goto out; + } + + caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability); + + ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a); + ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b); + + ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a); + ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b); + + ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a); + ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b); + +out: + kfree(out); + return 0; +} + static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, struct mlx5_vdpa_umem **umemp) { - struct mlx5_core_dev *mdev = ndev->mvdev.mdev; - int p_a; - int p_b; + u32 p_a; + u32 p_b; switch (num) { case 1: - p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a); - p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b); + p_a = ndev->umem_1_buffer_param_a; + p_b = ndev->umem_1_buffer_param_b; *umemp = &mvq->umem1; break; case 2: - p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a); - p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b); + p_a = ndev->umem_2_buffer_param_a; + p_b = ndev->umem_2_buffer_param_b; *umemp = &mvq->umem2; break; case 3: - p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a); - p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b); + p_a = ndev->umem_3_buffer_param_a; + p_b = ndev->umem_3_buffer_param_b; *umemp = &mvq->umem3; break; } + (*umemp)->size = p_a * mvq->num_ent + p_b; } @@ -821,12 +836,28 @@ static bool vq_is_tx(u16 idx) return idx % 2; } -static u16 get_features_12_3(u64 features) +enum { + MLX5_VIRTIO_NET_F_MRG_RXBUF = 2, + MLX5_VIRTIO_NET_F_HOST_ECN = 4, + MLX5_VIRTIO_NET_F_GUEST_ECN = 6, + MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7, + MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8, + MLX5_VIRTIO_NET_F_GUEST_CSUM = 9, + MLX5_VIRTIO_NET_F_CSUM = 10, + MLX5_VIRTIO_NET_F_HOST_TSO6 = 11, + MLX5_VIRTIO_NET_F_HOST_TSO4 = 12, +}; + +static u16 get_features(u64 features) { - return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) | - (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) | - (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) | - (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6); + return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) | + (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4); } static bool counters_supported(const struct mlx5_vdpa_dev *mvdev) @@ -835,11 +866,25 @@ static bool counters_supported(const struct mlx5_vdpa_dev *mvdev) BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); } -static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev) +{ + return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) & + (1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) && + pci_msix_can_alloc_dyn(mvdev->mdev->pdev); +} + +static int create_virtqueue(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + bool filled) { int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {}; + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_vdpa_mr *vq_mr; + struct mlx5_vdpa_mr *vq_desc_mr; + u64 features = filled ? mvdev->actual_features : mvdev->mlx_features; void *obj_context; + u16 mlx_features; void *cmd_hdr; void *vq_ctx; void *in; @@ -855,6 +900,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque goto err_alloc; } + mlx_features = get_features(features); cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); @@ -862,26 +908,58 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context); - MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); - MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, - get_features_12_3(ndev->mvdev.actual_features)); + mlx_features >> 3); + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0, + mlx_features & 7); vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev)); if (vq_is_tx(mvq->index)) MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn); - MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); + if (mvq->map.virq) { + MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE); + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index); + } else { + MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); + } + MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index); - MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent); MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, - !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1))); - MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); - MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); - MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); - MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey); + !!(features & BIT_ULL(VIRTIO_F_VERSION_1))); + + if (filled) { + MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); + MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); + + MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); + MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); + MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); + + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + if (vq_mr) + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey); + + vq_desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + if (vq_desc_mr && + MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) + MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey); + } else { + /* If there is no mr update, make sure that the existing ones are set + * modify to ready. + */ + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + if (vq_mr) + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY; + + vq_desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + if (vq_desc_mr) + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; + } + MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); @@ -900,6 +978,17 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque kfree(in); mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + if (filled) { + mlx5_vdpa_get_mr(mvdev, vq_mr); + mvq->vq_mr = vq_mr; + + if (vq_desc_mr && + MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) { + mlx5_vdpa_get_mr(mvdev, vq_desc_mr); + mvq->desc_mr = vq_desc_mr; + } + } + return 0; err_cmd: @@ -926,6 +1015,12 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq } mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; umems_destroy(ndev, mvq); + + mlx5_vdpa_put_mr(&ndev->mvdev, mvq->vq_mr); + mvq->vq_mr = NULL; + + mlx5_vdpa_put_mr(&ndev->mvdev, mvq->desc_mr); + mvq->desc_mr = NULL; } static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) @@ -1087,44 +1182,101 @@ struct mlx5_virtq_attr { u16 used_index; }; -static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, - struct mlx5_virtq_attr *attr) -{ - int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out); - u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {}; - void *out; - void *obj_context; - void *cmd_hdr; - int err; +struct mlx5_virtqueue_query_mem { + u8 in[MLX5_ST_SZ_BYTES(query_virtio_net_q_in)]; + u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)]; +}; - out = kzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; +struct mlx5_virtqueue_modify_mem { + u8 in[MLX5_ST_SZ_BYTES(modify_virtio_net_q_in)]; + u8 out[MLX5_ST_SZ_BYTES(modify_virtio_net_q_out)]; +}; - cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr); +static void fill_query_virtqueue_cmd(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + struct mlx5_virtqueue_query_mem *cmd) +{ + void *cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, cmd->in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); - err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen); - if (err) - goto err_cmd; +} + +static void query_virtqueue_end(struct mlx5_vdpa_net *ndev, + struct mlx5_virtqueue_query_mem *cmd, + struct mlx5_virtq_attr *attr) +{ + void *obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, cmd->out, obj_context); - obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context); memset(attr, 0, sizeof(*attr)); attr->state = MLX5_GET(virtio_net_q_object, obj_context, state); attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index); attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index); - kfree(out); - return 0; +} -err_cmd: - kfree(out); +static int query_virtqueues(struct mlx5_vdpa_net *ndev, + int start_vq, + int num_vqs, + struct mlx5_virtq_attr *attrs) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_virtqueue_query_mem *cmd_mem; + struct mlx5_vdpa_async_cmd *cmds; + int err = 0; + + WARN(start_vq + num_vqs > mvdev->max_vqs, "query vq range invalid [%d, %d), max_vqs: %u\n", + start_vq, start_vq + num_vqs, mvdev->max_vqs); + + cmds = kvcalloc(num_vqs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(num_vqs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) { + err = -ENOMEM; + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + cmds[i].in = &cmd_mem[i].in; + cmds[i].inlen = sizeof(cmd_mem[i].in); + cmds[i].out = &cmd_mem[i].out; + cmds[i].outlen = sizeof(cmd_mem[i].out); + fill_query_virtqueue_cmd(ndev, &ndev->vqs[start_vq + i], &cmd_mem[i]); + } + + err = mlx5_vdpa_exec_async_cmds(&ndev->mvdev, cmds, num_vqs); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing query cmd for vq range [%d, %d): %d\n", + start_vq, start_vq + num_vqs, err); + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + int vq_idx = start_vq + i; + + if (cmd->err) { + mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, cmd->err); + if (!err) + err = cmd->err; + continue; + } + + query_virtqueue_end(ndev, &cmd_mem[i], &attrs[i]); + } + +done: + kvfree(cmd_mem); + kvfree(cmds); return err; } -static bool is_valid_state_change(int oldstate, int newstate) +static bool is_resumable(struct mlx5_vdpa_net *ndev) +{ + return ndev->mvdev.vdev.config->resume; +} + +static bool is_valid_state_change(int oldstate, int newstate, bool resumable) { switch (oldstate) { case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: @@ -1132,48 +1284,122 @@ static bool is_valid_state_change(int oldstate, int newstate) case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: + return resumable ? newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY : false; case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR: default: return false; } } -static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) +static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq) { - int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); - u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; - void *obj_context; - void *cmd_hdr; - void *in; - int err; + /* Only state is always modifiable */ + if (mvq->modified_fields & ~MLX5_VIRTQ_MODIFY_MASK_STATE) + return mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT || + mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; - if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) - return 0; - - if (!is_valid_state_change(mvq->fw_state, state)) - return -EINVAL; + return true; +} - in = kzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; +static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + int state, + struct mlx5_virtqueue_modify_mem *cmd) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_vdpa_mr *desc_mr = NULL; + struct mlx5_vdpa_mr *vq_mr = NULL; + void *obj_context; + void *cmd_hdr; + void *vq_ctx; - cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr); + cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); - obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); - MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, - MLX5_VIRTQ_MODIFY_MASK_STATE); - MLX5_SET(virtio_net_q_object, obj_context, state, state); - err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); - kfree(in); - if (!err) + obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, obj_context); + vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) + MLX5_SET(virtio_net_q_object, obj_context, state, state); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) { + MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); + MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); + MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX) + MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX) + MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION) + MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, + !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1))); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES) { + u16 mlx_features = get_features(ndev->mvdev.actual_features); + + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, + mlx_features >> 3); + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0, + mlx_features & 7); + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + + if (vq_mr) + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey); + else + mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY; + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { + desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + + if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) + MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey); + else + mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; + } + + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); +} + +static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + int state) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { + unsigned int asid = mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]; + struct mlx5_vdpa_mr *vq_mr = mvdev->mres.mr[asid]; + + mlx5_vdpa_put_mr(mvdev, mvq->vq_mr); + mlx5_vdpa_get_mr(mvdev, vq_mr); + mvq->vq_mr = vq_mr; + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { + unsigned int asid = mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]; + struct mlx5_vdpa_mr *desc_mr = mvdev->mres.mr[asid]; + + mlx5_vdpa_put_mr(mvdev, mvq->desc_mr); + mlx5_vdpa_get_mr(mvdev, desc_mr); + mvq->desc_mr = desc_mr; + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) mvq->fw_state = state; - return err; + mvq->modified_fields = 0; } static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) @@ -1217,13 +1443,62 @@ static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_vir mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id); } -static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv) { - u16 idx = mvq->index; + struct vdpa_callback *cb = priv; + + if (cb->callback) + return cb->callback(cb->private); + + return IRQ_HANDLED; +} + +static void alloc_vector(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp; + struct mlx5_vdpa_irq_pool_entry *ent; int err; + int i; - if (!mvq->num_ent) - return 0; + for (i = 0; i < irqp->num_ent; i++) { + ent = &irqp->entries[i]; + if (!ent->used) { + snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d", + dev_name(&ndev->mvdev.vdev.dev), mvq->index); + ent->dev_id = &ndev->event_cbs[mvq->index]; + err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0, + ent->name, ent->dev_id); + if (err) + return; + + ent->used = true; + mvq->map = ent->map; + return; + } + } +} + +static void dealloc_vector(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq) +{ + struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp; + int i; + + for (i = 0; i < irqp->num_ent; i++) + if (mvq->map.virq == irqp->entries[i].map.virq) { + free_irq(mvq->map.virq, irqp->entries[i].dev_id); + irqp->entries[i].used = false; + return; + } +} + +static int setup_vq(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + bool filled) +{ + u16 idx = mvq->index; + int err; if (mvq->initialized) return 0; @@ -1246,27 +1521,29 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) err = counter_set_alloc(ndev, mvq); if (err) - goto err_counter; + goto err_connect; - err = create_virtqueue(ndev, mvq); + alloc_vector(ndev, mvq); + err = create_virtqueue(ndev, mvq, filled); if (err) - goto err_connect; + goto err_vq; + + mvq->initialized = true; if (mvq->ready) { - err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); - if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n", - idx, err); - goto err_connect; - } + err = resume_vq(ndev, mvq); + if (err) + goto err_modify; } - mvq->initialized = true; return 0; -err_connect: +err_modify: + destroy_virtqueue(ndev, mvq); +err_vq: + dealloc_vector(ndev, mvq); counter_set_dealloc(ndev, mvq); -err_counter: +err_connect: qp_destroy(ndev, &mvq->vqqp); err_vqqp: qp_destroy(ndev, &mvq->fwqp); @@ -1275,33 +1552,171 @@ err_fwqp: return err; } -static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +static int modify_virtqueues(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs, int state) { - struct mlx5_virtq_attr attr; + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_virtqueue_modify_mem *cmd_mem; + struct mlx5_vdpa_async_cmd *cmds; + int err = 0; + + WARN(start_vq + num_vqs > mvdev->max_vqs, "modify vq range invalid [%d, %d), max_vqs: %u\n", + start_vq, start_vq + num_vqs, mvdev->max_vqs); + + cmds = kvcalloc(num_vqs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(num_vqs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) { + err = -ENOMEM; + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + struct mlx5_vdpa_virtqueue *mvq; + int vq_idx = start_vq + i; + mvq = &ndev->vqs[vq_idx]; + + if (!modifiable_virtqueue_fields(mvq)) { + err = -EINVAL; + goto done; + } + + if (mvq->fw_state != state) { + if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) { + err = -EINVAL; + goto done; + } + + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE; + } + + cmd->in = &cmd_mem[i].in; + cmd->inlen = sizeof(cmd_mem[i].in); + cmd->out = &cmd_mem[i].out; + cmd->outlen = sizeof(cmd_mem[i].out); + fill_modify_virtqueue_cmd(ndev, mvq, state, &cmd_mem[i]); + } + + err = mlx5_vdpa_exec_async_cmds(&ndev->mvdev, cmds, num_vqs); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing modify cmd for vq range [%d, %d)\n", + start_vq, start_vq + num_vqs); + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + struct mlx5_vdpa_virtqueue *mvq; + int vq_idx = start_vq + i; + + mvq = &ndev->vqs[vq_idx]; + + if (cmd->err) { + mlx5_vdpa_err(mvdev, "modify vq %d failed, state: %d -> %d, err: %d\n", + vq_idx, mvq->fw_state, state, err); + if (!err) + err = cmd->err; + continue; + } + + modify_virtqueue_end(ndev, mvq, state); + } + +done: + kvfree(cmd_mem); + kvfree(cmds); + return err; +} + +static int suspend_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs) +{ + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_virtq_attr *attrs; + int vq_idx, i; + int err; + + if (start_vq >= ndev->cur_num_vqs) + return -EINVAL; + + mvq = &ndev->vqs[start_vq]; if (!mvq->initialized) - return; + return 0; if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) - return; + return 0; - if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) - mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n"); + err = modify_virtqueues(ndev, start_vq, num_vqs, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); + if (err) + return err; - if (query_virtqueue(ndev, mvq, &attr)) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n"); - return; + attrs = kcalloc(num_vqs, sizeof(struct mlx5_virtq_attr), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + err = query_virtqueues(ndev, start_vq, num_vqs, attrs); + if (err) + goto done; + + for (i = 0, vq_idx = start_vq; i < num_vqs; i++, vq_idx++) { + mvq = &ndev->vqs[vq_idx]; + mvq->avail_idx = attrs[i].available_index; + mvq->used_idx = attrs[i].used_index; } - mvq->avail_idx = attr.available_index; - mvq->used_idx = attr.used_index; + +done: + kfree(attrs); + return err; } -static void suspend_vqs(struct mlx5_vdpa_net *ndev) +static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { - int i; + return suspend_vqs(ndev, mvq->index, 1); +} - for (i = 0; i < ndev->mvdev.max_vqs; i++) - suspend_vq(ndev, &ndev->vqs[i]); +static int resume_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs) +{ + struct mlx5_vdpa_virtqueue *mvq; + int err; + + if (start_vq >= ndev->mvdev.max_vqs) + return -EINVAL; + + mvq = &ndev->vqs[start_vq]; + if (!mvq->initialized) + return 0; + + if (mvq->index >= ndev->cur_num_vqs) + return 0; + + switch (mvq->fw_state) { + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: + /* Due to a FW quirk we need to modify the VQ fields first then change state. + * This should be fixed soon. After that, a single command can be used. + */ + err = modify_virtqueues(ndev, start_vq, num_vqs, mvq->fw_state); + if (err) + return err; + break; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: + if (!is_resumable(ndev)) { + mlx5_vdpa_warn(&ndev->mvdev, "vq %d is not resumable\n", mvq->index); + return -EINVAL; + } + break; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: + return 0; + default: + mlx5_vdpa_err(&ndev->mvdev, "resume vq %u called from bad state %d\n", + mvq->index, mvq->fw_state); + return -EINVAL; + } + + return modify_virtqueues(ndev, start_vq, num_vqs, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); +} + +static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + return resume_vqs(ndev, mvq->index, 1); } static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) @@ -1310,7 +1725,9 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue * return; suspend_vq(ndev, mvq); + mvq->modified_fields = 0; destroy_virtqueue(ndev, mvq); + dealloc_vector(ndev, mvq); counter_set_dealloc(ndev, mvq); qp_destroy(ndev, &mvq->vqqp); qp_destroy(ndev, &mvq->fwqp); @@ -1431,36 +1848,85 @@ static int create_tir(struct mlx5_vdpa_net *ndev) err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn); kfree(in); + if (err) + return err; + + mlx5_vdpa_add_tirn(ndev); return err; } static void destroy_tir(struct mlx5_vdpa_net *ndev) { + mlx5_vdpa_remove_tirn(ndev); mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn); } #define MAX_STEERING_ENT 0x8000 #define MAX_STEERING_GROUPS 2 +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + #define NUM_DESTS 2 +#else + #define NUM_DESTS 1 +#endif + +static int add_steering_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dests) +{ +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + int err; + + node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false); + if (IS_ERR(node->ucast_counter.counter)) + return PTR_ERR(node->ucast_counter.counter); + + node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false); + if (IS_ERR(node->mcast_counter.counter)) { + err = PTR_ERR(node->mcast_counter.counter); + goto err_mcast_counter; + } + + dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + return 0; + +err_mcast_counter: + mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter); + return err; +#else + return 0; +#endif +} + +static void remove_steering_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) +{ +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter); + mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter); +#endif +} + static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, - u16 vid, bool tagged, - struct mlx5_flow_handle **ucast, - struct mlx5_flow_handle **mcast) + struct macvlan_node *node) { - struct mlx5_flow_destination dest = {}; + struct mlx5_flow_destination dests[NUM_DESTS] = {}; struct mlx5_flow_act flow_act = {}; - struct mlx5_flow_handle *rule; struct mlx5_flow_spec *spec; void *headers_c; void *headers_v; u8 *dmac_c; u8 *dmac_v; int err; + u16 vid; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) return -ENOMEM; + vid = key2vid(node->macvlan); spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); @@ -1472,44 +1938,58 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid); } - if (tagged) { + if (node->tagged) { MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid); } flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; - dest.tir_num = ndev->res.tirn; - rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, &dest, 1); - if (IS_ERR(rule)) - return PTR_ERR(rule); + dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dests[0].tir_num = ndev->res.tirn; + err = add_steering_counters(ndev, node, &flow_act, dests); + if (err) + goto out_free; + +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + dests[1].counter = node->ucast_counter.counter; +#endif + node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS); + if (IS_ERR(node->ucast_rule)) { + err = PTR_ERR(node->ucast_rule); + goto err_ucast; + } - *ucast = rule; +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + dests[1].counter = node->mcast_counter.counter; +#endif memset(dmac_c, 0, ETH_ALEN); memset(dmac_v, 0, ETH_ALEN); dmac_c[0] = 1; dmac_v[0] = 1; - rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, &dest, 1); - kvfree(spec); - if (IS_ERR(rule)) { - err = PTR_ERR(rule); + node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS); + if (IS_ERR(node->mcast_rule)) { + err = PTR_ERR(node->mcast_rule); goto err_mcast; } - - *mcast = rule; + kvfree(spec); + mlx5_vdpa_add_rx_counters(ndev, node); return 0; err_mcast: - mlx5_del_flow_rules(*ucast); + mlx5_del_flow_rules(node->ucast_rule); +err_ucast: + remove_steering_counters(ndev, node); +out_free: + kvfree(spec); return err; } static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev, - struct mlx5_flow_handle *ucast, - struct mlx5_flow_handle *mcast) + struct macvlan_node *node) { - mlx5_del_flow_rules(ucast); - mlx5_del_flow_rules(mcast); + mlx5_vdpa_remove_rx_counters(ndev, node); + mlx5_del_flow_rules(node->ucast_rule); + mlx5_del_flow_rules(node->mcast_rule); } static u64 search_val(u8 *mac, u16 vlan, bool tagged) @@ -1543,14 +2023,14 @@ static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 valu return NULL; } -static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged) // vlan -> vid +static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged) { struct macvlan_node *ptr; u64 val; u32 idx; int err; - val = search_val(mac, vlan, tagged); + val = search_val(mac, vid, tagged); if (mac_vlan_lookup(ndev, val)) return -EEXIST; @@ -1558,12 +2038,13 @@ static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagg if (!ptr) return -ENOMEM; - err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, vlan, tagged, - &ptr->ucast_rule, &ptr->mcast_rule); + ptr->tagged = tagged; + ptr->macvlan = val; + ptr->ndev = ndev; + err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr); if (err) goto err_add; - ptr->macvlan = val; idx = hash_64(val, 8); hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]); return 0; @@ -1582,7 +2063,8 @@ static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tag return; hlist_del(&ptr->hlist); - mlx5_vdpa_del_mac_vlan_rules(ndev, ptr->ucast_rule, ptr->mcast_rule); + mlx5_vdpa_del_mac_vlan_rules(ndev, ptr); + remove_steering_counters(ndev, ptr); kfree(ptr); } @@ -1595,7 +2077,8 @@ static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev) for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) { hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) { hlist_del(&pos->hlist); - mlx5_vdpa_del_mac_vlan_rules(ndev, pos->ucast_rule, pos->mcast_rule); + mlx5_vdpa_del_mac_vlan_rules(ndev, pos); + remove_steering_counters(ndev, pos); kfree(pos); } } @@ -1612,15 +2095,16 @@ static int setup_steering(struct mlx5_vdpa_net *ndev) ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS); if (!ns) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n"); + mlx5_vdpa_err(&ndev->mvdev, "failed to get flow namespace\n"); return -EOPNOTSUPP; } ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ndev->rxft)) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n"); + mlx5_vdpa_err(&ndev->mvdev, "failed to create flow table\n"); return PTR_ERR(ndev->rxft); } + mlx5_vdpa_add_rx_flow_table(ndev); err = mac_vlan_add(ndev, ndev->config.mac, 0, false); if (err) @@ -1629,6 +2113,7 @@ static int setup_steering(struct mlx5_vdpa_net *ndev) return 0; err_add: + mlx5_vdpa_remove_rx_flow_table(ndev); mlx5_destroy_flow_table(ndev->rxft); return err; } @@ -1636,6 +2121,7 @@ err_add: static void teardown_steering(struct mlx5_vdpa_net *ndev) { clear_mac_vlan_table(ndev); + mlx5_vdpa_remove_rx_flow_table(ndev); mlx5_destroy_flow_table(ndev->rxft); } @@ -1731,37 +2217,48 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - int cur_qps = ndev->cur_num_vqs / 2; + int cur_vqs = ndev->cur_num_vqs; + int new_vqs = newqps * 2; int err; int i; - if (cur_qps > newqps) { - err = modify_rqt(ndev, 2 * newqps); + if (cur_vqs > new_vqs) { + err = modify_rqt(ndev, new_vqs); if (err) return err; - for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) - teardown_vq(ndev, &ndev->vqs[i]); + if (is_resumable(ndev)) { + suspend_vqs(ndev, new_vqs, cur_vqs - new_vqs); + } else { + for (i = new_vqs; i < cur_vqs; i++) + teardown_vq(ndev, &ndev->vqs[i]); + } - ndev->cur_num_vqs = 2 * newqps; + ndev->cur_num_vqs = new_vqs; } else { - ndev->cur_num_vqs = 2 * newqps; - for (i = cur_qps * 2; i < 2 * newqps; i++) { - err = setup_vq(ndev, &ndev->vqs[i]); + ndev->cur_num_vqs = new_vqs; + + for (i = cur_vqs; i < new_vqs; i++) { + err = setup_vq(ndev, &ndev->vqs[i], false); if (err) goto clean_added; } - err = modify_rqt(ndev, 2 * newqps); + + err = resume_vqs(ndev, cur_vqs, new_vqs - cur_vqs); + if (err) + goto clean_added; + + err = modify_rqt(ndev, new_vqs); if (err) goto clean_added; } return 0; clean_added: - for (--i; i >= 2 * cur_qps; --i) + for (--i; i >= cur_vqs; --i) teardown_vq(ndev, &ndev->vqs[i]); - ndev->cur_num_vqs = 2 * cur_qps; + ndev->cur_num_vqs = cur_vqs; return err; } @@ -1971,6 +2468,7 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_ mvq->desc_addr = desc_area; mvq->device_addr = device_area; mvq->driver_addr = driver_area; + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS; return 0; } @@ -1980,10 +2478,18 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; - if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx)) + if (!is_index_valid(mvdev, idx)) return; + if (is_ctrl_vq_idx(mvdev, idx)) { + struct mlx5_control_vq *cvq = &mvdev->cvq; + + cvq->vring.vring.num = num; + return; + } + mvq = &ndev->vqs[idx]; + ndev->needs_teardown |= num != mvq->num_ent; mvq->num_ent = num; } @@ -2023,7 +2529,6 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; - int err; if (!mvdev->actual_features) return; @@ -2039,15 +2544,11 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready mvq = &ndev->vqs[idx]; if (!ready) { suspend_vq(ndev, mvq); - } else { - err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); - if (err) { - mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err); + } else if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (resume_vq(ndev, mvq)) ready = false; - } } - mvq->ready = ready; } @@ -2088,6 +2589,8 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, mvq->used_idx = state->split.avail_index; mvq->avail_idx = state->split.avail_index; + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX | + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX; return 0; } @@ -2121,9 +2624,9 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa return 0; } - err = query_virtqueue(ndev, mvq, &attr); + err = query_virtqueues(ndev, mvq->index, 1, &attr); if (err) { - mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); + mlx5_vdpa_err(mvdev, "failed to query virtqueue\n"); return err; } state->split.avail_index = attr.used_index; @@ -2145,23 +2648,37 @@ static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx) return MLX5_VDPA_DATAVQ_GROUP; } -enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, - MLX5_VIRTIO_NET_F_CSUM = 1 << 10, - MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11, - MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12, -}; +static u32 mlx5_vdpa_get_vq_desc_group(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (is_ctrl_vq_idx(mvdev, idx)) + return MLX5_VDPA_CVQ_GROUP; + + return MLX5_VDPA_DATAVQ_DESC_GROUP; +} static u64 mlx_to_vritio_features(u16 dev_features) { u64 result = 0; - if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF)) + result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM)) result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM); - if (dev_features & MLX5_VIRTIO_NET_F_CSUM) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM)) result |= BIT_ULL(VIRTIO_NET_F_CSUM); - if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6)) result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6); - if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4)) result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4); return result; @@ -2183,6 +2700,7 @@ static u64 get_supported_features(struct mlx5_core_dev *mdev) mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS); mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU); mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN); + mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC); return mlx_vdpa_features; } @@ -2218,14 +2736,14 @@ static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features) return 0; } -static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev) +static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev, bool filled) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; int i; for (i = 0; i < mvdev->max_vqs; i++) { - err = setup_vq(ndev, &ndev->vqs[i]); + err = setup_vq(ndev, &ndev->vqs[i], filled); if (err) goto err_vq; } @@ -2241,16 +2759,10 @@ err_vq: static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) { - struct mlx5_vdpa_virtqueue *mvq; int i; - for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) { - mvq = &ndev->vqs[i]; - if (!mvq->initialized) - continue; - - teardown_vq(ndev, mvq); - } + for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) + teardown_vq(ndev, &ndev->vqs[i]); } static void update_cvq_info(struct mlx5_vdpa_dev *mvdev) @@ -2271,10 +2783,127 @@ static void update_cvq_info(struct mlx5_vdpa_dev *mvdev) } } +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; + + return MLX5_GET(query_vport_state_out, out, state); +} + +static bool get_link_state(struct mlx5_vdpa_dev *mvdev) +{ + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) == + VPORT_STATE_UP) + return true; + + return false; +} + +static void update_carrier(struct work_struct *work) +{ + struct mlx5_vdpa_wq_ent *wqent; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + + wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); + mvdev = wqent->mvdev; + ndev = to_mlx5_vdpa_ndev(mvdev); + if (get_link_state(mvdev)) + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + else + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + + if (ndev->config_cb.callback) + ndev->config_cb.callback(ndev->config_cb.private); + + kfree(wqent); +} + +static int queue_link_work(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_wq_ent *wqent; + + wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); + if (!wqent) + return -ENOMEM; + + wqent->mvdev = &ndev->mvdev; + INIT_WORK(&wqent->work, update_carrier); + queue_work(ndev->mvdev.wq, &wqent->work); + return 0; +} + +static int event_handler(struct notifier_block *nb, unsigned long event, void *param) +{ + struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb); + struct mlx5_eqe *eqe = param; + int ret = NOTIFY_DONE; + + if (ndev->mvdev.suspended) + return NOTIFY_DONE; + + if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + if (queue_link_work(ndev)) + return NOTIFY_DONE; + + ret = NOTIFY_OK; + break; + default: + return NOTIFY_DONE; + } + return ret; + } + return ret; +} + +static void register_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS))) + return; + + ndev->nb.notifier_call = event_handler; + mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb); + ndev->nb_registered = true; + queue_link_work(ndev); +} + +static void unregister_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if (!ndev->nb_registered) + return; + + ndev->nb_registered = false; + mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb); + if (ndev->mvdev.wq) + flush_workqueue(ndev->mvdev.wq); +} + +static u64 mlx5_vdpa_get_backend_features(const struct vdpa_device *vdpa) +{ + return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK); +} + static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + u64 old_features = mvdev->actual_features; + u64 diff_features; int err; print_features(mvdev, features, true); @@ -2284,12 +2913,26 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) return err; ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; - if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)) - ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs); - else - ndev->rqt_size = 1; - ndev->cur_num_vqs = 2 * ndev->rqt_size; + /* Interested in changes of vq features only. */ + if (get_features(old_features) != get_features(mvdev->actual_features)) { + for (int i = 0; i < mvdev->max_vqs; ++i) { + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i]; + + mvq->modified_fields |= ( + MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION | + MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES + ); + } + } + + /* When below features diverge from initial device features, VQs need a full teardown. */ +#define NEEDS_TEARDOWN_MASK (BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | \ + BIT_ULL(VIRTIO_NET_F_CSUM) | \ + BIT_ULL(VIRTIO_F_VERSION_1)) + + diff_features = mvdev->mlx_features ^ mvdev->actual_features; + ndev->needs_teardown = !!(diff_features & NEEDS_TEARDOWN_MASK); update_cvq_info(mvdev); return err; @@ -2335,7 +2978,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu int err; if (mvq->initialized) { - err = query_virtqueue(ndev, mvq, &attr); + err = query_virtqueues(ndev, mvq->index, 1, &attr); if (err) return err; } @@ -2347,6 +2990,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu ri->desc_addr = mvq->desc_addr; ri->device_addr = mvq->device_addr; ri->driver_addr = mvq->driver_addr; + ri->map = mvq->map; ri->restore = true; return 0; } @@ -2377,7 +3021,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev) int i; mlx5_clear_vqs(ndev); - init_mvqs(ndev); + mvqs_set_defaults(ndev); for (i = 0; i < ndev->mvdev.max_vqs; i++) { mvq = &ndev->vqs[i]; ri = &mvq->ri; @@ -2391,46 +3035,52 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev) mvq->desc_addr = ri->desc_addr; mvq->device_addr = ri->device_addr; mvq->driver_addr = ri->driver_addr; + mvq->map = ri->map; } } static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, - struct vhost_iotlb *iotlb, unsigned int asid) + struct mlx5_vdpa_mr *new_mr, + unsigned int asid) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + bool teardown = !is_resumable(ndev); int err; - suspend_vqs(ndev); - err = save_channels_info(ndev); - if (err) - goto err_mr; + suspend_vqs(ndev, 0, ndev->cur_num_vqs); + if (teardown) { + err = save_channels_info(ndev); + if (err) + return err; - teardown_driver(ndev); - mlx5_vdpa_destroy_mr(mvdev); - err = mlx5_vdpa_create_mr(mvdev, iotlb, asid); - if (err) - goto err_mr; + teardown_vq_resources(ndev); + } - if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) - goto err_mr; + mlx5_vdpa_update_mr(mvdev, new_mr, asid); - restore_channels_info(ndev); - err = setup_driver(mvdev); - if (err) - goto err_setup; + for (int i = 0; i < mvdev->max_vqs; i++) + ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY | + MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; - return 0; + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended) + return 0; -err_setup: - mlx5_vdpa_destroy_mr(mvdev); -err_mr: - return err; + if (teardown) { + restore_channels_info(ndev); + err = setup_vq_resources(ndev, true); + if (err) + return err; + } + + resume_vqs(ndev, 0, ndev->cur_num_vqs); + + return 0; } /* reslock must be held for this function */ -static int setup_driver(struct mlx5_vdpa_dev *mvdev) +static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled) { - struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; int err; WARN_ON(!rwsem_is_locked(&ndev->reslock)); @@ -2440,10 +3090,16 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev) err = 0; goto out; } - err = setup_virtqueues(mvdev); + mlx5_vdpa_add_debugfs(ndev); + + err = read_umem_params(ndev); + if (err) + goto err_setup; + + err = setup_virtqueues(mvdev, filled); if (err) { mlx5_vdpa_warn(mvdev, "setup_virtqueues\n"); - goto out; + goto err_setup; } err = create_rqt(ndev); @@ -2473,12 +3129,14 @@ err_tir: destroy_rqt(ndev); err_rqt: teardown_virtqueues(ndev); +err_setup: + mlx5_vdpa_remove_debugfs(ndev); out: return err; } /* reslock must be held for this function */ -static void teardown_driver(struct mlx5_vdpa_net *ndev) +static void teardown_vq_resources(struct mlx5_vdpa_net *ndev) { WARN_ON(!rwsem_is_locked(&ndev->reslock)); @@ -2486,21 +3144,13 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev) if (!ndev->setup) return; + mlx5_vdpa_remove_debugfs(ndev); teardown_steering(ndev); destroy_tir(ndev); destroy_rqt(ndev); teardown_virtqueues(ndev); ndev->setup = false; -} - -static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) -{ - int i; - - for (i = 0; i < ndev->mvdev.max_vqs; i++) - ndev->vqs[i].ready = false; - - ndev->mvdev.cvq.ready = false; + ndev->needs_teardown = false; } static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev) @@ -2508,13 +3158,18 @@ static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev) struct mlx5_control_vq *cvq = &mvdev->cvq; int err = 0; - if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) + if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) { + u16 idx = cvq->vring.last_avail_idx; + err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features, - MLX5_CVQ_MAX_ENT, false, + cvq->vring.vring.num, false, (struct vring_desc *)(uintptr_t)cvq->desc_addr, (struct vring_avail *)(uintptr_t)cvq->driver_addr, (struct vring_used *)(uintptr_t)cvq->device_addr); + if (!err) + cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx; + } return err; } @@ -2535,10 +3190,23 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n"); goto err_setup; } - err = setup_driver(mvdev); - if (err) { - mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); - goto err_setup; + register_link_notifier(ndev); + + if (ndev->needs_teardown) + teardown_vq_resources(ndev); + + if (ndev->setup) { + err = resume_vqs(ndev, 0, ndev->cur_num_vqs); + if (err) { + mlx5_vdpa_warn(mvdev, "failed to resume VQs\n"); + goto err_driver; + } + } else { + err = setup_vq_resources(ndev, true); + if (err) { + mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); + goto err_driver; + } } } else { mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n"); @@ -2550,8 +3218,10 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) up_write(&ndev->reslock); return; +err_driver: + unregister_link_notifier(ndev); err_setup: - mlx5_vdpa_destroy_mr(&ndev->mvdev); + mlx5_vdpa_clean_mrs(&ndev->mvdev); ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; err_clear: up_write(&ndev->reslock); @@ -2563,23 +3233,51 @@ static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev) /* default mapping all groups are mapped to asid 0 */ for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++) - mvdev->group2asid[i] = 0; + mvdev->mres.group2asid[i] = 0; } -static int mlx5_vdpa_reset(struct vdpa_device *vdev) +static bool needs_vqs_reset(const struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[0]; + + if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) + return true; + + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT) + return true; + + return mvq->modified_fields & ( + MLX5_VIRTQ_MODIFY_MASK_STATE | + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS | + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX | + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX + ); +} + +static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + bool vq_reset; print_status(mvdev, 0, true); mlx5_vdpa_info(mvdev, "performing device reset\n"); down_write(&ndev->reslock); - teardown_driver(ndev); - clear_vqs_ready(ndev); - mlx5_vdpa_destroy_mr(&ndev->mvdev); + unregister_link_notifier(ndev); + vq_reset = needs_vqs_reset(mvdev); + if (vq_reset) { + teardown_vq_resources(ndev); + mvqs_set_defaults(ndev); + } + + if (flags & VDPA_RESET_F_CLEAN_MAP) + mlx5_vdpa_clean_mrs(&ndev->mvdev); ndev->mvdev.status = 0; - ndev->cur_num_vqs = 0; + ndev->mvdev.suspended = false; + ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT; + ndev->mvdev.cvq.ready = false; ndev->mvdev.cvq.received_desc = 0; ndev->mvdev.cvq.completed_desc = 0; memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1)); @@ -2587,15 +3285,23 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) init_group_to_asid_map(mvdev); ++mvdev->generation; - if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - if (mlx5_vdpa_create_mr(mvdev, NULL, 0)) - mlx5_vdpa_warn(mvdev, "create MR failed\n"); + if ((flags & VDPA_RESET_F_CLEAN_MAP) && + MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_dma_mr(mvdev)) + mlx5_vdpa_err(mvdev, "create MR failed\n"); } + if (vq_reset) + setup_vq_resources(ndev, false); up_write(&ndev->reslock); return 0; } +static int mlx5_vdpa_reset(struct vdpa_device *vdev) +{ + return mlx5_vdpa_compat_reset(vdev, 0); +} + static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) { return sizeof(struct virtio_net_config); @@ -2627,18 +3333,38 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, unsigned int asid) { - bool change_map; + struct mlx5_vdpa_mr *new_mr; int err; - err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid); - if (err) { - mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); - return err; + if (asid >= MLX5_VDPA_NUM_AS) + return -EINVAL; + + if (vhost_iotlb_itree_first(iotlb, 0, U64_MAX)) { + new_mr = mlx5_vdpa_create_mr(mvdev, iotlb); + if (IS_ERR(new_mr)) { + err = PTR_ERR(new_mr); + mlx5_vdpa_err(mvdev, "create map failed(%d)\n", err); + return err; + } + } else { + /* Empty iotlbs don't have an mr but will clear the previous mr. */ + new_mr = NULL; } - if (change_map) - err = mlx5_vdpa_change_map(mvdev, iotlb, asid); + if (!mvdev->mres.mr[asid]) { + mlx5_vdpa_update_mr(mvdev, new_mr, asid); + } else { + err = mlx5_vdpa_change_map(mvdev, new_mr, asid); + if (err) { + mlx5_vdpa_err(mvdev, "change map failed(%d)\n", err); + goto out_err; + } + } + + return mlx5_vdpa_update_cvq_iotlb(mvdev, iotlb, asid); +out_err: + mlx5_vdpa_put_mr(mvdev, new_mr); return err; } @@ -2655,6 +3381,50 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, return err; } +static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + down_write(&ndev->reslock); + err = mlx5_vdpa_reset_mr(mvdev, asid); + up_write(&ndev->reslock); + return err; +} + +static union virtio_map mlx5_get_vq_map(struct vdpa_device *vdev, u16 idx) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + union virtio_map map; + + if (is_ctrl_vq_idx(mvdev, idx)) + map.dma_dev = &vdev->dev; + else + map.dma_dev = mvdev->vdev.vmap.dma_dev; + + return map; +} + +static void free_irqs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_irq_pool_entry *ent; + int i; + + if (!msix_mode_supported(&ndev->mvdev)) + return; + + if (!ndev->irqp.entries) + return; + + for (i = ndev->irqp.num_ent - 1; i >= 0; i--) { + ent = ndev->irqp.entries + i; + if (ent->map.virq) + pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map); + } + kfree(ndev->irqp.entries); +} + static void mlx5_vdpa_free(struct vdpa_device *vdev) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -2663,13 +3433,19 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) ndev = to_mlx5_vdpa_ndev(mvdev); - free_resources(ndev); - mlx5_vdpa_destroy_mr(mvdev); + /* Functions called here should be able to work with + * uninitialized resources. + */ + free_fixed_resources(ndev); + mlx5_vdpa_clean_mrs(mvdev); + mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); } + mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); mlx5_vdpa_free_resources(&ndev->mvdev); + free_irqs(ndev); kfree(ndev->event_cbs); kfree(ndev->vqs); } @@ -2698,9 +3474,23 @@ static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device return ret; } -static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx) +static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx) { - return -EOPNOTSUPP; + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) + return -EOPNOTSUPP; + + mvq = &ndev->vqs[idx]; + if (!mvq->map.virq) + return -EOPNOTSUPP; + + return mvq->map.virq; } static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev) @@ -2812,32 +3602,55 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq; - int i; + int err; + + mlx5_vdpa_info(mvdev, "suspending device\n"); down_write(&ndev->reslock); - ndev->nb_registered = false; - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); - flush_workqueue(ndev->mvdev.wq); - for (i = 0; i < ndev->cur_num_vqs; i++) { - mvq = &ndev->vqs[i]; - suspend_vq(ndev, mvq); - } + err = suspend_vqs(ndev, 0, ndev->cur_num_vqs); mlx5_vdpa_cvq_suspend(mvdev); + mvdev->suspended = true; up_write(&ndev->reslock); - return 0; + + return err; +} + +static int mlx5_vdpa_resume(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev; + int err; + + ndev = to_mlx5_vdpa_ndev(mvdev); + + mlx5_vdpa_info(mvdev, "resuming device\n"); + + down_write(&ndev->reslock); + mvdev->suspended = false; + err = resume_vqs(ndev, 0, ndev->cur_num_vqs); + queue_link_work(ndev); + up_write(&ndev->reslock); + + return err; } static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, unsigned int asid) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + int err = 0; if (group >= MLX5_VDPA_NUMVQ_GROUPS) return -EINVAL; - mvdev->group2asid[group] = asid; - return 0; + mvdev->mres.group2asid[group] = asid; + + mutex_lock(&mvdev->mres.lock); + if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mres.mr[asid]) + err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mres.mr[asid]->iotlb, asid); + mutex_unlock(&mvdev->mres.lock); + + return err; } static const struct vdpa_config_ops mlx5_vdpa_ops = { @@ -2854,7 +3667,9 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vq_irq = mlx5_get_vq_irq, .get_vq_align = mlx5_vdpa_get_vq_align, .get_vq_group = mlx5_vdpa_get_vq_group, + .get_vq_desc_group = mlx5_vdpa_get_vq_desc_group, /* Op disabled if not supported. */ .get_device_features = mlx5_vdpa_get_device_features, + .get_backend_features = mlx5_vdpa_get_backend_features, .set_driver_features = mlx5_vdpa_set_driver_features, .get_driver_features = mlx5_vdpa_get_driver_features, .set_config_cb = mlx5_vdpa_set_config_cb, @@ -2864,14 +3679,18 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_status = mlx5_vdpa_get_status, .set_status = mlx5_vdpa_set_status, .reset = mlx5_vdpa_reset, + .compat_reset = mlx5_vdpa_compat_reset, .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, .set_map = mlx5_vdpa_set_map, + .reset_map = mlx5_vdpa_reset_map, .set_group_asid = mlx5_set_group_asid, + .get_vq_map = mlx5_get_vq_map, .free = mlx5_vdpa_free, .suspend = mlx5_vdpa_suspend, + .resume = mlx5_vdpa_resume, /* Op disabled if not supported. */ }; static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) @@ -2887,7 +3706,7 @@ static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) return 0; } -static int alloc_resources(struct mlx5_vdpa_net *ndev) +static int alloc_fixed_resources(struct mlx5_vdpa_net *ndev) { struct mlx5_vdpa_net_resources *res = &ndev->res; int err; @@ -2914,7 +3733,7 @@ err_tis: return err; } -static void free_resources(struct mlx5_vdpa_net *ndev) +static void free_fixed_resources(struct mlx5_vdpa_net *ndev) { struct mlx5_vdpa_net_resources *res = &ndev->res; @@ -2926,7 +3745,7 @@ static void free_resources(struct mlx5_vdpa_net *ndev) res->valid = false; } -static void init_mvqs(struct mlx5_vdpa_net *ndev) +static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev) { struct mlx5_vdpa_virtqueue *mvq; int i; @@ -2938,12 +3757,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev) mvq->ndev = ndev; mvq->fwqp.fw = true; mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; - } - for (; i < ndev->mvdev.max_vqs; i++) { - mvq = &ndev->vqs[i]; - memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); - mvq->index = i; - mvq->ndev = ndev; + mvq->num_ent = MLX5V_DEFAULT_VQ_SIZE; } } @@ -2951,84 +3765,9 @@ struct mlx5_vdpa_mgmtdev { struct vdpa_mgmt_dev mgtdev; struct mlx5_adev *madev; struct mlx5_vdpa_net *ndev; + struct vdpa_config_ops vdpa_ops; }; -static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) -{ - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; - int err; - - MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); - MLX5_SET(query_vport_state_in, in, op_mod, opmod); - MLX5_SET(query_vport_state_in, in, vport_number, vport); - if (vport) - MLX5_SET(query_vport_state_in, in, other_vport, 1); - - err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); - if (err) - return 0; - - return MLX5_GET(query_vport_state_out, out, state); -} - -static bool get_link_state(struct mlx5_vdpa_dev *mvdev) -{ - if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) == - VPORT_STATE_UP) - return true; - - return false; -} - -static void update_carrier(struct work_struct *work) -{ - struct mlx5_vdpa_wq_ent *wqent; - struct mlx5_vdpa_dev *mvdev; - struct mlx5_vdpa_net *ndev; - - wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); - mvdev = wqent->mvdev; - ndev = to_mlx5_vdpa_ndev(mvdev); - if (get_link_state(mvdev)) - ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); - else - ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); - - if (ndev->nb_registered && ndev->config_cb.callback) - ndev->config_cb.callback(ndev->config_cb.private); - - kfree(wqent); -} - -static int event_handler(struct notifier_block *nb, unsigned long event, void *param) -{ - struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb); - struct mlx5_eqe *eqe = param; - int ret = NOTIFY_DONE; - struct mlx5_vdpa_wq_ent *wqent; - - if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { - switch (eqe->sub_type) { - case MLX5_PORT_CHANGE_SUBTYPE_DOWN: - case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: - wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); - if (!wqent) - return NOTIFY_DONE; - - wqent->mvdev = &ndev->mvdev; - INIT_WORK(&wqent->work, update_carrier); - queue_work(ndev->mvdev.wq, &wqent->work); - ret = NOTIFY_OK; - break; - default: - return NOTIFY_DONE; - } - return ret; - } - return ret; -} - static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu) { int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); @@ -3051,6 +3790,34 @@ static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu) return err; } +static void allocate_irqs(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_irq_pool_entry *ent; + int i; + + if (!msix_mode_supported(&ndev->mvdev)) + return; + + if (!ndev->mvdev.mdev->pdev) + return; + + ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL); + if (!ndev->irqp.entries) + return; + + + for (i = 0; i < ndev->mvdev.max_vqs; i++) { + ent = ndev->irqp.entries + i; + snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d", + dev_name(&ndev->mvdev.vdev.dev), i); + ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL); + if (!ent->map.virq) + return; + + ndev->irqp.num_ent++; + } +} + static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, const struct vdpa_dev_set_config *add_config) { @@ -3060,6 +3827,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; struct mlx5_core_dev *mdev; + u64 device_features; u32 max_vqs; u16 mtu; int err; @@ -3068,6 +3836,26 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, return -ENOSPC; mdev = mgtdev->madev->mdev; + device_features = mgtdev->mgtdev.supported_features; + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (add_config->device_features & ~device_features) { + dev_warn(mdev->device, + "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n", + add_config->device_features, device_features); + return -EINVAL; + } + device_features &= add_config->device_features; + } else { + device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); + } + if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) && + device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) { + dev_warn(mdev->device, + "Must provision minimum features 0x%llx for this device", + BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)); + return -EOPNOTSUPP; + } + if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) { dev_warn(mdev->device, "missing support for split virtqueues\n"); @@ -3091,15 +3879,19 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, max_vqs = 2; } - ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); + ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops, + NULL, MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); - ndev->mvdev.mlx_features = mgtdev->mgtdev.supported_features; ndev->mvdev.max_vqs = max_vqs; mvdev = &ndev->mvdev; mvdev->mdev = mdev; + /* cpu_to_mlx5vdpa16() below depends on this flag */ + mvdev->actual_features = + (device_features & BIT_ULL(VIRTIO_F_VERSION_1)); + + mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL); ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL); @@ -3107,8 +3899,10 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, err = -ENOMEM; goto err_alloc; } + ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT; - init_mvqs(ndev); + mvqs_set_defaults(ndev); + allocate_irqs(ndev); init_rwsem(&ndev->reslock); config = &ndev->config; @@ -3118,20 +3912,26 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, goto err_alloc; } - err = query_mtu(mdev, &mtu); - if (err) - goto err_alloc; + if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) { + err = query_mtu(mdev, &mtu); + if (err) + goto err_alloc; - ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu); + ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu); + } - if (get_link_state(mvdev)) - ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); - else - ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) { + if (get_link_state(mvdev)) + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + else + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + } if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN); - } else { + /* No bother setting mac address in config if not going to provision _F_MAC */ + } else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 || + device_features & BIT_ULL(VIRTIO_NET_F_MAC)) { err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac); if (err) goto err_alloc; @@ -3142,56 +3942,80 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, err = mlx5_mpfs_add_mac(pfmdev, config->mac); if (err) goto err_alloc; + } else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) { + /* + * We used to clear _F_MAC feature bit if seeing + * zero mac address when device features are not + * specifically provisioned. Keep the behaviour + * so old scripts do not break. + */ + device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC); + } else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) { + /* Don't provision zero mac address for _F_MAC */ + mlx5_vdpa_warn(&ndev->mvdev, + "No mac address provisioned?\n"); + err = -EINVAL; + goto err_alloc; + } - ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); + if (device_features & BIT_ULL(VIRTIO_NET_F_MQ)) { + config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2); + ndev->rqt_size = max_vqs / 2; + } else { + ndev->rqt_size = 1; } - config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2); - mvdev->vdev.dma_dev = &mdev->pdev->dev; + ndev->mvdev.mlx_features = device_features; + mvdev->vdev.vmap.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) - goto err_mpfs; + goto err_alloc; + + err = mlx5_vdpa_init_mr_resources(mvdev); + if (err) + goto err_alloc; if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - err = mlx5_vdpa_create_mr(mvdev, NULL, 0); + err = mlx5_vdpa_create_dma_mr(mvdev); if (err) - goto err_res; + goto err_alloc; } - err = alloc_resources(ndev); + err = alloc_fixed_resources(ndev); if (err) - goto err_mr; + goto err_alloc; ndev->cvq_ent.mvdev = mvdev; INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler); mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq"); if (!mvdev->wq) { err = -ENOMEM; - goto err_res2; + goto err_alloc; } - ndev->nb.notifier_call = event_handler; - mlx5_notifier_register(mdev, &ndev->nb); - ndev->nb_registered = true; mvdev->vdev.mdev = &mgtdev->mgtdev; err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1); if (err) goto err_reg; mgtdev->ndev = ndev; + + /* For virtio-vdpa, the device was set up during device register. */ + if (ndev->setup) + return 0; + + down_write(&ndev->reslock); + err = setup_vq_resources(ndev, false); + up_write(&ndev->reslock); + if (err) + goto err_setup_vq_res; + return 0; +err_setup_vq_res: + _vdpa_unregister_device(&mvdev->vdev); err_reg: destroy_workqueue(mvdev->wq); -err_res2: - free_resources(ndev); -err_mr: - mlx5_vdpa_destroy_mr(mvdev); -err_res: - mlx5_vdpa_free_resources(&ndev->mvdev); -err_mpfs: - if (!is_zero_ether_addr(config->mac)) - mlx5_mpfs_del_mac(pfmdev, config->mac); err_alloc: put_device(&mvdev->vdev.dev); return err; @@ -3204,20 +4028,50 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct workqueue_struct *wq; - if (ndev->nb_registered) { - ndev->nb_registered = false; - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); - } + unregister_link_notifier(ndev); + _vdpa_unregister_device(dev); + + down_write(&ndev->reslock); + teardown_vq_resources(ndev); + up_write(&ndev->reslock); + wq = mvdev->wq; mvdev->wq = NULL; destroy_workqueue(wq); - _vdpa_unregister_device(dev); mgtdev->ndev = NULL; } +static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *add_config) +{ + struct virtio_net_config *config; + struct mlx5_core_dev *pfmdev; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + struct mlx5_core_dev *mdev; + int err = -EOPNOTSUPP; + + mvdev = to_mvdev(dev); + ndev = to_mlx5_vdpa_ndev(mvdev); + mdev = mvdev->mdev; + config = &ndev->config; + + down_write(&ndev->reslock); + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); + err = mlx5_mpfs_add_mac(pfmdev, config->mac); + if (!err) + ether_addr_copy(config->mac, add_config->net.mac); + } + + up_write(&ndev->reslock); + return err; +} + static const struct vdpa_mgmtdev_ops mdev_ops = { .dev_add = mlx5_vdpa_dev_add, .dev_del = mlx5_vdpa_dev_del, + .dev_set_attr = mlx5_vdpa_set_attr, }; static struct virtio_device_id id_table[] = { @@ -3243,11 +4097,19 @@ static int mlx5v_probe(struct auxiliary_device *adev, mgtdev->mgtdev.id_table = id_table; mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) | - BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); + BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | + BIT_ULL(VDPA_ATTR_DEV_FEATURES); mgtdev->mgtdev.max_supported_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1; mgtdev->mgtdev.supported_features = get_supported_features(mdev); mgtdev->madev = madev; + mgtdev->vdpa_ops = mlx5_vdpa_ops; + + if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported)) + mgtdev->vdpa_ops.get_vq_desc_group = NULL; + + if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, freeze_to_rdy_supported)) + mgtdev->vdpa_ops.resume = NULL; err = vdpa_mgmtdev_register(&mgtdev->mgtdev); if (err) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h b/drivers/vdpa/mlx5/net/mlx5_vnet.h new file mode 100644 index 000000000000..00e79a7d0be8 --- /dev/null +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_VNET_H__ +#define __MLX5_VNET_H__ + +#include "mlx5_vdpa.h" + +#define to_mlx5_vdpa_ndev(__mvdev) \ + container_of(__mvdev, struct mlx5_vdpa_net, mvdev) +#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) + +struct mlx5_vdpa_net_resources { + u32 tisn; + u32 tdn; + u32 tirn; + u32 rqtn; + bool valid; + struct dentry *tirn_dent; +}; + +#define MLX5V_MACVLAN_SIZE 256 + +static inline u16 key2vid(u64 key) +{ + return (u16)(key >> 48) & 0xfff; +} + +#define MLX5_VDPA_IRQ_NAME_LEN 32 + +struct mlx5_vdpa_irq_pool_entry { + struct msi_map map; + bool used; + char name[MLX5_VDPA_IRQ_NAME_LEN]; + void *dev_id; +}; + +struct mlx5_vdpa_irq_pool { + int num_ent; + struct mlx5_vdpa_irq_pool_entry *entries; +}; + +struct mlx5_vdpa_net { + struct mlx5_vdpa_dev mvdev; + struct mlx5_vdpa_net_resources res; + struct virtio_net_config config; + struct mlx5_vdpa_virtqueue *vqs; + struct vdpa_callback *event_cbs; + + /* Serialize vq resources creation and destruction. This is required + * since memory map might change and we need to destroy and create + * resources while driver in operational. + */ + struct rw_semaphore reslock; + struct mlx5_flow_table *rxft; + struct dentry *rx_dent; + struct dentry *rx_table_dent; + bool setup; + bool needs_teardown; + u32 cur_num_vqs; + u32 rqt_size; + bool nb_registered; + struct notifier_block nb; + struct vdpa_callback config_cb; + struct mlx5_vdpa_wq_ent cvq_ent; + struct hlist_head macvlan_hash[MLX5V_MACVLAN_SIZE]; + struct mlx5_vdpa_irq_pool irqp; + struct dentry *debugfs; + + u32 umem_1_buffer_param_a; + u32 umem_1_buffer_param_b; + + u32 umem_2_buffer_param_a; + u32 umem_2_buffer_param_b; + + u32 umem_3_buffer_param_a; + u32 umem_3_buffer_param_b; +}; + +struct mlx5_vdpa_counter { + struct mlx5_fc *counter; + struct dentry *dent; + struct mlx5_core_dev *mdev; +}; + +struct macvlan_node { + struct hlist_node hlist; + struct mlx5_flow_handle *ucast_rule; + struct mlx5_flow_handle *mcast_rule; + u64 macvlan; + struct mlx5_vdpa_net *ndev; + bool tagged; +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) + struct dentry *dent; + struct mlx5_vdpa_counter ucast_counter; + struct mlx5_vdpa_counter mcast_counter; +#endif +}; + +void mlx5_vdpa_add_debugfs(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_debugfs(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_add_rx_flow_table(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_rx_flow_table(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_add_tirn(struct mlx5_vdpa_net *ndev); +void mlx5_vdpa_remove_tirn(struct mlx5_vdpa_net *ndev); +#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) +void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node); +void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node); +#else +static inline void mlx5_vdpa_add_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) {} +static inline void mlx5_vdpa_remove_rx_counters(struct mlx5_vdpa_net *ndev, + struct macvlan_node *node) {} +#endif + + +#endif /* __MLX5_VNET_H__ */ diff --git a/drivers/vdpa/octeon_ep/Makefile b/drivers/vdpa/octeon_ep/Makefile new file mode 100644 index 000000000000..e23e2ff14f33 --- /dev/null +++ b/drivers/vdpa/octeon_ep/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa.o +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_main.o +octep_vdpa-$(CONFIG_OCTEONEP_VDPA) += octep_vdpa_hw.o diff --git a/drivers/vdpa/octeon_ep/octep_vdpa.h b/drivers/vdpa/octeon_ep/octep_vdpa.h new file mode 100644 index 000000000000..53b020b019f7 --- /dev/null +++ b/drivers/vdpa/octeon_ep/octep_vdpa.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright (C) 2024 Marvell. + */ +#ifndef __OCTEP_VDPA_H__ +#define __OCTEP_VDPA_H__ + +#include <linux/pci.h> +#include <linux/pci_regs.h> +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> +#include <uapi/linux/virtio_crypto.h> +#include <uapi/linux/virtio_net.h> +#include <uapi/linux/virtio_blk.h> +#include <uapi/linux/virtio_config.h> +#include <uapi/linux/virtio_pci.h> +#include <uapi/linux/vdpa.h> + +#define OCTEP_VDPA_DEVID_CN106K_PF 0xb900 +#define OCTEP_VDPA_DEVID_CN106K_VF 0xb903 +#define OCTEP_VDPA_DEVID_CN105K_PF 0xba00 +#define OCTEP_VDPA_DEVID_CN105K_VF 0xba03 +#define OCTEP_VDPA_DEVID_CN103K_PF 0xbd00 +#define OCTEP_VDPA_DEVID_CN103K_VF 0xbd03 + +#define OCTEP_HW_MBOX_BAR 0 +#define OCTEP_HW_CAPS_BAR 4 + +#define OCTEP_DEV_READY_SIGNATURE 0xBABABABA + +#define OCTEP_EPF_RINFO(x) (0x000209f0 | ((x) << 25)) +#define OCTEP_VF_MBOX_DATA(x) (0x00010210 | ((x) << 17)) +#define OCTEP_PF_MBOX_DATA(x) (0x00022000 | ((x) << 4)) +#define OCTEP_VF_IN_CTRL(x) (0x00010000 | ((x) << 17)) +#define OCTEP_VF_IN_CTRL_RPVF(val) (((val) >> 48) & 0xF) + +#define OCTEP_FW_READY_SIGNATURE0 0xFEEDFEED +#define OCTEP_FW_READY_SIGNATURE1 0x3355ffaa +#define OCTEP_MAX_CB_INTR 8 + +enum octep_vdpa_dev_status { + OCTEP_VDPA_DEV_STATUS_INVALID, + OCTEP_VDPA_DEV_STATUS_ALLOC, + OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT, + OCTEP_VDPA_DEV_STATUS_INIT, + OCTEP_VDPA_DEV_STATUS_READY, + OCTEP_VDPA_DEV_STATUS_UNINIT +}; + +struct octep_vring_info { + struct vdpa_callback cb; + void __iomem *notify_addr; + void __iomem *cb_notify_addr; + phys_addr_t notify_pa; +}; + +enum octep_pci_vndr_cfg_type { + OCTEP_PCI_VNDR_CFG_TYPE_VIRTIO_ID, + OCTEP_PCI_VNDR_CFG_TYPE_MAX, +}; + +struct octep_pci_vndr_data { + struct virtio_pci_vndr_data hdr; + u8 id; + u8 bar; + union { + u64 data; + struct { + u32 offset; + u32 length; + }; + }; +}; + +struct octep_hw { + struct pci_dev *pdev; + u8 __iomem *base[PCI_STD_NUM_BARS]; + struct virtio_pci_common_cfg __iomem *common_cfg; + u8 __iomem *dev_cfg; + u8 __iomem *isr; + void __iomem *notify_base; + phys_addr_t notify_base_pa; + u32 notify_off_multiplier; + u8 notify_bar; + struct octep_vring_info *vqs; + struct vdpa_callback config_cb; + u64 features; + u16 nr_vring; + u32 config_size; + int nb_irqs; + int *irqs; + u8 dev_id; +}; + +u8 octep_hw_get_status(struct octep_hw *oct_hw); +void octep_hw_set_status(struct octep_hw *dev, uint8_t status); +void octep_hw_reset(struct octep_hw *oct_hw); +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id); +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid); +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length); +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area, + u64 device_area); +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num); +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready); +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid); +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state); +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state); +u16 octep_get_vq_size(struct octep_hw *oct_hw); +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev); +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw); +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features); +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw); +int octep_verify_features(u64 features); + +#endif /* __OCTEP_VDPA_H__ */ diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c new file mode 100644 index 000000000000..74240101c505 --- /dev/null +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Marvell. */ + +#include <linux/iopoll.h> +#include <linux/build_bug.h> + +#include "octep_vdpa.h" + +enum octep_mbox_ids { + OCTEP_MBOX_MSG_SET_VQ_STATE = 1, + OCTEP_MBOX_MSG_GET_VQ_STATE, +}; + +#define OCTEP_HW_TIMEOUT 10000000 + +#define MBOX_OFFSET 64 +#define MBOX_RSP_MASK 0x00000001 +#define MBOX_RC_MASK 0x0000FFFE + +#define MBOX_RSP_TO_ERR(val) (-(((val) & MBOX_RC_MASK) >> 2)) +#define MBOX_AVAIL(val) (((val) & MBOX_RSP_MASK)) +#define MBOX_RSP(val) ((val) & (MBOX_RC_MASK | MBOX_RSP_MASK)) + +#define DEV_RST_ACK_BIT 7 +#define FEATURE_SEL_ACK_BIT 15 +#define QUEUE_SEL_ACK_BIT 15 + +struct octep_mbox_hdr { + u8 ver; + u8 rsvd1; + u16 id; + u16 rsvd2; +#define MBOX_REQ_SIG (0xdead) +#define MBOX_RSP_SIG (0xbeef) + u16 sig; +}; + +struct octep_mbox_sts { + u16 rsp:1; + u16 rc:15; + u16 rsvd; +}; + +struct octep_mbox { + struct octep_mbox_hdr hdr; + struct octep_mbox_sts sts; + u64 rsvd; + u32 data[]; +}; + +static inline struct octep_mbox __iomem *octep_get_mbox(struct octep_hw *oct_hw) +{ + return (struct octep_mbox __iomem *)(oct_hw->dev_cfg + MBOX_OFFSET); +} + +static inline int octep_wait_for_mbox_avail(struct octep_mbox __iomem *mbox) +{ + u32 val; + + return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_AVAIL(val), 10, + OCTEP_HW_TIMEOUT); +} + +static inline int octep_wait_for_mbox_rsp(struct octep_mbox __iomem *mbox) +{ + u32 val; + + return readx_poll_timeout(ioread32, &mbox->sts, val, MBOX_RSP(val), 10, + OCTEP_HW_TIMEOUT); +} + +static inline void octep_write_hdr(struct octep_mbox __iomem *mbox, u16 id, u16 sig) +{ + iowrite16(id, &mbox->hdr.id); + iowrite16(sig, &mbox->hdr.sig); +} + +static inline u32 octep_read_sig(struct octep_mbox __iomem *mbox) +{ + return ioread16(&mbox->hdr.sig); +} + +static inline void octep_write_sts(struct octep_mbox __iomem *mbox, u32 sts) +{ + iowrite32(sts, &mbox->sts); +} + +static inline u32 octep_read_sts(struct octep_mbox __iomem *mbox) +{ + return ioread32(&mbox->sts); +} + +static inline u32 octep_read32_word(struct octep_mbox __iomem *mbox, u16 word_idx) +{ + return ioread32(&mbox->data[word_idx]); +} + +static inline void octep_write32_word(struct octep_mbox __iomem *mbox, u16 word_idx, u32 word) +{ + return iowrite32(word, &mbox->data[word_idx]); +} + +static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *buffer, + u32 buf_size, bool write) +{ + struct octep_mbox __iomem *mbox = octep_get_mbox(oct_hw); + struct pci_dev *pdev = oct_hw->pdev; + u32 *p = (u32 *)buffer; + u16 data_wds; + int ret, i; + u32 val; + + if (!IS_ALIGNED(buf_size, 4)) + return -EINVAL; + + /* Make sure mbox space is available */ + ret = octep_wait_for_mbox_avail(mbox); + if (ret) { + dev_warn(&pdev->dev, "Timeout waiting for previous mbox data to be consumed\n"); + return ret; + } + data_wds = buf_size / 4; + + if (write) { + for (i = 1; i <= data_wds; i++) { + octep_write32_word(mbox, i, *p); + p++; + } + } + octep_write32_word(mbox, 0, (u32)qid); + octep_write_sts(mbox, 0); + + octep_write_hdr(mbox, id, MBOX_REQ_SIG); + + ret = octep_wait_for_mbox_rsp(mbox); + if (ret) { + dev_warn(&pdev->dev, "Timeout waiting for mbox : %d response\n", id); + return ret; + } + + val = octep_read_sig(mbox); + if ((val & 0xFFFF) != MBOX_RSP_SIG) { + dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id); + return -EINVAL; + } + + val = octep_read_sts(mbox); + if (val & MBOX_RC_MASK) { + ret = MBOX_RSP_TO_ERR(val); + dev_warn(&pdev->dev, "Error while processing mbox : %d, err %d\n", id, ret); + return ret; + } + + if (!write) + for (i = 1; i <= data_wds; i++) + *p++ = octep_read32_word(mbox, i); + + return 0; +} + +static void octep_mbox_init(struct octep_mbox __iomem *mbox) +{ + iowrite32(1, &mbox->sts); +} + +int octep_verify_features(u64 features) +{ + /* Minimum features to expect */ + if (!(features & BIT_ULL(VIRTIO_F_VERSION_1))) + return -EOPNOTSUPP; + + if (!(features & BIT_ULL(VIRTIO_F_NOTIFICATION_DATA))) + return -EOPNOTSUPP; + + if (!(features & BIT_ULL(VIRTIO_F_RING_PACKED))) + return -EOPNOTSUPP; + + return 0; +} + +u8 octep_hw_get_status(struct octep_hw *oct_hw) +{ + return ioread8(&oct_hw->common_cfg->device_status); +} + +void octep_hw_set_status(struct octep_hw *oct_hw, u8 status) +{ + iowrite8(status, &oct_hw->common_cfg->device_status); +} + +void octep_hw_reset(struct octep_hw *oct_hw) +{ + u8 val; + + octep_hw_set_status(oct_hw, 0 | BIT(DEV_RST_ACK_BIT)); + if (readx_poll_timeout(ioread8, &oct_hw->common_cfg->device_status, val, !val, 10, + OCTEP_HW_TIMEOUT)) { + dev_warn(&oct_hw->pdev->dev, "Octeon device reset timeout\n"); + return; + } +} + +static int feature_sel_write_with_timeout(struct octep_hw *oct_hw, u32 select, void __iomem *addr) +{ + u32 val; + + iowrite32(select | BIT(FEATURE_SEL_ACK_BIT), addr); + + if (readx_poll_timeout(ioread32, addr, val, val == select, 10, OCTEP_HW_TIMEOUT)) { + dev_warn(&oct_hw->pdev->dev, "Feature select%d write timeout\n", select); + return -1; + } + return 0; +} + +u64 octep_hw_get_dev_features(struct octep_hw *oct_hw) +{ + u32 features_lo, features_hi; + + if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->device_feature_select)) + return 0; + + features_lo = ioread32(&oct_hw->common_cfg->device_feature); + + if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->device_feature_select)) + return 0; + + features_hi = ioread32(&oct_hw->common_cfg->device_feature); + + return ((u64)features_hi << 32) | features_lo; +} + +u64 octep_hw_get_drv_features(struct octep_hw *oct_hw) +{ + u32 features_lo, features_hi; + + if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select)) + return 0; + + features_lo = ioread32(&oct_hw->common_cfg->guest_feature); + + if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select)) + return 0; + + features_hi = ioread32(&oct_hw->common_cfg->guest_feature); + + return ((u64)features_hi << 32) | features_lo; +} + +void octep_hw_set_drv_features(struct octep_hw *oct_hw, u64 features) +{ + if (feature_sel_write_with_timeout(oct_hw, 0, &oct_hw->common_cfg->guest_feature_select)) + return; + + iowrite32(features & (BIT_ULL(32) - 1), &oct_hw->common_cfg->guest_feature); + + if (feature_sel_write_with_timeout(oct_hw, 1, &oct_hw->common_cfg->guest_feature_select)) + return; + + iowrite32(features >> 32, &oct_hw->common_cfg->guest_feature); +} + +void octep_write_queue_select(struct octep_hw *oct_hw, u16 queue_id) +{ + u16 val; + + iowrite16(queue_id | BIT(QUEUE_SEL_ACK_BIT), &oct_hw->common_cfg->queue_select); + + if (readx_poll_timeout(ioread16, &oct_hw->common_cfg->queue_select, val, val == queue_id, + 10, OCTEP_HW_TIMEOUT)) { + dev_warn(&oct_hw->pdev->dev, "Queue select write timeout\n"); + return; + } +} + +void octep_notify_queue(struct octep_hw *oct_hw, u16 qid) +{ + iowrite16(qid, oct_hw->vqs[qid].notify_addr); +} + +void octep_read_dev_config(struct octep_hw *oct_hw, u64 offset, void *dst, int length) +{ + u8 old_gen, new_gen, *p; + int i; + + if (WARN_ON(offset + length > oct_hw->config_size)) + return; + + do { + old_gen = ioread8(&oct_hw->common_cfg->config_generation); + p = dst; + for (i = 0; i < length; i++) + *p++ = ioread8(oct_hw->dev_cfg + offset + i); + + new_gen = ioread8(&oct_hw->common_cfg->config_generation); + } while (old_gen != new_gen); +} + +int octep_set_vq_address(struct octep_hw *oct_hw, u16 qid, u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg; + + octep_write_queue_select(oct_hw, qid); + vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo, + &cfg->queue_desc_hi); + vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo, + &cfg->queue_avail_hi); + vp_iowrite64_twopart(device_area, &cfg->queue_used_lo, + &cfg->queue_used_hi); + + return 0; +} + +int octep_get_vq_state(struct octep_hw *oct_hw, u16 qid, struct vdpa_vq_state *state) +{ + return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_GET_VQ_STATE, qid, state, + sizeof(*state), 0); +} + +int octep_set_vq_state(struct octep_hw *oct_hw, u16 qid, const struct vdpa_vq_state *state) +{ + struct vdpa_vq_state q_state; + + memcpy(&q_state, state, sizeof(struct vdpa_vq_state)); + return octep_process_mbox(oct_hw, OCTEP_MBOX_MSG_SET_VQ_STATE, qid, &q_state, + sizeof(*state), 1); +} + +void octep_set_vq_num(struct octep_hw *oct_hw, u16 qid, u32 num) +{ + struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg; + + octep_write_queue_select(oct_hw, qid); + iowrite16(num, &cfg->queue_size); +} + +void octep_set_vq_ready(struct octep_hw *oct_hw, u16 qid, bool ready) +{ + struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg; + + octep_write_queue_select(oct_hw, qid); + iowrite16(ready, &cfg->queue_enable); +} + +bool octep_get_vq_ready(struct octep_hw *oct_hw, u16 qid) +{ + struct virtio_pci_common_cfg __iomem *cfg = oct_hw->common_cfg; + + octep_write_queue_select(oct_hw, qid); + return ioread16(&cfg->queue_enable); +} + +u16 octep_get_vq_size(struct octep_hw *oct_hw) +{ + octep_write_queue_select(oct_hw, 0); + return ioread16(&oct_hw->common_cfg->queue_size); +} + +static u32 octep_get_config_size(struct octep_hw *oct_hw) +{ + switch (oct_hw->dev_id) { + case VIRTIO_ID_NET: + return sizeof(struct virtio_net_config); + case VIRTIO_ID_CRYPTO: + return sizeof(struct virtio_crypto_config); + default: + return 0; + } +} + +static void __iomem *octep_get_cap_addr(struct octep_hw *oct_hw, struct virtio_pci_cap *cap) +{ + struct device *dev = &oct_hw->pdev->dev; + u32 length = le32_to_cpu(cap->length); + u32 offset = le32_to_cpu(cap->offset); + u8 bar = cap->bar; + u32 len; + + if (bar != OCTEP_HW_CAPS_BAR) { + dev_err(dev, "Invalid bar: %u\n", bar); + return NULL; + } + if (offset + length < offset) { + dev_err(dev, "offset(%u) + length(%u) overflows\n", + offset, length); + return NULL; + } + len = pci_resource_len(oct_hw->pdev, bar); + if (offset + length > len) { + dev_err(dev, "invalid cap: overflows bar space: %u > %u\n", + offset + length, len); + return NULL; + } + return oct_hw->base[bar] + offset; +} + +/* In Octeon DPU device, the virtio config space is completely + * emulated by the device's firmware. So, the standard pci config + * read apis can't be used for reading the virtio capability. + */ +static void octep_pci_caps_read(struct octep_hw *oct_hw, void *buf, size_t len, off_t offset) +{ + u8 __iomem *bar = oct_hw->base[OCTEP_HW_CAPS_BAR]; + u8 *p = buf; + size_t i; + + for (i = 0; i < len; i++) + *p++ = ioread8(bar + offset + i); +} + +static int octep_pci_signature_verify(struct octep_hw *oct_hw) +{ + u32 signature[2]; + + octep_pci_caps_read(oct_hw, &signature, sizeof(signature), 0); + + if (signature[0] != OCTEP_FW_READY_SIGNATURE0) + return -1; + + if (signature[1] != OCTEP_FW_READY_SIGNATURE1) + return -1; + + return 0; +} + +static void octep_vndr_data_process(struct octep_hw *oct_hw, + struct octep_pci_vndr_data *vndr_data) +{ + BUILD_BUG_ON(sizeof(struct octep_pci_vndr_data) % 4 != 0); + + switch (vndr_data->id) { + case OCTEP_PCI_VNDR_CFG_TYPE_VIRTIO_ID: + oct_hw->dev_id = (u8)vndr_data->data; + break; + default: + dev_err(&oct_hw->pdev->dev, "Invalid vendor data id %u\n", + vndr_data->id); + break; + } +} + +int octep_hw_caps_read(struct octep_hw *oct_hw, struct pci_dev *pdev) +{ + struct octep_pci_vndr_data vndr_data; + struct octep_mbox __iomem *mbox; + struct device *dev = &pdev->dev; + struct virtio_pci_cap cap; + u16 notify_off; + int i, ret; + u8 pos; + + oct_hw->pdev = pdev; + ret = octep_pci_signature_verify(oct_hw); + if (ret) { + dev_err(dev, "Octeon Virtio FW is not initialized\n"); + return -EIO; + } + + octep_pci_caps_read(oct_hw, &pos, 1, PCI_CAPABILITY_LIST); + + while (pos) { + octep_pci_caps_read(oct_hw, &cap, 2, pos); + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + dev_err(dev, "Found invalid capability vndr id: %d\n", cap.cap_vndr); + break; + } + + octep_pci_caps_read(oct_hw, &cap, sizeof(cap), pos); + + dev_info(dev, "[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + oct_hw->common_cfg = octep_get_cap_addr(oct_hw, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + octep_pci_caps_read(oct_hw, &oct_hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + + oct_hw->notify_base = octep_get_cap_addr(oct_hw, &cap); + oct_hw->notify_bar = cap.bar; + oct_hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + + le32_to_cpu(cap.offset); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + oct_hw->dev_cfg = octep_get_cap_addr(oct_hw, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + oct_hw->isr = octep_get_cap_addr(oct_hw, &cap); + break; + case VIRTIO_PCI_CAP_VENDOR_CFG: + octep_pci_caps_read(oct_hw, &vndr_data, sizeof(vndr_data), pos); + if (vndr_data.hdr.vendor_id != PCI_VENDOR_ID_CAVIUM) { + dev_err(dev, "Invalid vendor data\n"); + return -EINVAL; + } + + octep_vndr_data_process(oct_hw, &vndr_data); + break; + } + + pos = cap.cap_next; + } + if (!oct_hw->common_cfg || !oct_hw->notify_base || + !oct_hw->dev_cfg || !oct_hw->isr) { + dev_err(dev, "Incomplete PCI capabilities"); + return -EIO; + } + dev_info(dev, "common cfg mapped at: %p\n", oct_hw->common_cfg); + dev_info(dev, "device cfg mapped at: %p\n", oct_hw->dev_cfg); + dev_info(dev, "isr cfg mapped at: %p\n", oct_hw->isr); + dev_info(dev, "notify base: %p, notify off multiplier: %u\n", + oct_hw->notify_base, oct_hw->notify_off_multiplier); + + oct_hw->config_size = octep_get_config_size(oct_hw); + oct_hw->features = octep_hw_get_dev_features(oct_hw); + + ret = octep_verify_features(oct_hw->features); + if (ret) { + dev_err(&pdev->dev, "Couldn't read features from the device FW\n"); + return ret; + } + oct_hw->nr_vring = vp_ioread16(&oct_hw->common_cfg->num_queues); + + oct_hw->vqs = devm_kcalloc(&pdev->dev, oct_hw->nr_vring, sizeof(*oct_hw->vqs), GFP_KERNEL); + if (!oct_hw->vqs) + return -ENOMEM; + + dev_info(&pdev->dev, "Device features : %llx\n", oct_hw->features); + dev_info(&pdev->dev, "Maximum queues : %u\n", oct_hw->nr_vring); + + for (i = 0; i < oct_hw->nr_vring; i++) { + octep_write_queue_select(oct_hw, i); + notify_off = vp_ioread16(&oct_hw->common_cfg->queue_notify_off); + oct_hw->vqs[i].notify_addr = oct_hw->notify_base + + notify_off * oct_hw->notify_off_multiplier; + oct_hw->vqs[i].cb_notify_addr = (u32 __iomem *)oct_hw->vqs[i].notify_addr + 1; + oct_hw->vqs[i].notify_pa = oct_hw->notify_base_pa + + notify_off * oct_hw->notify_off_multiplier; + } + mbox = octep_get_mbox(oct_hw); + octep_mbox_init(mbox); + dev_info(dev, "mbox mapped at: %p\n", mbox); + + return 0; +} diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c new file mode 100644 index 000000000000..31a02e7fd7f2 --- /dev/null +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -0,0 +1,900 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Marvell. */ + +#include <linux/interrupt.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/module.h> +#include <linux/iommu.h> +#include "octep_vdpa.h" + +#define OCTEP_VDPA_DRIVER_NAME "octep_vdpa" + +struct octep_pf { + u8 __iomem *base[PCI_STD_NUM_BARS]; + struct pci_dev *pdev; + struct resource res; + u64 vf_base; + int enabled_vfs; + u32 vf_stride; + u16 vf_devid; +}; + +struct octep_vdpa { + struct vdpa_device vdpa; + struct octep_hw *oct_hw; + struct pci_dev *pdev; +}; + +struct octep_vdpa_mgmt_dev { + struct vdpa_mgmt_dev mdev; + struct octep_hw oct_hw; + struct pci_dev *pdev; + /* Work entry to handle device setup */ + struct work_struct setup_task; + /* Device status */ + atomic_t status; +}; + +static struct octep_hw *vdpa_to_octep_hw(struct vdpa_device *vdpa_dev) +{ + struct octep_vdpa *oct_vdpa; + + oct_vdpa = container_of(vdpa_dev, struct octep_vdpa, vdpa); + + return oct_vdpa->oct_hw; +} + +static irqreturn_t octep_vdpa_intr_handler(int irq, void *data) +{ + struct octep_hw *oct_hw = data; + int i; + + /* Each device has multiple interrupts (nb_irqs) shared among rings + * (nr_vring). Device interrupts are mapped to the rings in a + * round-robin fashion. + * + * For example, if nb_irqs = 8 and nr_vring = 64: + * 0 -> 0, 8, 16, 24, 32, 40, 48, 56; + * 1 -> 1, 9, 17, 25, 33, 41, 49, 57; + * ... + * 7 -> 7, 15, 23, 31, 39, 47, 55, 63; + */ + + for (i = irq - oct_hw->irqs[0]; i < oct_hw->nr_vring; i += oct_hw->nb_irqs) { + if (ioread8(oct_hw->vqs[i].cb_notify_addr)) { + /* Acknowledge the per ring notification to the device */ + iowrite8(0, oct_hw->vqs[i].cb_notify_addr); + + if (likely(oct_hw->vqs[i].cb.callback)) + oct_hw->vqs[i].cb.callback(oct_hw->vqs[i].cb.private); + break; + } + } + + /* Check for config interrupt. Config uses the first interrupt */ + if (unlikely(irq == oct_hw->irqs[0] && ioread8(oct_hw->isr))) { + iowrite8(0, oct_hw->isr); + + if (oct_hw->config_cb.callback) + oct_hw->config_cb.callback(oct_hw->config_cb.private); + } + + return IRQ_HANDLED; +} + +static void octep_free_irqs(struct octep_hw *oct_hw) +{ + struct pci_dev *pdev = oct_hw->pdev; + int irq; + + if (!oct_hw->irqs) + return; + + for (irq = 0; irq < oct_hw->nb_irqs; irq++) { + if (!oct_hw->irqs[irq]) + break; + + devm_free_irq(&pdev->dev, oct_hw->irqs[irq], oct_hw); + } + + pci_free_irq_vectors(pdev); + devm_kfree(&pdev->dev, oct_hw->irqs); + oct_hw->irqs = NULL; +} + +static int octep_request_irqs(struct octep_hw *oct_hw) +{ + struct pci_dev *pdev = oct_hw->pdev; + int ret, irq, idx; + + oct_hw->irqs = devm_kcalloc(&pdev->dev, oct_hw->nb_irqs, sizeof(int), GFP_KERNEL); + if (!oct_hw->irqs) + return -ENOMEM; + + ret = pci_alloc_irq_vectors(pdev, 1, oct_hw->nb_irqs, PCI_IRQ_MSIX); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to alloc msix vector"); + return ret; + } + + for (idx = 0; idx < oct_hw->nb_irqs; idx++) { + irq = pci_irq_vector(pdev, idx); + ret = devm_request_irq(&pdev->dev, irq, octep_vdpa_intr_handler, 0, + dev_name(&pdev->dev), oct_hw); + if (ret) { + dev_err(&pdev->dev, "Failed to register interrupt handler\n"); + goto free_irqs; + } + oct_hw->irqs[idx] = irq; + } + + return 0; + +free_irqs: + octep_free_irqs(oct_hw); + return ret; +} + +static u64 octep_vdpa_get_device_features(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return oct_hw->features; +} + +static int octep_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + int ret; + + pr_debug("Driver Features: %llx\n", features); + + ret = octep_verify_features(features); + if (ret) { + dev_warn(&oct_hw->pdev->dev, + "Must negotiate minimum features 0x%llx for this device", + BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | + BIT_ULL(VIRTIO_F_RING_PACKED)); + return ret; + } + octep_hw_set_drv_features(oct_hw, features); + + return 0; +} + +static u64 octep_vdpa_get_driver_features(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_hw_get_drv_features(oct_hw); +} + +static u8 octep_vdpa_get_status(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_hw_get_status(oct_hw); +} + +static void octep_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + u8 status_old; + + status_old = octep_hw_get_status(oct_hw); + + if (status_old == status) + return; + + if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && + !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) { + if (octep_request_irqs(oct_hw)) + status = status_old | VIRTIO_CONFIG_S_FAILED; + } + octep_hw_set_status(oct_hw, status); +} + +static int octep_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + u8 status = octep_hw_get_status(oct_hw); + u16 qid; + + if (status == 0) + return 0; + + for (qid = 0; qid < oct_hw->nr_vring; qid++) { + oct_hw->vqs[qid].cb.callback = NULL; + oct_hw->vqs[qid].cb.private = NULL; + oct_hw->config_cb.callback = NULL; + oct_hw->config_cb.private = NULL; + } + octep_hw_reset(oct_hw); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) + octep_free_irqs(oct_hw); + + return 0; +} + +static u16 octep_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_get_vq_size(oct_hw); +} + +static int octep_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_vq_state *state) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_get_vq_state(oct_hw, qid, state); +} + +static int octep_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + const struct vdpa_vq_state *state) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_set_vq_state(oct_hw, qid, state); +} + +static void octep_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, struct vdpa_callback *cb) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + oct_hw->vqs[qid].cb = *cb; +} + +static void octep_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + octep_set_vq_ready(oct_hw, qid, ready); +} + +static bool octep_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return octep_get_vq_ready(oct_hw, qid); +} + +static void octep_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + octep_set_vq_num(oct_hw, qid, num); +} + +static int octep_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + pr_debug("qid[%d]: desc_area: %llx\n", qid, desc_area); + pr_debug("qid[%d]: driver_area: %llx\n", qid, driver_area); + pr_debug("qid[%d]: device_area: %llx\n\n", qid, device_area); + + return octep_set_vq_address(oct_hw, qid, desc_area, driver_area, device_area); +} + +static void octep_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid) +{ + /* Not supported */ +} + +static void octep_vdpa_kick_vq_with_data(struct vdpa_device *vdpa_dev, u32 data) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + u16 idx = data & 0xFFFF; + + vp_iowrite32(data, oct_hw->vqs[idx].notify_addr); +} + +static u32 octep_vdpa_get_generation(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return vp_ioread8(&oct_hw->common_cfg->config_generation); +} + +static u32 octep_vdpa_get_device_id(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return oct_hw->dev_id; +} + +static u32 octep_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) +{ + return PCI_VENDOR_ID_CAVIUM; +} + +static u32 octep_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) +{ + return PAGE_SIZE; +} + +static size_t octep_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + return oct_hw->config_size; +} + +static void octep_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf, + unsigned int len) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + octep_read_dev_config(oct_hw, offset, buf, len); +} + +static void octep_vdpa_set_config(struct vdpa_device *vdpa_dev, unsigned int offset, + const void *buf, unsigned int len) +{ + /* Not supported */ +} + +static void octep_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + + oct_hw->config_cb.callback = cb->callback; + oct_hw->config_cb.private = cb->private; +} + +static struct vdpa_notification_area octep_get_vq_notification(struct vdpa_device *vdpa_dev, + u16 idx) +{ + struct octep_hw *oct_hw = vdpa_to_octep_hw(vdpa_dev); + struct vdpa_notification_area area; + + area.addr = oct_hw->vqs[idx].notify_pa; + area.size = PAGE_SIZE; + + return area; +} + +static struct vdpa_config_ops octep_vdpa_ops = { + .get_device_features = octep_vdpa_get_device_features, + .set_driver_features = octep_vdpa_set_driver_features, + .get_driver_features = octep_vdpa_get_driver_features, + .get_status = octep_vdpa_get_status, + .set_status = octep_vdpa_set_status, + .reset = octep_vdpa_reset, + .get_vq_num_max = octep_vdpa_get_vq_num_max, + .get_vq_state = octep_vdpa_get_vq_state, + .set_vq_state = octep_vdpa_set_vq_state, + .set_vq_cb = octep_vdpa_set_vq_cb, + .set_vq_ready = octep_vdpa_set_vq_ready, + .get_vq_ready = octep_vdpa_get_vq_ready, + .set_vq_num = octep_vdpa_set_vq_num, + .set_vq_address = octep_vdpa_set_vq_address, + .get_vq_irq = NULL, + .kick_vq = octep_vdpa_kick_vq, + .kick_vq_with_data = octep_vdpa_kick_vq_with_data, + .get_generation = octep_vdpa_get_generation, + .get_device_id = octep_vdpa_get_device_id, + .get_vendor_id = octep_vdpa_get_vendor_id, + .get_vq_align = octep_vdpa_get_vq_align, + .get_config_size = octep_vdpa_get_config_size, + .get_config = octep_vdpa_get_config, + .set_config = octep_vdpa_set_config, + .set_config_cb = octep_vdpa_set_config_cb, + .get_vq_notification = octep_get_vq_notification, +}; + +static int octep_iomap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar) +{ + int ret; + + ret = pci_request_region(pdev, bar, OCTEP_VDPA_DRIVER_NAME); + if (ret) { + dev_err(&pdev->dev, "Failed to request BAR:%u region\n", bar); + return ret; + } + + tbl[bar] = pci_iomap(pdev, bar, pci_resource_len(pdev, bar)); + if (!tbl[bar]) { + dev_err(&pdev->dev, "Failed to iomap BAR:%u\n", bar); + pci_release_region(pdev, bar); + ret = -ENOMEM; + } + + return ret; +} + +static void octep_iounmap_region(struct pci_dev *pdev, u8 __iomem **tbl, u8 bar) +{ + pci_iounmap(pdev, tbl[bar]); + pci_release_region(pdev, bar); +} + +static void octep_vdpa_pf_bar_shrink(struct octep_pf *octpf) +{ + struct pci_dev *pf_dev = octpf->pdev; + struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4; + struct pci_bus_region bus_region; + + octpf->res.start = res->start; + octpf->res.end = res->end; + octpf->vf_base = res->start; + + bus_region.start = res->start; + bus_region.end = res->start - 1; + + pcibios_bus_to_resource(pf_dev->bus, res, &bus_region); +} + +static void octep_vdpa_pf_bar_expand(struct octep_pf *octpf) +{ + struct pci_dev *pf_dev = octpf->pdev; + struct resource *res = pf_dev->resource + PCI_STD_RESOURCES + 4; + struct pci_bus_region bus_region; + + bus_region.start = octpf->res.start; + bus_region.end = octpf->res.end; + + pcibios_bus_to_resource(pf_dev->bus, res, &bus_region); +} + +static void octep_vdpa_remove_pf(struct pci_dev *pdev) +{ + struct octep_pf *octpf = pci_get_drvdata(pdev); + + pci_disable_sriov(pdev); + + if (octpf->base[OCTEP_HW_CAPS_BAR]) + octep_iounmap_region(pdev, octpf->base, OCTEP_HW_CAPS_BAR); + + if (octpf->base[OCTEP_HW_MBOX_BAR]) + octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR); + + octep_vdpa_pf_bar_expand(octpf); + + /* The pf version does not use managed PCI. */ + pci_disable_device(pdev); +} + +static void octep_vdpa_vf_bar_shrink(struct pci_dev *pdev) +{ + struct resource *vf_res = pdev->resource + PCI_STD_RESOURCES + 4; + + memset(vf_res, 0, sizeof(*vf_res)); +} + +static void octep_vdpa_remove_vf(struct pci_dev *pdev) +{ + struct octep_vdpa_mgmt_dev *mgmt_dev = pci_get_drvdata(pdev); + struct octep_hw *oct_hw; + int status; + + oct_hw = &mgmt_dev->oct_hw; + status = atomic_read(&mgmt_dev->status); + atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_UNINIT); + + cancel_work_sync(&mgmt_dev->setup_task); + if (status == OCTEP_VDPA_DEV_STATUS_READY) + vdpa_mgmtdev_unregister(&mgmt_dev->mdev); + + if (oct_hw->base[OCTEP_HW_CAPS_BAR]) + octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR); + + if (oct_hw->base[OCTEP_HW_MBOX_BAR]) + octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_MBOX_BAR); + + octep_vdpa_vf_bar_shrink(pdev); +} + +static void octep_vdpa_remove(struct pci_dev *pdev) +{ + if (pdev->is_virtfn) + octep_vdpa_remove_vf(pdev); + else + octep_vdpa_remove_pf(pdev); +} + +static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config) +{ + struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(mdev, struct octep_vdpa_mgmt_dev, mdev); + struct octep_hw *oct_hw = &mgmt_dev->oct_hw; + struct pci_dev *pdev = oct_hw->pdev; + struct vdpa_device *vdpa_dev; + struct octep_vdpa *oct_vdpa; + u64 device_features; + int ret; + + oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, + NULL, 1, 1, NULL, false); + if (IS_ERR(oct_vdpa)) { + dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device"); + return PTR_ERR(oct_vdpa); + } + + oct_vdpa->pdev = pdev; + oct_vdpa->vdpa.vmap.dma_dev = &pdev->dev; + oct_vdpa->vdpa.mdev = mdev; + oct_vdpa->oct_hw = oct_hw; + vdpa_dev = &oct_vdpa->vdpa; + + device_features = oct_hw->features; + if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + if (config->device_features & ~device_features) { + dev_err(&pdev->dev, "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n", + config->device_features, device_features); + ret = -EINVAL; + goto vdpa_dev_put; + } + device_features &= config->device_features; + } + + oct_hw->features = device_features; + dev_info(&pdev->dev, "Vdpa management device features : %llx\n", device_features); + + ret = octep_verify_features(device_features); + if (ret) { + dev_warn(mdev->device, + "Must provision minimum features 0x%llx for this device", + BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) | + BIT_ULL(VIRTIO_F_NOTIFICATION_DATA) | BIT_ULL(VIRTIO_F_RING_PACKED)); + goto vdpa_dev_put; + } + if (name) + ret = dev_set_name(&vdpa_dev->dev, "%s", name); + else + ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index); + + ret = _vdpa_register_device(&oct_vdpa->vdpa, oct_hw->nr_vring); + if (ret) { + dev_err(&pdev->dev, "Failed to register to vDPA bus"); + goto vdpa_dev_put; + } + return 0; + +vdpa_dev_put: + put_device(&oct_vdpa->vdpa.dev); + return ret; +} + +static void octep_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *vdpa_dev) +{ + _vdpa_unregister_device(vdpa_dev); +} + +static const struct vdpa_mgmtdev_ops octep_vdpa_mgmt_dev_ops = { + .dev_add = octep_vdpa_dev_add, + .dev_del = octep_vdpa_dev_del +}; + +static bool get_device_ready_status(u8 __iomem *addr) +{ + u64 signature = readq(addr + OCTEP_VF_MBOX_DATA(0)); + + if (signature == OCTEP_DEV_READY_SIGNATURE) { + writeq(0, addr + OCTEP_VF_MBOX_DATA(0)); + return true; + } + + return false; +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static void octep_vdpa_setup_task(struct work_struct *work) +{ + struct octep_vdpa_mgmt_dev *mgmt_dev = container_of(work, struct octep_vdpa_mgmt_dev, + setup_task); + struct pci_dev *pdev = mgmt_dev->pdev; + struct device *dev = &pdev->dev; + struct octep_hw *oct_hw; + unsigned long timeout; + u64 val; + int ret; + + oct_hw = &mgmt_dev->oct_hw; + + atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_WAIT_FOR_BAR_INIT); + + /* Wait for a maximum of 5 sec */ + timeout = jiffies + msecs_to_jiffies(5000); + while (!time_after(jiffies, timeout)) { + if (get_device_ready_status(oct_hw->base[OCTEP_HW_MBOX_BAR])) { + atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_INIT); + break; + } + + if (atomic_read(&mgmt_dev->status) >= OCTEP_VDPA_DEV_STATUS_READY) { + dev_info(dev, "Stopping vDPA setup task.\n"); + return; + } + + usleep_range(1000, 1500); + } + + if (atomic_read(&mgmt_dev->status) != OCTEP_VDPA_DEV_STATUS_INIT) { + dev_err(dev, "BAR initialization is timed out\n"); + return; + } + + ret = octep_iomap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR); + if (ret) + return; + + val = readq(oct_hw->base[OCTEP_HW_MBOX_BAR] + OCTEP_VF_IN_CTRL(0)); + oct_hw->nb_irqs = OCTEP_VF_IN_CTRL_RPVF(val); + if (!oct_hw->nb_irqs || oct_hw->nb_irqs > OCTEP_MAX_CB_INTR) { + dev_err(dev, "Invalid number of interrupts %d\n", oct_hw->nb_irqs); + goto unmap_region; + } + + ret = octep_hw_caps_read(oct_hw, pdev); + if (ret < 0) + goto unmap_region; + + mgmt_dev->mdev.ops = &octep_vdpa_mgmt_dev_ops; + mgmt_dev->mdev.id_table = id_table; + mgmt_dev->mdev.max_supported_vqs = oct_hw->nr_vring; + mgmt_dev->mdev.supported_features = oct_hw->features; + mgmt_dev->mdev.config_attr_mask = (1 << VDPA_ATTR_DEV_FEATURES); + mgmt_dev->mdev.device = dev; + + ret = vdpa_mgmtdev_register(&mgmt_dev->mdev); + if (ret) { + dev_err(dev, "Failed to register vdpa management interface\n"); + goto unmap_region; + } + + atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_READY); + + return; + +unmap_region: + octep_iounmap_region(pdev, oct_hw->base, OCTEP_HW_CAPS_BAR); + oct_hw->base[OCTEP_HW_CAPS_BAR] = NULL; +} + +static int octep_vdpa_probe_vf(struct pci_dev *pdev) +{ + struct octep_vdpa_mgmt_dev *mgmt_dev; + struct device *dev = &pdev->dev; + int ret; + + ret = pcim_enable_device(pdev); + if (ret) { + dev_err(dev, "Failed to enable device\n"); + return ret; + } + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(dev, "No usable DMA configuration\n"); + return ret; + } + pci_set_master(pdev); + + mgmt_dev = devm_kzalloc(dev, sizeof(struct octep_vdpa_mgmt_dev), GFP_KERNEL); + if (!mgmt_dev) + return -ENOMEM; + + ret = octep_iomap_region(pdev, mgmt_dev->oct_hw.base, OCTEP_HW_MBOX_BAR); + if (ret) + return ret; + + mgmt_dev->pdev = pdev; + pci_set_drvdata(pdev, mgmt_dev); + + atomic_set(&mgmt_dev->status, OCTEP_VDPA_DEV_STATUS_ALLOC); + INIT_WORK(&mgmt_dev->setup_task, octep_vdpa_setup_task); + schedule_work(&mgmt_dev->setup_task); + dev_info(&pdev->dev, "octep vdpa mgmt device setup task is queued\n"); + + return 0; +} + +static void octep_vdpa_assign_barspace(struct pci_dev *vf_dev, struct pci_dev *pf_dev, u8 idx) +{ + struct resource *vf_res = vf_dev->resource + PCI_STD_RESOURCES + 4; + struct resource *pf_res = pf_dev->resource + PCI_STD_RESOURCES + 4; + struct octep_pf *pf = pci_get_drvdata(pf_dev); + struct pci_bus_region bus_region; + + vf_res->name = pci_name(vf_dev); + vf_res->flags = pf_res->flags; + vf_res->parent = (pf_dev->resource + PCI_STD_RESOURCES)->parent; + + bus_region.start = pf->vf_base + idx * pf->vf_stride; + bus_region.end = bus_region.start + pf->vf_stride - 1; + pcibios_bus_to_resource(vf_dev->bus, vf_res, &bus_region); +} + +static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ + struct octep_pf *pf = pci_get_drvdata(pdev); + u8 __iomem *addr = pf->base[OCTEP_HW_MBOX_BAR]; + struct pci_dev *vf_pdev = NULL; + bool done = false; + int index = 0; + int ret, i; + + ret = pci_enable_sriov(pdev, num_vfs); + if (ret) + return ret; + + pf->enabled_vfs = num_vfs; + + while ((vf_pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, vf_pdev))) { + if (vf_pdev->device != pf->vf_devid) + continue; + + octep_vdpa_assign_barspace(vf_pdev, pdev, index); + if (++index == num_vfs) { + done = true; + pci_dev_put(vf_pdev); + break; + } + } + + if (done) { + for (i = 0; i < pf->enabled_vfs; i++) + writeq(OCTEP_DEV_READY_SIGNATURE, addr + OCTEP_PF_MBOX_DATA(i)); + } + + return num_vfs; +} + +static int octep_sriov_disable(struct pci_dev *pdev) +{ + struct octep_pf *pf = pci_get_drvdata(pdev); + + if (!pci_num_vf(pdev)) + return 0; + + pci_disable_sriov(pdev); + pf->enabled_vfs = 0; + + return 0; +} + +static int octep_vdpa_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + if (num_vfs > 0) + return octep_sriov_enable(pdev, num_vfs); + else + return octep_sriov_disable(pdev); +} + +static u16 octep_get_vf_devid(struct pci_dev *pdev) +{ + u16 did; + + switch (pdev->device) { + case OCTEP_VDPA_DEVID_CN106K_PF: + did = OCTEP_VDPA_DEVID_CN106K_VF; + break; + case OCTEP_VDPA_DEVID_CN105K_PF: + did = OCTEP_VDPA_DEVID_CN105K_VF; + break; + case OCTEP_VDPA_DEVID_CN103K_PF: + did = OCTEP_VDPA_DEVID_CN103K_VF; + break; + default: + did = 0xFFFF; + break; + } + + return did; +} + +static int octep_vdpa_pf_setup(struct octep_pf *octpf) +{ + u8 __iomem *addr = octpf->base[OCTEP_HW_MBOX_BAR]; + struct pci_dev *pdev = octpf->pdev; + int totalvfs; + size_t len; + u64 val; + + totalvfs = pci_sriov_get_totalvfs(pdev); + if (unlikely(!totalvfs)) { + dev_info(&pdev->dev, "Total VFs are %d in PF sriov configuration\n", totalvfs); + return 0; + } + + addr = octpf->base[OCTEP_HW_MBOX_BAR]; + val = readq(addr + OCTEP_EPF_RINFO(0)); + if (val == 0) { + dev_err(&pdev->dev, "Invalid device configuration\n"); + return -EINVAL; + } + + len = pci_resource_len(pdev, OCTEP_HW_CAPS_BAR); + + octpf->vf_stride = len / totalvfs; + octpf->vf_devid = octep_get_vf_devid(pdev); + + octep_vdpa_pf_bar_shrink(octpf); + + return 0; +} + +static int octep_vdpa_probe_pf(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct octep_pf *octpf; + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + dev_err(dev, "Failed to enable device\n"); + return ret; + } + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(dev, "No usable DMA configuration\n"); + goto disable_pci; + } + octpf = devm_kzalloc(dev, sizeof(*octpf), GFP_KERNEL); + if (!octpf) { + ret = -ENOMEM; + goto disable_pci; + } + + ret = octep_iomap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR); + if (ret) + goto disable_pci; + + pci_set_master(pdev); + pci_set_drvdata(pdev, octpf); + octpf->pdev = pdev; + + ret = octep_vdpa_pf_setup(octpf); + if (ret) + goto unmap_region; + + return 0; + +unmap_region: + octep_iounmap_region(pdev, octpf->base, OCTEP_HW_MBOX_BAR); +disable_pci: + pci_disable_device(pdev); + return ret; +} + +static int octep_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + if (pdev->is_virtfn) + return octep_vdpa_probe_vf(pdev); + else + return octep_vdpa_probe_pf(pdev); +} + +static struct pci_device_id octep_pci_vdpa_map[] = { + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN106K_VF) }, + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN105K_VF) }, + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_PF) }, + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_VDPA_DEVID_CN103K_VF) }, + { 0 }, +}; + +static struct pci_driver octep_pci_vdpa = { + .name = OCTEP_VDPA_DRIVER_NAME, + .id_table = octep_pci_vdpa_map, + .probe = octep_vdpa_probe, + .remove = octep_vdpa_remove, + .sriov_configure = octep_vdpa_sriov_configure +}; + +module_pci_driver(octep_pci_vdpa); + +MODULE_AUTHOR("Marvell"); +MODULE_DESCRIPTION("Marvell Octeon PCIe endpoint vDPA driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile new file mode 100644 index 000000000000..c2d314d4614d --- /dev/null +++ b/drivers/vdpa/pds/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright(c) 2023 Advanced Micro Devices, Inc + +obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o + +pds_vdpa-y := aux_drv.o \ + cmds.o \ + debugfs.o \ + vdpa_dev.o diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c new file mode 100644 index 000000000000..f57330cf9024 --- /dev/null +++ b/drivers/vdpa/pds/aux_drv.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/auxiliary_bus.h> +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "aux_drv.h" +#include "debugfs.h" +#include "vdpa_dev.h" + +static const struct auxiliary_device_id pds_vdpa_id_table[] = { + { .name = PDS_VDPA_DEV_NAME, }, + {}, +}; + +static int pds_vdpa_device_id_check(struct pci_dev *pdev) +{ + if (pdev->device != PCI_DEVICE_ID_PENSANDO_VDPA_VF || + pdev->vendor != PCI_VENDOR_ID_PENSANDO) + return -ENODEV; + + return PCI_DEVICE_ID_PENSANDO_VDPA_VF; +} + +static int pds_vdpa_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) + +{ + struct pds_auxiliary_dev *padev = + container_of(aux_dev, struct pds_auxiliary_dev, aux_dev); + struct device *dev = &aux_dev->dev; + struct pds_vdpa_aux *vdpa_aux; + int err; + + vdpa_aux = kzalloc(sizeof(*vdpa_aux), GFP_KERNEL); + if (!vdpa_aux) + return -ENOMEM; + + vdpa_aux->padev = padev; + vdpa_aux->vf_id = pci_iov_vf_id(padev->vf_pdev); + auxiliary_set_drvdata(aux_dev, vdpa_aux); + + /* Get device ident info and set up the vdpa_mgmt_dev */ + err = pds_vdpa_get_mgmt_info(vdpa_aux); + if (err) + goto err_free_mem; + + /* Find the virtio configuration */ + vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev; + vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check; + vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN); + err = vp_modern_probe(&vdpa_aux->vd_mdev); + if (err) { + dev_err(dev, "Unable to probe for virtio configuration: %pe\n", + ERR_PTR(err)); + goto err_free_mgmt_info; + } + + /* Let vdpa know that we can provide devices */ + err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev); + if (err) { + dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: %pe\n", + __func__, ERR_PTR(err)); + goto err_free_virtio; + } + + pds_vdpa_debugfs_add_pcidev(vdpa_aux); + pds_vdpa_debugfs_add_ident(vdpa_aux); + + return 0; + +err_free_virtio: + vp_modern_remove(&vdpa_aux->vd_mdev); +err_free_mgmt_info: + pci_free_irq_vectors(padev->vf_pdev); +err_free_mem: + kfree(vdpa_aux); + auxiliary_set_drvdata(aux_dev, NULL); + + return err; +} + +static void pds_vdpa_remove(struct auxiliary_device *aux_dev) +{ + struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev); + struct device *dev = &aux_dev->dev; + + vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev); + pds_vdpa_release_irqs(vdpa_aux->pdsv); + vp_modern_remove(&vdpa_aux->vd_mdev); + + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); + kfree(vdpa_aux); + auxiliary_set_drvdata(aux_dev, NULL); + + dev_info(dev, "Removed\n"); +} + +static struct auxiliary_driver pds_vdpa_driver = { + .name = PDS_DEV_TYPE_VDPA_STR, + .probe = pds_vdpa_probe, + .remove = pds_vdpa_remove, + .id_table = pds_vdpa_id_table, +}; + +static void __exit pds_vdpa_cleanup(void) +{ + auxiliary_driver_unregister(&pds_vdpa_driver); + + pds_vdpa_debugfs_destroy(); +} +module_exit(pds_vdpa_cleanup); + +static int __init pds_vdpa_init(void) +{ + int err; + + pds_vdpa_debugfs_create(); + + err = auxiliary_driver_register(&pds_vdpa_driver); + if (err) { + pr_err("%s: aux driver register failed: %pe\n", + PDS_VDPA_DRV_NAME, ERR_PTR(err)); + pds_vdpa_debugfs_destroy(); + } + + return err; +} +module_init(pds_vdpa_init); + +MODULE_DESCRIPTION(PDS_VDPA_DRV_DESCRIPTION); +MODULE_AUTHOR("Advanced Micro Devices, Inc"); +MODULE_LICENSE("GPL"); diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h new file mode 100644 index 000000000000..26b75344156e --- /dev/null +++ b/drivers/vdpa/pds/aux_drv.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _AUX_DRV_H_ +#define _AUX_DRV_H_ + +#include <linux/virtio_pci_modern.h> + +#define PDS_VDPA_DRV_DESCRIPTION "AMD/Pensando vDPA VF Device Driver" +#define PDS_VDPA_DRV_NAME KBUILD_MODNAME + +struct pds_vdpa_aux { + struct pds_auxiliary_dev *padev; + + struct vdpa_mgmt_dev vdpa_mdev; + struct pds_vdpa_device *pdsv; + + struct pds_vdpa_ident ident; + + int vf_id; + struct dentry *dentry; + struct virtio_pci_modern_device vd_mdev; + + int nintrs; +}; +#endif /* _AUX_DRV_H_ */ diff --git a/drivers/vdpa/pds/cmds.c b/drivers/vdpa/pds/cmds.c new file mode 100644 index 000000000000..80863a41c3cd --- /dev/null +++ b/drivers/vdpa/pds/cmds.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "vdpa_dev.h" +#include "aux_drv.h" +#include "cmds.h" + +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_init.opcode = PDS_VDPA_CMD_INIT, + .vdpa_init.vdpa_index = pdsv->vdpa_index, + .vdpa_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + int err; + + /* Initialize the vdpa/virtio device */ + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_init), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to init hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa.opcode = PDS_VDPA_CMD_RESET, + .vdpa.vdpa_index = pdsv->vdpa_index, + .vdpa.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa), &comp, 0); + if (err) + dev_dbg(dev, "Failed to reset hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_status.opcode = PDS_VDPA_CMD_STATUS_UPDATE, + .vdpa_status.vdpa_index = pdsv->vdpa_index, + .vdpa_status.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_status.status = status, + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_status), &comp, 0); + if (err) + dev_dbg(dev, "Failed to set status to %#x, error status %d: %pe\n", + status, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAC, + }; + union pds_core_adminq_comp comp = {}; + int err; + + ether_addr_copy(cmd.vdpa_setattr.mac, mac); + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to set mac address %pM, status %d: %pe\n", + mac, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAX_VQ_PAIRS, + .vdpa_setattr.max_vq_pairs = cpu_to_le16(max_vqp), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to set max vq pairs %u, status %d: %pe\n", + max_vqp, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_vq_init.opcode = PDS_VDPA_CMD_VQ_INIT, + .vdpa_vq_init.vdpa_index = pdsv->vdpa_index, + .vdpa_vq_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_vq_init.qid = cpu_to_le16(qid), + .vdpa_vq_init.len = cpu_to_le16(ilog2(vq_info->q_len)), + .vdpa_vq_init.desc_addr = cpu_to_le64(vq_info->desc_addr), + .vdpa_vq_init.avail_addr = cpu_to_le64(vq_info->avail_addr), + .vdpa_vq_init.used_addr = cpu_to_le64(vq_info->used_addr), + .vdpa_vq_init.intr_index = cpu_to_le16(qid), + .vdpa_vq_init.avail_index = cpu_to_le16(vq_info->avail_idx ^ invert_idx), + .vdpa_vq_init.used_index = cpu_to_le16(vq_info->used_idx ^ invert_idx), + }; + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "%s: qid %d len %d desc_addr %#llx avail_addr %#llx used_addr %#llx\n", + __func__, qid, ilog2(vq_info->q_len), + vq_info->desc_addr, vq_info->avail_addr, vq_info->used_addr); + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_init), + &comp, 0); + if (err) + dev_dbg(dev, "Failed to init vq %d, status %d: %pe\n", + qid, comp.status, ERR_PTR(err)); + + return err; +} + +int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info) +{ + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + union pds_core_adminq_cmd cmd = { + .vdpa_vq_reset.opcode = PDS_VDPA_CMD_VQ_RESET, + .vdpa_vq_reset.vdpa_index = pdsv->vdpa_index, + .vdpa_vq_reset.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), + .vdpa_vq_reset.qid = cpu_to_le16(qid), + }; + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_vq_reset), + &comp, 0); + if (err) { + dev_dbg(dev, "Failed to reset vq %d, status %d: %pe\n", + qid, comp.status, ERR_PTR(err)); + return err; + } + + vq_info->avail_idx = le16_to_cpu(comp.vdpa_vq_reset.avail_index) ^ invert_idx; + vq_info->used_idx = le16_to_cpu(comp.vdpa_vq_reset.used_index) ^ invert_idx; + + return 0; +} diff --git a/drivers/vdpa/pds/cmds.h b/drivers/vdpa/pds/cmds.h new file mode 100644 index 000000000000..6b1bc33356b0 --- /dev/null +++ b/drivers/vdpa/pds/cmds.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _VDPA_CMDS_H_ +#define _VDPA_CMDS_H_ + +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv); + +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv); +int pds_vdpa_cmd_set_status(struct pds_vdpa_device *pdsv, u8 status); +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac); +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp); +int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info); +int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, + struct pds_vdpa_vq_info *vq_info); +#endif /* _VDPA_CMDS_H_ */ diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c new file mode 100644 index 000000000000..c328e694f6e7 --- /dev/null +++ b/drivers/vdpa/pds/debugfs.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/pci.h> +#include <linux/vdpa.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "aux_drv.h" +#include "vdpa_dev.h" +#include "debugfs.h" + +static struct dentry *dbfs_dir; + +void pds_vdpa_debugfs_create(void) +{ + dbfs_dir = debugfs_create_dir(PDS_VDPA_DRV_NAME, NULL); +} + +void pds_vdpa_debugfs_destroy(void) +{ + debugfs_remove_recursive(dbfs_dir); + dbfs_dir = NULL; +} + +#define PRINT_SBIT_NAME(__seq, __f, __name) \ + do { \ + if ((__f) & (__name)) \ + seq_printf(__seq, " %s", &#__name[16]); \ + } while (0) + +static void print_status_bits(struct seq_file *seq, u8 status) +{ + seq_puts(seq, "status:"); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET); + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED); + seq_puts(seq, "\n"); +} + +static void print_feature_bits_all(struct seq_file *seq, u64 features) +{ + int i; + + seq_puts(seq, "features:"); + + for (i = 0; i < (sizeof(u64) * 8); i++) { + u64 mask = BIT_ULL(i); + + switch (features & mask) { + case BIT_ULL(VIRTIO_NET_F_CSUM): + seq_puts(seq, " VIRTIO_NET_F_CSUM"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM): + seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS): + seq_puts(seq, " VIRTIO_NET_F_CTRL_GUEST_OFFLOADS"); + break; + case BIT_ULL(VIRTIO_NET_F_MTU): + seq_puts(seq, " VIRTIO_NET_F_MTU"); + break; + case BIT_ULL(VIRTIO_NET_F_MAC): + seq_puts(seq, " VIRTIO_NET_F_MAC"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_TSO4): + seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO4"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_TSO6): + seq_puts(seq, " VIRTIO_NET_F_GUEST_TSO6"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_ECN): + seq_puts(seq, " VIRTIO_NET_F_GUEST_ECN"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_UFO): + seq_puts(seq, " VIRTIO_NET_F_GUEST_UFO"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_TSO4): + seq_puts(seq, " VIRTIO_NET_F_HOST_TSO4"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_TSO6): + seq_puts(seq, " VIRTIO_NET_F_HOST_TSO6"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_ECN): + seq_puts(seq, " VIRTIO_NET_F_HOST_ECN"); + break; + case BIT_ULL(VIRTIO_NET_F_HOST_UFO): + seq_puts(seq, " VIRTIO_NET_F_HOST_UFO"); + break; + case BIT_ULL(VIRTIO_NET_F_MRG_RXBUF): + seq_puts(seq, " VIRTIO_NET_F_MRG_RXBUF"); + break; + case BIT_ULL(VIRTIO_NET_F_STATUS): + seq_puts(seq, " VIRTIO_NET_F_STATUS"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_VQ): + seq_puts(seq, " VIRTIO_NET_F_CTRL_VQ"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_RX): + seq_puts(seq, " VIRTIO_NET_F_CTRL_RX"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_VLAN): + seq_puts(seq, " VIRTIO_NET_F_CTRL_VLAN"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA): + seq_puts(seq, " VIRTIO_NET_F_CTRL_RX_EXTRA"); + break; + case BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE): + seq_puts(seq, " VIRTIO_NET_F_GUEST_ANNOUNCE"); + break; + case BIT_ULL(VIRTIO_NET_F_MQ): + seq_puts(seq, " VIRTIO_NET_F_MQ"); + break; + case BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR): + seq_puts(seq, " VIRTIO_NET_F_CTRL_MAC_ADDR"); + break; + case BIT_ULL(VIRTIO_NET_F_HASH_REPORT): + seq_puts(seq, " VIRTIO_NET_F_HASH_REPORT"); + break; + case BIT_ULL(VIRTIO_NET_F_RSS): + seq_puts(seq, " VIRTIO_NET_F_RSS"); + break; + case BIT_ULL(VIRTIO_NET_F_RSC_EXT): + seq_puts(seq, " VIRTIO_NET_F_RSC_EXT"); + break; + case BIT_ULL(VIRTIO_NET_F_STANDBY): + seq_puts(seq, " VIRTIO_NET_F_STANDBY"); + break; + case BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX): + seq_puts(seq, " VIRTIO_NET_F_SPEED_DUPLEX"); + break; + case BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY): + seq_puts(seq, " VIRTIO_F_NOTIFY_ON_EMPTY"); + break; + case BIT_ULL(VIRTIO_F_ANY_LAYOUT): + seq_puts(seq, " VIRTIO_F_ANY_LAYOUT"); + break; + case BIT_ULL(VIRTIO_F_VERSION_1): + seq_puts(seq, " VIRTIO_F_VERSION_1"); + break; + case BIT_ULL(VIRTIO_F_ACCESS_PLATFORM): + seq_puts(seq, " VIRTIO_F_ACCESS_PLATFORM"); + break; + case BIT_ULL(VIRTIO_F_RING_PACKED): + seq_puts(seq, " VIRTIO_F_RING_PACKED"); + break; + case BIT_ULL(VIRTIO_F_ORDER_PLATFORM): + seq_puts(seq, " VIRTIO_F_ORDER_PLATFORM"); + break; + case BIT_ULL(VIRTIO_F_SR_IOV): + seq_puts(seq, " VIRTIO_F_SR_IOV"); + break; + case 0: + break; + default: + seq_printf(seq, " bit_%d", i); + break; + } + } + + seq_puts(seq, "\n"); +} + +void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux) +{ + vdpa_aux->dentry = debugfs_create_dir(pci_name(vdpa_aux->padev->vf_pdev), dbfs_dir); +} + +static int identity_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_aux *vdpa_aux = seq->private; + struct vdpa_mgmt_dev *mgmt; + u64 hw_features; + + seq_printf(seq, "aux_dev: %s\n", + dev_name(&vdpa_aux->padev->aux_dev.dev)); + + mgmt = &vdpa_aux->vdpa_mdev; + seq_printf(seq, "max_vqs: %d\n", mgmt->max_supported_vqs); + seq_printf(seq, "config_attr_mask: %#llx\n", mgmt->config_attr_mask); + hw_features = le64_to_cpu(vdpa_aux->ident.hw_features); + seq_printf(seq, "hw_features: %#llx\n", hw_features); + print_feature_bits_all(seq, hw_features); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(identity); + +void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux) +{ + debugfs_create_file("identity", 0400, vdpa_aux->dentry, + vdpa_aux, &identity_fops); +} + +static int config_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_device *pdsv = seq->private; + struct virtio_net_config vc; + u8 status; + + memcpy_fromio(&vc, pdsv->vdpa_aux->vd_mdev.device, + sizeof(struct virtio_net_config)); + + seq_printf(seq, "mac: %pM\n", vc.mac); + seq_printf(seq, "max_virtqueue_pairs: %d\n", + __virtio16_to_cpu(true, vc.max_virtqueue_pairs)); + seq_printf(seq, "mtu: %d\n", __virtio16_to_cpu(true, vc.mtu)); + seq_printf(seq, "speed: %d\n", le32_to_cpu(vc.speed)); + seq_printf(seq, "duplex: %d\n", vc.duplex); + seq_printf(seq, "rss_max_key_size: %d\n", vc.rss_max_key_size); + seq_printf(seq, "rss_max_indirection_table_length: %d\n", + le16_to_cpu(vc.rss_max_indirection_table_length)); + seq_printf(seq, "supported_hash_types: %#x\n", + le32_to_cpu(vc.supported_hash_types)); + seq_printf(seq, "vn_status: %#x\n", + __virtio16_to_cpu(true, vc.status)); + + status = vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); + seq_printf(seq, "dev_status: %#x\n", status); + print_status_bits(seq, status); + seq_printf(seq, "negotiated_features: %#llx\n", pdsv->negotiated_features); + print_feature_bits_all(seq, pdsv->negotiated_features); + seq_printf(seq, "vdpa_index: %d\n", pdsv->vdpa_index); + seq_printf(seq, "num_vqs: %d\n", pdsv->num_vqs); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(config); + +static int vq_show(struct seq_file *seq, void *v) +{ + struct pds_vdpa_vq_info *vq = seq->private; + + seq_printf(seq, "ready: %d\n", vq->ready); + seq_printf(seq, "desc_addr: %#llx\n", vq->desc_addr); + seq_printf(seq, "avail_addr: %#llx\n", vq->avail_addr); + seq_printf(seq, "used_addr: %#llx\n", vq->used_addr); + seq_printf(seq, "q_len: %d\n", vq->q_len); + seq_printf(seq, "qid: %d\n", vq->qid); + + seq_printf(seq, "doorbell: %#llx\n", vq->doorbell); + seq_printf(seq, "avail_idx: %d\n", vq->avail_idx); + seq_printf(seq, "used_idx: %d\n", vq->used_idx); + seq_printf(seq, "irq: %d\n", vq->irq); + seq_printf(seq, "irq-name: %s\n", vq->irq_name); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(vq); + +void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + int i; + + debugfs_create_file("config", 0400, vdpa_aux->dentry, vdpa_aux->pdsv, &config_fops); + + for (i = 0; i < vdpa_aux->pdsv->num_vqs; i++) { + char name[16]; + + snprintf(name, sizeof(name), "vq%02d", i); + debugfs_create_file(name, 0400, vdpa_aux->dentry, + &vdpa_aux->pdsv->vqs[i], &vq_fops); + } +} + +void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + debugfs_remove_recursive(vdpa_aux->dentry); + vdpa_aux->dentry = NULL; +} + +void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux) +{ + /* we don't keep track of the entries, so remove it all + * then rebuild the basics + */ + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); + pds_vdpa_debugfs_add_pcidev(vdpa_aux); + pds_vdpa_debugfs_add_ident(vdpa_aux); +} diff --git a/drivers/vdpa/pds/debugfs.h b/drivers/vdpa/pds/debugfs.h new file mode 100644 index 000000000000..c088a4e8f1e9 --- /dev/null +++ b/drivers/vdpa/pds/debugfs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _PDS_VDPA_DEBUGFS_H_ +#define _PDS_VDPA_DEBUGFS_H_ + +#include <linux/debugfs.h> + +void pds_vdpa_debugfs_create(void); +void pds_vdpa_debugfs_destroy(void); +void pds_vdpa_debugfs_add_pcidev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_add_ident(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_del_vdpadev(struct pds_vdpa_aux *vdpa_aux); +void pds_vdpa_debugfs_reset_vdpadev(struct pds_vdpa_aux *vdpa_aux); + +#endif /* _PDS_VDPA_DEBUGFS_H_ */ diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c new file mode 100644 index 000000000000..43426bd971ac --- /dev/null +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -0,0 +1,859 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#include <linux/pci.h> +#include <linux/vdpa.h> +#include <uapi/linux/vdpa.h> +#include <linux/virtio_pci_modern.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> +#include <linux/pds/pds_auxbus.h> + +#include "vdpa_dev.h" +#include "aux_drv.h" +#include "cmds.h" +#include "debugfs.h" + +static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev); + +static struct pds_vdpa_device *vdpa_to_pdsv(struct vdpa_device *vdpa_dev) +{ + return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev); +} + +static int pds_vdpa_notify_handler(struct notifier_block *nb, + unsigned long ecode, + void *data) +{ + struct pds_vdpa_device *pdsv = container_of(nb, struct pds_vdpa_device, nb); + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); + + if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) { + if (pdsv->config_cb.callback) + pdsv->config_cb.callback(pdsv->config_cb.private); + } + + return 0; +} + +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv) +{ + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + struct notifier_block *nb = &pdsv->nb; + int err; + + if (!nb->notifier_call) { + nb->notifier_call = pds_vdpa_notify_handler; + err = pdsc_register_notify(nb); + if (err) { + nb->notifier_call = NULL; + dev_err(dev, "failed to register pds event handler: %pe\n", + ERR_PTR(err)); + return -EINVAL; + } + dev_dbg(dev, "pds event handler registered\n"); + } + + return 0; +} + +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv) +{ + if (pdsv->nb.notifier_call) { + pdsc_unregister_notify(&pdsv->nb); + pdsv->nb.notifier_call = NULL; + } +} + +static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, + u64 desc_addr, u64 driver_addr, u64 device_addr) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].desc_addr = desc_addr; + pdsv->vqs[qid].avail_addr = driver_addr; + pdsv->vqs[qid].used_addr = device_addr; + + return 0; +} + +static void pds_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid, u32 num) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].q_len = num; +} + +static void pds_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + iowrite16(qid, pdsv->vqs[qid].notify); +} + +static void pds_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_callback *cb) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->vqs[qid].event_cb = *cb; +} + +static irqreturn_t pds_vdpa_isr(int irq, void *data) +{ + struct pds_vdpa_vq_info *vq; + + vq = data; + if (vq->event_cb.callback) + vq->event_cb.callback(vq->event_cb.private); + + return IRQ_HANDLED; +} + +static void pds_vdpa_release_irq(struct pds_vdpa_device *pdsv, int qid) +{ + if (pdsv->vqs[qid].irq == VIRTIO_MSI_NO_VECTOR) + return; + + free_irq(pdsv->vqs[qid].irq, &pdsv->vqs[qid]); + pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR; +} + +static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u64 driver_features; + u16 invert_idx = 0; + int err; + + dev_dbg(dev, "%s: qid %d ready %d => %d\n", + __func__, qid, pdsv->vqs[qid].ready, ready); + if (ready == pdsv->vqs[qid].ready) + return; + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) + invert_idx = PDS_VDPA_PACKED_INVERT_IDX; + + if (ready) { + /* Pass vq setup info to DSC using adminq to gather up and + * send all info at once so FW can do its full set up in + * one easy operation + */ + err = pds_vdpa_cmd_init_vq(pdsv, qid, invert_idx, &pdsv->vqs[qid]); + if (err) { + dev_err(dev, "Failed to init vq %d: %pe\n", + qid, ERR_PTR(err)); + ready = false; + } + } else { + err = pds_vdpa_cmd_reset_vq(pdsv, qid, invert_idx, &pdsv->vqs[qid]); + if (err) + dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", + __func__, qid, ERR_PTR(err)); + } + + pdsv->vqs[qid].ready = ready; +} + +static bool pds_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->vqs[qid].ready; +} + +static int pds_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + const struct vdpa_vq_state *state) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + u64 driver_features; + u16 avail; + u16 used; + + if (pdsv->vqs[qid].ready) { + dev_err(dev, "Setting device position is denied while vq is enabled\n"); + return -EINVAL; + } + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + avail = state->packed.last_avail_idx | + (state->packed.last_avail_counter << 15); + used = state->packed.last_used_idx | + (state->packed.last_used_counter << 15); + + /* The avail and used index are stored with the packed wrap + * counter bit inverted. This way, in case set_vq_state is + * not called, the initial value can be set to zero prior to + * feature negotiation, and it is good for both packed and + * split vq. + */ + avail ^= PDS_VDPA_PACKED_INVERT_IDX; + used ^= PDS_VDPA_PACKED_INVERT_IDX; + } else { + avail = state->split.avail_index; + /* state->split does not provide a used_index: + * the vq will be set to "empty" here, and the vq will read + * the current used index the next time the vq is kicked. + */ + used = avail; + } + + if (used != avail) { + dev_dbg(dev, "Setting used equal to avail, for interoperability\n"); + used = avail; + } + + pdsv->vqs[qid].avail_idx = avail; + pdsv->vqs[qid].used_idx = used; + + return 0; +} + +static int pds_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, + struct vdpa_vq_state *state) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; + struct device *dev = &padev->aux_dev.dev; + u64 driver_features; + u16 avail; + u16 used; + + if (pdsv->vqs[qid].ready) { + dev_err(dev, "Getting device position is denied while vq is enabled\n"); + return -EINVAL; + } + + avail = pdsv->vqs[qid].avail_idx; + used = pdsv->vqs[qid].used_idx; + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + if (driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + avail ^= PDS_VDPA_PACKED_INVERT_IDX; + used ^= PDS_VDPA_PACKED_INVERT_IDX; + + state->packed.last_avail_idx = avail & 0x7fff; + state->packed.last_avail_counter = avail >> 15; + state->packed.last_used_idx = used & 0x7fff; + state->packed.last_used_counter = used >> 15; + } else { + state->split.avail_index = avail; + /* state->split does not provide a used_index. */ + } + + return 0; +} + +static struct vdpa_notification_area +pds_vdpa_get_vq_notification(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct virtio_pci_modern_device *vd_mdev; + struct vdpa_notification_area area; + + area.addr = pdsv->vqs[qid].notify_pa; + + vd_mdev = &pdsv->vdpa_aux->vd_mdev; + if (!vd_mdev->notify_offset_multiplier) + area.size = PDS_PAGE_SIZE; + else + area.size = vd_mdev->notify_offset_multiplier; + + return area; +} + +static int pds_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, u16 qid) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->vqs[qid].irq; +} + +static u32 pds_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) +{ + return PDS_PAGE_SIZE; +} + +static u32 pds_vdpa_get_vq_group(struct vdpa_device *vdpa_dev, u16 idx) +{ + return 0; +} + +static u64 pds_vdpa_get_device_features(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->supported_features; +} + +static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 features) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u64 driver_features; + u64 nego_features; + u64 hw_features; + u64 missing; + + if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { + dev_err(dev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); + return -EOPNOTSUPP; + } + + /* Check for valid feature bits */ + nego_features = features & pdsv->supported_features; + missing = features & ~nego_features; + if (missing) { + dev_err(dev, "Can't support all requested features in %#llx, missing %#llx features\n", + features, missing); + return -EOPNOTSUPP; + } + + driver_features = pds_vdpa_get_driver_features(vdpa_dev); + pdsv->negotiated_features = nego_features; + dev_dbg(dev, "%s: %#llx => %#llx\n", + __func__, driver_features, nego_features); + + /* if we're faking the F_MAC, strip it before writing to device */ + hw_features = le64_to_cpu(pdsv->vdpa_aux->ident.hw_features); + if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC))) + nego_features &= ~BIT_ULL(VIRTIO_NET_F_MAC); + + if (driver_features == nego_features) + return 0; + + vp_modern_set_features(&pdsv->vdpa_aux->vd_mdev, nego_features); + + return 0; +} + +static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return pdsv->negotiated_features; +} + +static void pds_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, + struct vdpa_callback *cb) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + pdsv->config_cb.callback = cb->callback; + pdsv->config_cb.private = cb->private; +} + +static u16 pds_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + /* qemu has assert() that vq_num_max <= VIRTQUEUE_MAX_SIZE (1024) */ + return min_t(u16, 1024, BIT(le16_to_cpu(pdsv->vdpa_aux->ident.max_qlen))); +} + +static u32 pds_vdpa_get_device_id(struct vdpa_device *vdpa_dev) +{ + return VIRTIO_ID_NET; +} + +static u32 pds_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) +{ + return PCI_VENDOR_ID_PENSANDO; +} + +static u8 pds_vdpa_get_status(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + + return vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); +} + +static int pds_vdpa_request_irqs(struct pds_vdpa_device *pdsv) +{ + struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; + struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux; + struct device *dev = &pdsv->vdpa_dev.dev; + int max_vq, nintrs, qid, err; + + max_vq = vdpa_aux->vdpa_mdev.max_supported_vqs; + + nintrs = pci_alloc_irq_vectors(pdev, max_vq, max_vq, PCI_IRQ_MSIX); + if (nintrs < 0) { + dev_err(dev, "Couldn't get %d msix vectors: %pe\n", + max_vq, ERR_PTR(nintrs)); + return nintrs; + } + + for (qid = 0; qid < pdsv->num_vqs; ++qid) { + int irq = pci_irq_vector(pdev, qid); + + snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name), + "vdpa-%s-%d", dev_name(dev), qid); + + err = request_irq(irq, pds_vdpa_isr, 0, + pdsv->vqs[qid].irq_name, + &pdsv->vqs[qid]); + if (err) { + dev_err(dev, "%s: no irq for qid %d: %pe\n", + __func__, qid, ERR_PTR(err)); + goto err_release; + } + + pdsv->vqs[qid].irq = irq; + } + + vdpa_aux->nintrs = nintrs; + + return 0; + +err_release: + while (qid--) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; + + return err; +} + +void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv) +{ + struct pds_vdpa_aux *vdpa_aux; + struct pci_dev *pdev; + int qid; + + if (!pdsv) + return; + + pdev = pdsv->vdpa_aux->padev->vf_pdev; + vdpa_aux = pdsv->vdpa_aux; + + if (!vdpa_aux->nintrs) + return; + + for (qid = 0; qid < pdsv->num_vqs; qid++) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; +} + +static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev = &pdsv->vdpa_dev.dev; + u8 old_status; + int i; + + old_status = pds_vdpa_get_status(vdpa_dev); + dev_dbg(dev, "%s: old %#x new %#x\n", __func__, old_status, status); + + if (status & ~old_status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (pds_vdpa_request_irqs(pdsv)) + status = old_status | VIRTIO_CONFIG_S_FAILED; + } + + pds_vdpa_cmd_set_status(pdsv, status); + + if (status == 0) { + struct vdpa_callback null_cb = { }; + + pds_vdpa_set_config_cb(vdpa_dev, &null_cb); + pds_vdpa_cmd_reset(pdsv); + + for (i = 0; i < pdsv->num_vqs; i++) { + pdsv->vqs[i].avail_idx = 0; + pdsv->vqs[i].used_idx = 0; + } + + pds_vdpa_cmd_set_mac(pdsv, pdsv->mac); + } + + if (status & ~old_status & VIRTIO_CONFIG_S_FEATURES_OK) { + for (i = 0; i < pdsv->num_vqs; i++) { + pdsv->vqs[i].notify = + vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev, + i, &pdsv->vqs[i].notify_pa); + } + } + + if (old_status & ~status & VIRTIO_CONFIG_S_DRIVER_OK) + pds_vdpa_release_irqs(pdsv); +} + +static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid, + void __iomem *notify) +{ + memset(&pdsv->vqs[qid], 0, sizeof(pdsv->vqs[0])); + pdsv->vqs[qid].qid = qid; + pdsv->vqs[qid].pdsv = pdsv; + pdsv->vqs[qid].ready = false; + pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR; + pdsv->vqs[qid].notify = notify; +} + +static int pds_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct device *dev; + int err = 0; + u8 status; + int i; + + dev = &pdsv->vdpa_aux->padev->aux_dev.dev; + status = pds_vdpa_get_status(vdpa_dev); + + if (status == 0) + return 0; + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + /* Reset the vqs */ + for (i = 0; i < pdsv->num_vqs && !err; i++) { + err = pds_vdpa_cmd_reset_vq(pdsv, i, 0, &pdsv->vqs[i]); + if (err) + dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", + __func__, i, ERR_PTR(err)); + } + } + + pds_vdpa_set_status(vdpa_dev, 0); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + /* Reset the vq info */ + for (i = 0; i < pdsv->num_vqs && !err; i++) + pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify); + } + + return 0; +} + +static size_t pds_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + return sizeof(struct virtio_net_config); +} + +static void pds_vdpa_get_config(struct vdpa_device *vdpa_dev, + unsigned int offset, + void *buf, unsigned int len) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + void __iomem *device; + + if (offset + len > sizeof(struct virtio_net_config)) { + WARN(true, "%s: bad read, offset %d len %d\n", __func__, offset, len); + return; + } + + device = pdsv->vdpa_aux->vd_mdev.device; + memcpy_fromio(buf, device + offset, len); +} + +static void pds_vdpa_set_config(struct vdpa_device *vdpa_dev, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + void __iomem *device; + + if (offset + len > sizeof(struct virtio_net_config)) { + WARN(true, "%s: bad read, offset %d len %d\n", __func__, offset, len); + return; + } + + device = pdsv->vdpa_aux->vd_mdev.device; + memcpy_toio(device + offset, buf, len); +} + +static const struct vdpa_config_ops pds_vdpa_ops = { + .set_vq_address = pds_vdpa_set_vq_address, + .set_vq_num = pds_vdpa_set_vq_num, + .kick_vq = pds_vdpa_kick_vq, + .set_vq_cb = pds_vdpa_set_vq_cb, + .set_vq_ready = pds_vdpa_set_vq_ready, + .get_vq_ready = pds_vdpa_get_vq_ready, + .set_vq_state = pds_vdpa_set_vq_state, + .get_vq_state = pds_vdpa_get_vq_state, + .get_vq_notification = pds_vdpa_get_vq_notification, + .get_vq_irq = pds_vdpa_get_vq_irq, + .get_vq_align = pds_vdpa_get_vq_align, + .get_vq_group = pds_vdpa_get_vq_group, + + .get_device_features = pds_vdpa_get_device_features, + .set_driver_features = pds_vdpa_set_driver_features, + .get_driver_features = pds_vdpa_get_driver_features, + .set_config_cb = pds_vdpa_set_config_cb, + .get_vq_num_max = pds_vdpa_get_vq_num_max, + .get_device_id = pds_vdpa_get_device_id, + .get_vendor_id = pds_vdpa_get_vendor_id, + .get_status = pds_vdpa_get_status, + .set_status = pds_vdpa_set_status, + .reset = pds_vdpa_reset, + .get_config_size = pds_vdpa_get_config_size, + .get_config = pds_vdpa_get_config, + .set_config = pds_vdpa_set_config, +}; +static struct virtio_device_id pds_vdpa_id_table[] = { + {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *add_config) +{ + struct pds_vdpa_aux *vdpa_aux; + struct pds_vdpa_device *pdsv; + struct vdpa_mgmt_dev *mgmt; + u16 fw_max_vqs, vq_pairs; + struct device *dma_dev; + struct pci_dev *pdev; + struct device *dev; + u8 status; + int err; + int i; + + vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev); + dev = &vdpa_aux->padev->aux_dev.dev; + mgmt = &vdpa_aux->vdpa_mdev; + + if (vdpa_aux->pdsv) { + dev_warn(dev, "Multiple vDPA devices on a VF is not supported.\n"); + return -EOPNOTSUPP; + } + + pdsv = vdpa_alloc_device(struct pds_vdpa_device, vdpa_dev, + dev, &pds_vdpa_ops, NULL, + 1, 1, name, false); + if (IS_ERR(pdsv)) { + dev_err(dev, "Failed to allocate vDPA structure: %pe\n", pdsv); + return PTR_ERR(pdsv); + } + + vdpa_aux->pdsv = pdsv; + pdsv->vdpa_aux = vdpa_aux; + + pdev = vdpa_aux->padev->vf_pdev; + dma_dev = &pdev->dev; + pdsv->vdpa_dev.vmap.dma_dev = dma_dev; + + status = pds_vdpa_get_status(&pdsv->vdpa_dev); + if (status == 0xff) { + dev_err(dev, "Broken PCI - status %#x\n", status); + err = -ENXIO; + goto err_unmap; + } + + pdsv->supported_features = mgmt->supported_features; + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { + u64 unsupp_features = + add_config->device_features & ~pdsv->supported_features; + + if (unsupp_features) { + dev_err(dev, "Unsupported features: %#llx\n", unsupp_features); + err = -EOPNOTSUPP; + goto err_unmap; + } + + pdsv->supported_features = add_config->device_features; + } + + err = pds_vdpa_cmd_reset(pdsv); + if (err) { + dev_err(dev, "Failed to reset hw: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + err = pds_vdpa_init_hw(pdsv); + if (err) { + dev_err(dev, "Failed to init hw: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + fw_max_vqs = le16_to_cpu(pdsv->vdpa_aux->ident.max_vqs); + vq_pairs = fw_max_vqs / 2; + + /* Make sure we have the queues being requested */ + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) + vq_pairs = add_config->net.max_vq_pairs; + + pdsv->num_vqs = 2 * vq_pairs; + if (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) + pdsv->num_vqs++; + + if (pdsv->num_vqs > fw_max_vqs) { + dev_err(dev, "%s: queue count requested %u greater than max %u\n", + __func__, pdsv->num_vqs, fw_max_vqs); + err = -ENOSPC; + goto err_unmap; + } + + if (pdsv->num_vqs != fw_max_vqs) { + err = pds_vdpa_cmd_set_max_vq_pairs(pdsv, vq_pairs); + if (err) { + dev_err(dev, "Failed to set max_vq_pairs: %pe\n", + ERR_PTR(err)); + goto err_unmap; + } + } + + /* Set a mac, either from the user config if provided + * or use the device's mac if not 00:..:00 + * or set a random mac + */ + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + ether_addr_copy(pdsv->mac, add_config->net.mac); + } else { + struct virtio_net_config __iomem *vc; + + vc = pdsv->vdpa_aux->vd_mdev.device; + memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac)); + if (is_zero_ether_addr(pdsv->mac) && + (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_MAC))) { + eth_random_addr(pdsv->mac); + dev_info(dev, "setting random mac %pM\n", pdsv->mac); + } + } + pds_vdpa_cmd_set_mac(pdsv, pdsv->mac); + + for (i = 0; i < pdsv->num_vqs; i++) { + void __iomem *notify; + + notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev, + i, &pdsv->vqs[i].notify_pa); + pds_vdpa_init_vqs_entry(pdsv, i, notify); + } + + pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev; + + err = pds_vdpa_register_event_handler(pdsv); + if (err) { + dev_err(dev, "Failed to register for PDS events: %pe\n", ERR_PTR(err)); + goto err_unmap; + } + + /* We use the _vdpa_register_device() call rather than the + * vdpa_register_device() to avoid a deadlock because our + * dev_add() is called with the vdpa_dev_lock already set + * by vdpa_nl_cmd_dev_add_set_doit() + */ + err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs); + if (err) { + dev_err(dev, "Failed to register to vDPA bus: %pe\n", ERR_PTR(err)); + goto err_unevent; + } + + pds_vdpa_debugfs_add_vdpadev(vdpa_aux); + + return 0; + +err_unevent: + pds_vdpa_unregister_event_handler(pdsv); +err_unmap: + put_device(&pdsv->vdpa_dev.dev); + vdpa_aux->pdsv = NULL; + return err; +} + +static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *vdpa_dev) +{ + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); + struct pds_vdpa_aux *vdpa_aux; + + pds_vdpa_unregister_event_handler(pdsv); + + vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev); + _vdpa_unregister_device(vdpa_dev); + + pds_vdpa_cmd_reset(vdpa_aux->pdsv); + pds_vdpa_debugfs_reset_vdpadev(vdpa_aux); + + vdpa_aux->pdsv = NULL; + + dev_info(&vdpa_aux->padev->aux_dev.dev, "Removed vdpa device\n"); +} + +static const struct vdpa_mgmtdev_ops pds_vdpa_mgmt_dev_ops = { + .dev_add = pds_vdpa_dev_add, + .dev_del = pds_vdpa_dev_del +}; + +int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux) +{ + union pds_core_adminq_cmd cmd = { + .vdpa_ident.opcode = PDS_VDPA_CMD_IDENT, + .vdpa_ident.vf_id = cpu_to_le16(vdpa_aux->vf_id), + }; + union pds_core_adminq_comp comp = {}; + struct vdpa_mgmt_dev *mgmt; + struct pci_dev *pf_pdev; + struct device *pf_dev; + struct pci_dev *pdev; + dma_addr_t ident_pa; + struct device *dev; + u16 dev_intrs; + u16 max_vqs; + int err; + + dev = &vdpa_aux->padev->aux_dev.dev; + pdev = vdpa_aux->padev->vf_pdev; + mgmt = &vdpa_aux->vdpa_mdev; + + /* Get resource info through the PF's adminq. It is a block of info, + * so we need to map some memory for PF to make available to the + * firmware for writing the data. + */ + pf_pdev = pci_physfn(vdpa_aux->padev->vf_pdev); + pf_dev = &pf_pdev->dev; + ident_pa = dma_map_single(pf_dev, &vdpa_aux->ident, + sizeof(vdpa_aux->ident), DMA_FROM_DEVICE); + if (dma_mapping_error(pf_dev, ident_pa)) { + dev_err(dev, "Failed to map ident space\n"); + return -ENOMEM; + } + + cmd.vdpa_ident.ident_pa = cpu_to_le64(ident_pa); + cmd.vdpa_ident.len = cpu_to_le32(sizeof(vdpa_aux->ident)); + err = pds_client_adminq_cmd(vdpa_aux->padev, &cmd, + sizeof(cmd.vdpa_ident), &comp, 0); + dma_unmap_single(pf_dev, ident_pa, + sizeof(vdpa_aux->ident), DMA_FROM_DEVICE); + if (err) { + dev_err(dev, "Failed to ident hw, status %d: %pe\n", + comp.status, ERR_PTR(err)); + return err; + } + + max_vqs = le16_to_cpu(vdpa_aux->ident.max_vqs); + dev_intrs = pci_msix_vec_count(pdev); + dev_dbg(dev, "ident.max_vqs %d dev_intrs %d\n", max_vqs, dev_intrs); + + max_vqs = min_t(u16, dev_intrs, max_vqs); + mgmt->max_supported_vqs = min_t(u16, PDS_VDPA_MAX_QUEUES, max_vqs); + vdpa_aux->nintrs = 0; + + mgmt->ops = &pds_vdpa_mgmt_dev_ops; + mgmt->id_table = pds_vdpa_id_table; + mgmt->device = dev; + mgmt->supported_features = le64_to_cpu(vdpa_aux->ident.hw_features); + + /* advertise F_MAC even if the device doesn't */ + mgmt->supported_features |= BIT_ULL(VIRTIO_NET_F_MAC); + + mgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); + mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); + mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); + + return 0; +} diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h new file mode 100644 index 000000000000..84bdb45871ff --- /dev/null +++ b/drivers/vdpa/pds/vdpa_dev.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _VDPA_DEV_H_ +#define _VDPA_DEV_H_ + +#include <linux/pci.h> +#include <linux/vdpa.h> + +struct pds_vdpa_vq_info { + bool ready; + u64 desc_addr; + u64 avail_addr; + u64 used_addr; + u32 q_len; + u16 qid; + int irq; + char irq_name[32]; + + void __iomem *notify; + dma_addr_t notify_pa; + + u64 doorbell; + u16 avail_idx; + u16 used_idx; + + struct vdpa_callback event_cb; + struct pds_vdpa_device *pdsv; +}; + +#define PDS_VDPA_MAX_QUEUES 65 +#define PDS_VDPA_MAX_QLEN 32768 +struct pds_vdpa_device { + struct vdpa_device vdpa_dev; + struct pds_vdpa_aux *vdpa_aux; + + struct pds_vdpa_vq_info vqs[PDS_VDPA_MAX_QUEUES]; + u64 supported_features; /* supported device features */ + u64 negotiated_features; /* negotiated features */ + u8 vdpa_index; /* rsvd for future subdevice use */ + u8 num_vqs; /* num vqs in use */ + u8 mac[ETH_ALEN]; /* mac selected when the device was added */ + struct vdpa_callback config_cb; + struct notifier_block nb; +}; + +#define PDS_VDPA_PACKED_INVERT_IDX 0x8000 + +void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv); +int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux); +#endif /* _VDPA_DEV_H_ */ diff --git a/drivers/vdpa/solidrun/Makefile b/drivers/vdpa/solidrun/Makefile new file mode 100644 index 000000000000..9116252cd5fa --- /dev/null +++ b/drivers/vdpa/solidrun/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SNET_VDPA) += snet_vdpa.o +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_main.o +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_ctrl.o +ifdef CONFIG_HWMON +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_hwmon.o +endif diff --git a/drivers/vdpa/solidrun/snet_ctrl.c b/drivers/vdpa/solidrun/snet_ctrl.c new file mode 100644 index 000000000000..3cef2571d15d --- /dev/null +++ b/drivers/vdpa/solidrun/snet_ctrl.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ + +#include <linux/iopoll.h> + +#include "snet_vdpa.h" + +enum snet_ctrl_opcodes { + SNET_CTRL_OP_DESTROY = 1, + SNET_CTRL_OP_READ_VQ_STATE, + SNET_CTRL_OP_SUSPEND, + SNET_CTRL_OP_RESUME, +}; + +#define SNET_CTRL_TIMEOUT 2000000 + +#define SNET_CTRL_DATA_SIZE_MASK 0x0000FFFF +#define SNET_CTRL_IN_PROCESS_MASK 0x00010000 +#define SNET_CTRL_CHUNK_RDY_MASK 0x00020000 +#define SNET_CTRL_ERROR_MASK 0x0FFC0000 + +#define SNET_VAL_TO_ERR(val) (-(((val) & SNET_CTRL_ERROR_MASK) >> 18)) +#define SNET_EMPTY_CTRL(val) (((val) & SNET_CTRL_ERROR_MASK) || \ + !((val) & SNET_CTRL_IN_PROCESS_MASK)) +#define SNET_DATA_READY(val) ((val) & (SNET_CTRL_ERROR_MASK | SNET_CTRL_CHUNK_RDY_MASK)) + +/* Control register used to read data from the DPU */ +struct snet_ctrl_reg_ctrl { + /* Chunk size in 4B words */ + u16 data_size; + /* We are in the middle of a command */ + u16 in_process:1; + /* A data chunk is ready and can be consumed */ + u16 chunk_ready:1; + /* Error code */ + u16 error:10; + /* Saved for future usage */ + u16 rsvd:4; +}; + +/* Opcode register */ +struct snet_ctrl_reg_op { + u16 opcode; + /* Only if VQ index is relevant for the command */ + u16 vq_idx; +}; + +struct snet_ctrl_regs { + struct snet_ctrl_reg_op op; + struct snet_ctrl_reg_ctrl ctrl; + u32 rsvd; + u32 data[]; +}; + +static struct snet_ctrl_regs __iomem *snet_get_ctrl(struct snet *snet) +{ + return snet->bar + snet->psnet->cfg.ctrl_off; +} + +static int snet_wait_for_empty_ctrl(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_EMPTY_CTRL(val), 10, + SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_empty_op(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->op, val, !val, 10, SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_data(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_DATA_READY(val), 10, + SNET_CTRL_TIMEOUT); +} + +static u32 snet_read32_word(struct snet_ctrl_regs __iomem *ctrl_regs, u16 word_idx) +{ + return ioread32(&ctrl_regs->data[word_idx]); +} + +static u32 snet_read_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + return ioread32(&ctrl_regs->ctrl); +} + +static void snet_write_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->ctrl); +} + +static void snet_write_op(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->op); +} + +static int snet_wait_for_dpu_completion(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + /* Wait until the DPU finishes completely. + * It will clear the opcode register. + */ + return snet_wait_for_empty_op(ctrl_regs); +} + +/* Reading ctrl from the DPU: + * buf_size must be 4B aligned + * + * Steps: + * + * (1) Verify that the DPU is not in the middle of another operation by + * reading the in_process and error bits in the control register. + * (2) Write the request opcode and the VQ idx in the opcode register + * and write the buffer size in the control register. + * (3) Start readind chunks of data, chunk_ready bit indicates that a + * data chunk is available, we signal that we read the data by clearing the bit. + * (4) Detect that the transfer is completed when the in_process bit + * in the control register is cleared or when the an error appears. + */ +static int snet_ctrl_read_from_dpu(struct snet *snet, u16 opcode, u16 vq_idx, void *buffer, + u32 buf_size) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 *bfr_ptr = (u32 *)buffer; + u32 val; + u16 buf_words; + int ret; + u16 words, i, tot_words = 0; + + /* Supported for config 2+ */ + if (!SNET_CFG_VER(snet, 2)) + return -EOPNOTSUPP; + + if (!IS_ALIGNED(buf_size, 4)) + return -EINVAL; + + mutex_lock(&snet->ctrl_lock); + + buf_words = buf_size / 4; + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to write the buffer size in the control register, and the opcode + vq index in + * the opcode register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, buf_words); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + while (buf_words != tot_words) { + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control data\n"); + goto exit; + } + + val = snet_read_ctrl(regs); + + /* Error? */ + if (val & SNET_CTRL_ERROR_MASK) { + ret = SNET_VAL_TO_ERR(val); + SNET_WARN(pdev, "Error while reading control data from DPU, err %d\n", ret); + goto exit; + } + + words = min_t(u16, val & SNET_CTRL_DATA_SIZE_MASK, buf_words - tot_words); + + for (i = 0; i < words; i++) { + *bfr_ptr = snet_read32_word(regs, i); + bfr_ptr++; + } + + tot_words += words; + + /* Is the job completed? */ + if (!(val & SNET_CTRL_IN_PROCESS_MASK)) + break; + + /* Clear the chunk ready bit and continue */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + } + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for the DPU to complete a control command\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU using the old mechanism + * used with config version 1. + */ +static int snet_send_ctrl_msg_old(struct snet *snet, u32 opcode) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + int ret; + + mutex_lock(&snet->ctrl_lock); + + /* Old mechanism uses just 1 register, the opcode register. + * Make sure that the opcode register is empty, and that the DPU isn't + * processing an old message. + */ + ret = snet_wait_for_empty_op(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control message to be ACKed\n"); + goto exit; + } + + /* Write the message */ + snet_write_op(regs, opcode); + + /* DPU ACKs the message by clearing the opcode register */ + ret = snet_wait_for_empty_op(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for a control message to be ACKed\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU. + * A control message is a message without payload. + */ +static int snet_send_ctrl_msg(struct snet *snet, u16 opcode, u16 vq_idx) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 val; + int ret; + + /* If config version is not 2+, use the old mechanism */ + if (!SNET_CFG_VER(snet, 2)) + return snet_send_ctrl_msg_old(snet, opcode); + + mutex_lock(&snet->ctrl_lock); + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to clear the control register and write the opcode + vq index in the opcode + * register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, 0); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + /* The DPU ACKs control messages by setting the chunk ready bit + * without data. + */ + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control message to be ACKed\n"); + goto exit; + } + + /* Check for errors */ + val = snet_read_ctrl(regs); + ret = SNET_VAL_TO_ERR(val); + + /* Clear the chunk ready bit */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for DPU to complete a control command, err %d\n", + ret); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +void snet_ctrl_clear(struct snet *snet) +{ + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + + snet_write_op(regs, 0); +} + +int snet_destroy_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_DESTROY, 0); +} + +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state) +{ + return snet_ctrl_read_from_dpu(snet, SNET_CTRL_OP_READ_VQ_STATE, idx, state, + sizeof(*state)); +} + +int snet_suspend_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0); +} + +int snet_resume_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_RESUME, 0); +} diff --git a/drivers/vdpa/solidrun/snet_hwmon.c b/drivers/vdpa/solidrun/snet_hwmon.c new file mode 100644 index 000000000000..af531a339082 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_hwmon.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#include <linux/hwmon.h> + +#include "snet_vdpa.h" + +/* Monitor offsets */ +#define SNET_MON_TMP0_IN_OFF 0x00 +#define SNET_MON_TMP0_MAX_OFF 0x08 +#define SNET_MON_TMP0_CRIT_OFF 0x10 +#define SNET_MON_TMP1_IN_OFF 0x18 +#define SNET_MON_TMP1_CRIT_OFF 0x20 +#define SNET_MON_CURR_IN_OFF 0x28 +#define SNET_MON_CURR_MAX_OFF 0x30 +#define SNET_MON_CURR_CRIT_OFF 0x38 +#define SNET_MON_PWR_IN_OFF 0x40 +#define SNET_MON_VOLT_IN_OFF 0x48 +#define SNET_MON_VOLT_CRIT_OFF 0x50 +#define SNET_MON_VOLT_LCRIT_OFF 0x58 + +static void snet_hwmon_read_reg(struct psnet *psnet, u32 reg, long *out) +{ + *out = psnet_read64(psnet, psnet->cfg.hwmon_off + reg); +} + +static umode_t snet_howmon_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + return 0444; +} + +static int snet_howmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct psnet *psnet = dev_get_drvdata(dev); + int ret = 0; + + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_lcrit: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_LCRIT_OFF, val); + break; + case hwmon_in_crit: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_CRIT_OFF, val); + break; + case hwmon_in_input: + snet_hwmon_read_reg(psnet, SNET_MON_VOLT_IN_OFF, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_power: + switch (attr) { + case hwmon_power_input: + snet_hwmon_read_reg(psnet, SNET_MON_PWR_IN_OFF, val); + break; + + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_curr: + switch (attr) { + case hwmon_curr_input: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_IN_OFF, val); + break; + case hwmon_curr_max: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_MAX_OFF, val); + break; + case hwmon_curr_crit: + snet_hwmon_read_reg(psnet, SNET_MON_CURR_CRIT_OFF, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + break; + + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_IN_OFF, val); + else + snet_hwmon_read_reg(psnet, SNET_MON_TMP1_IN_OFF, val); + break; + case hwmon_temp_max: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_MAX_OFF, val); + else + ret = -EOPNOTSUPP; + break; + case hwmon_temp_crit: + if (channel == 0) + snet_hwmon_read_reg(psnet, SNET_MON_TMP0_CRIT_OFF, val); + else + snet_hwmon_read_reg(psnet, SNET_MON_TMP1_CRIT_OFF, val); + break; + + default: + ret = -EOPNOTSUPP; + break; + } + break; + + default: + ret = -EOPNOTSUPP; + break; + } + return ret; +} + +static int snet_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + int ret = 0; + + switch (type) { + case hwmon_in: + *str = "main_vin"; + break; + case hwmon_power: + *str = "soc_pin"; + break; + case hwmon_curr: + *str = "soc_iin"; + break; + case hwmon_temp: + if (channel == 0) + *str = "power_stage_temp"; + else + *str = "ic_junction_temp"; + break; + default: + ret = -EOPNOTSUPP; + break; + } + return ret; +} + +static const struct hwmon_ops snet_hwmon_ops = { + .is_visible = snet_howmon_is_visible, + .read = snet_howmon_read, + .read_string = snet_hwmon_read_string +}; + +static const struct hwmon_channel_info * const snet_hwmon_info[] = { + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT | HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LABEL), + HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL), + HWMON_CHANNEL_INFO(curr, HWMON_C_INPUT | HWMON_C_MAX | HWMON_C_CRIT | HWMON_C_LABEL), + HWMON_CHANNEL_INFO(in, HWMON_I_INPUT | HWMON_I_CRIT | HWMON_I_LCRIT | HWMON_I_LABEL), + NULL +}; + +static const struct hwmon_chip_info snet_hwmono_info = { + .ops = &snet_hwmon_ops, + .info = snet_hwmon_info, +}; + +/* Create an HW monitor device */ +void psnet_create_hwmon(struct pci_dev *pdev) +{ + struct device *hwmon; + struct psnet *psnet = pci_get_drvdata(pdev); + + snprintf(psnet->hwmon_name, SNET_NAME_SIZE, "snet_%s", pci_name(pdev)); + hwmon = devm_hwmon_device_register_with_info(&pdev->dev, psnet->hwmon_name, psnet, + &snet_hwmono_info, NULL); + /* The monitor is not mandatory, Just alert user in case of an error */ + if (IS_ERR(hwmon)) + SNET_WARN(pdev, "Failed to create SNET hwmon, error %ld\n", PTR_ERR(hwmon)); +} diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c new file mode 100644 index 000000000000..4588211d57eb --- /dev/null +++ b/drivers/vdpa/solidrun/snet_main.c @@ -0,0 +1,1137 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#include <linux/iopoll.h> + +#include "snet_vdpa.h" + +/* SNET DPU device ID */ +#define SNET_DEVICE_ID 0x1000 +/* SNET signature */ +#define SNET_SIGNATURE 0xD0D06363 +/* Max. config version that we can work with */ +#define SNET_CFG_VERSION 0x2 +/* Queue align */ +#define SNET_QUEUE_ALIGNMENT PAGE_SIZE +/* Kick value to notify that new data is available */ +#define SNET_KICK_VAL 0x1 +#define SNET_CONFIG_OFF 0x0 +/* How long we are willing to wait for a SNET device */ +#define SNET_DETECT_TIMEOUT 5000000 +/* How long should we wait for the DPU to read our config */ +#define SNET_READ_CFG_TIMEOUT 3000000 +/* Size of configs written to the DPU */ +#define SNET_GENERAL_CFG_LEN 36 +#define SNET_GENERAL_CFG_VQ_LEN 40 + +static struct snet *vdpa_to_snet(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct snet, vdpa); +} + +static irqreturn_t snet_cfg_irq_hndlr(int irq, void *data) +{ + struct snet *snet = data; + /* Call callback if any */ + if (likely(snet->cb.callback)) + return snet->cb.callback(snet->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t snet_vq_irq_hndlr(int irq, void *data) +{ + struct snet_vq *vq = data; + /* Call callback if any */ + if (likely(vq->cb.callback)) + return vq->cb.callback(vq->cb.private); + + return IRQ_HANDLED; +} + +static void snet_free_irqs(struct snet *snet) +{ + struct psnet *psnet = snet->psnet; + struct pci_dev *pdev; + u32 i; + + /* Which Device allcoated the IRQs? */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pdev = snet->pdev->physfn; + else + pdev = snet->pdev; + + /* Free config's IRQ */ + if (snet->cfg_irq != -1) { + devm_free_irq(&pdev->dev, snet->cfg_irq, snet); + snet->cfg_irq = -1; + } + /* Free VQ IRQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + if (snet->vqs[i] && snet->vqs[i]->irq != -1) { + devm_free_irq(&pdev->dev, snet->vqs[i]->irq, snet->vqs[i]); + snet->vqs[i]->irq = -1; + } + } + + /* IRQ vectors are freed when the pci remove callback is called */ +} + +static int snet_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area, + u64 driver_area, u64 device_area) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* save received parameters in vqueue sturct */ + snet->vqs[idx]->desc_area = desc_area; + snet->vqs[idx]->driver_area = driver_area; + snet->vqs[idx]->device_area = device_area; + + return 0; +} + +static void snet_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* save num in vqueue */ + snet->vqs[idx]->num = num; +} + +static void snet_kick_vq(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + /* not ready - ignore */ + if (unlikely(!snet->vqs[idx]->ready)) + return; + + iowrite32(SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); +} + +static void snet_kick_vq_with_data(struct vdpa_device *vdev, u32 data) +{ + struct snet *snet = vdpa_to_snet(vdev); + u16 idx = data & 0xFFFF; + + /* not ready - ignore */ + if (unlikely(!snet->vqs[idx]->ready)) + return; + + iowrite32((data & 0xFFFF0000) | SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); +} + +static void snet_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->vqs[idx]->cb.callback = cb->callback; + snet->vqs[idx]->cb.private = cb->private; +} + +static void snet_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->vqs[idx]->ready = ready; +} + +static bool snet_get_vq_ready(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->vqs[idx]->ready; +} + +static bool snet_vq_state_is_initial(struct snet *snet, const struct vdpa_vq_state *state) +{ + if (SNET_HAS_FEATURE(snet, VIRTIO_F_RING_PACKED)) { + const struct vdpa_vq_state_packed *p = &state->packed; + + if (p->last_avail_counter == 1 && p->last_used_counter == 1 && + p->last_avail_idx == 0 && p->last_used_idx == 0) + return true; + } else { + const struct vdpa_vq_state_split *s = &state->split; + + if (s->avail_index == 0) + return true; + } + + return false; +} + +static int snet_set_vq_state(struct vdpa_device *vdev, u16 idx, const struct vdpa_vq_state *state) +{ + struct snet *snet = vdpa_to_snet(vdev); + + /* We can set any state for config version 2+ */ + if (SNET_CFG_VER(snet, 2)) { + memcpy(&snet->vqs[idx]->vq_state, state, sizeof(*state)); + return 0; + } + + /* Older config - we can't set the VQ state. + * Return 0 only if this is the initial state we use in the DPU. + */ + if (snet_vq_state_is_initial(snet, state)) + return 0; + + return -EOPNOTSUPP; +} + +static int snet_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet_read_vq_state(snet, idx, state); +} + +static int snet_get_vq_irq(struct vdpa_device *vdev, u16 idx) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->vqs[idx]->irq; +} + +static u32 snet_get_vq_align(struct vdpa_device *vdev) +{ + return (u32)SNET_QUEUE_ALIGNMENT; +} + +static int snet_reset_dev(struct snet *snet) +{ + struct pci_dev *pdev = snet->pdev; + int ret = 0; + u32 i; + + /* If status is 0, nothing to do */ + if (!snet->status) + return 0; + + /* If DPU started, destroy it */ + if (snet->status & VIRTIO_CONFIG_S_DRIVER_OK) + ret = snet_destroy_dev(snet); + + /* Clear VQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + if (!snet->vqs[i]) + continue; + snet->vqs[i]->cb.callback = NULL; + snet->vqs[i]->cb.private = NULL; + snet->vqs[i]->desc_area = 0; + snet->vqs[i]->device_area = 0; + snet->vqs[i]->driver_area = 0; + snet->vqs[i]->ready = false; + } + + /* Clear config callback */ + snet->cb.callback = NULL; + snet->cb.private = NULL; + /* Free IRQs */ + snet_free_irqs(snet); + /* Reset status */ + snet->status = 0; + snet->dpu_ready = false; + + if (ret) + SNET_WARN(pdev, "Incomplete reset to SNET[%u] device, err: %d\n", snet->sid, ret); + else + SNET_DBG(pdev, "Reset SNET[%u] device\n", snet->sid); + + return 0; +} + +static int snet_reset(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet_reset_dev(snet); +} + +static size_t snet_get_config_size(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return (size_t)snet->cfg->cfg_size; +} + +static u64 snet_get_features(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->cfg->features; +} + +static int snet_set_drv_features(struct vdpa_device *vdev, u64 features) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->negotiated_features = snet->cfg->features & features; + return 0; +} + +static u64 snet_get_drv_features(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->negotiated_features; +} + +static u16 snet_get_vq_num_max(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return (u16)snet->cfg->vq_size; +} + +static void snet_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb) +{ + struct snet *snet = vdpa_to_snet(vdev); + + snet->cb.callback = cb->callback; + snet->cb.private = cb->private; +} + +static u32 snet_get_device_id(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->cfg->virtio_id; +} + +static u32 snet_get_vendor_id(struct vdpa_device *vdev) +{ + return (u32)PCI_VENDOR_ID_SOLIDRUN; +} + +static u8 snet_get_status(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + + return snet->status; +} + +static int snet_write_conf(struct snet *snet) +{ + u32 off, i, tmp; + int ret; + + /* No need to write the config twice */ + if (snet->dpu_ready) + return true; + + /* Snet data : + * + * General data: SNET_GENERAL_CFG_LEN bytes long + * 0 0x4 0x8 0xC 0x10 0x14 0x1C 0x24 + * | MAGIC NUMBER | CFG VER | SNET SID | NUMBER OF QUEUES | IRQ IDX | FEATURES | RSVD | + * + * For every VQ: SNET_GENERAL_CFG_VQ_LEN bytes long + * 0 0x4 0x8 + * | VQ SID AND QUEUE SIZE | IRQ Index | + * | DESC AREA | + * | DEVICE AREA | + * | DRIVER AREA | + * | VQ STATE (CFG 2+) | RSVD | + * + * Magic number should be written last, this is the DPU indication that the data is ready + */ + + /* Init offset */ + off = snet->psnet->cfg.host_cfg_off; + + /* Ignore magic number for now */ + off += 4; + snet_write32(snet, off, snet->psnet->negotiated_cfg_ver); + off += 4; + snet_write32(snet, off, snet->sid); + off += 4; + snet_write32(snet, off, snet->cfg->vq_num); + off += 4; + snet_write32(snet, off, snet->cfg_irq_idx); + off += 4; + snet_write64(snet, off, snet->negotiated_features); + off += 8; + /* Ignore reserved */ + off += 8; + /* Write VQs */ + for (i = 0 ; i < snet->cfg->vq_num ; i++) { + tmp = (i << 16) | (snet->vqs[i]->num & 0xFFFF); + snet_write32(snet, off, tmp); + off += 4; + snet_write32(snet, off, snet->vqs[i]->irq_idx); + off += 4; + snet_write64(snet, off, snet->vqs[i]->desc_area); + off += 8; + snet_write64(snet, off, snet->vqs[i]->device_area); + off += 8; + snet_write64(snet, off, snet->vqs[i]->driver_area); + off += 8; + /* Write VQ state if config version is 2+ */ + if (SNET_CFG_VER(snet, 2)) + snet_write32(snet, off, *(u32 *)&snet->vqs[i]->vq_state); + off += 4; + + /* Ignore reserved */ + off += 4; + } + + /* Write magic number - data is ready */ + snet_write32(snet, snet->psnet->cfg.host_cfg_off, SNET_SIGNATURE); + + /* The DPU will ACK the config by clearing the signature */ + ret = readx_poll_timeout(ioread32, snet->bar + snet->psnet->cfg.host_cfg_off, + tmp, !tmp, 10, SNET_READ_CFG_TIMEOUT); + if (ret) { + SNET_ERR(snet->pdev, "Timeout waiting for the DPU to read the config\n"); + return false; + } + + /* set DPU flag */ + snet->dpu_ready = true; + + return true; +} + +static int snet_request_irqs(struct pci_dev *pdev, struct snet *snet) +{ + int ret, i, irq; + + /* Request config IRQ */ + irq = pci_irq_vector(pdev, snet->cfg_irq_idx); + ret = devm_request_irq(&pdev->dev, irq, snet_cfg_irq_hndlr, 0, + snet->cfg_irq_name, snet); + if (ret) { + SNET_ERR(pdev, "Failed to request IRQ\n"); + return ret; + } + snet->cfg_irq = irq; + + /* Request IRQ for every VQ */ + for (i = 0; i < snet->cfg->vq_num; i++) { + irq = pci_irq_vector(pdev, snet->vqs[i]->irq_idx); + ret = devm_request_irq(&pdev->dev, irq, snet_vq_irq_hndlr, 0, + snet->vqs[i]->irq_name, snet->vqs[i]); + if (ret) { + SNET_ERR(pdev, "Failed to request IRQ\n"); + return ret; + } + snet->vqs[i]->irq = irq; + } + return 0; +} + +static void snet_set_status(struct vdpa_device *vdev, u8 status) +{ + struct snet *snet = vdpa_to_snet(vdev); + struct psnet *psnet = snet->psnet; + struct pci_dev *pdev = snet->pdev; + int ret; + bool pf_irqs; + + if (status == snet->status) + return; + + if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && + !(snet->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + /* Request IRQs */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + ret = snet_request_irqs(pf_irqs ? pdev->physfn : pdev, snet); + if (ret) + goto set_err; + + /* Write config to the DPU */ + if (snet_write_conf(snet)) { + SNET_INFO(pdev, "Create SNET[%u] device\n", snet->sid); + } else { + snet_free_irqs(snet); + goto set_err; + } + } + + /* Save the new status */ + snet->status = status; + return; + +set_err: + snet->status |= VIRTIO_CONFIG_S_FAILED; +} + +static void snet_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + struct snet *snet = vdpa_to_snet(vdev); + void __iomem *cfg_ptr = snet->cfg->virtio_cfg + offset; + u8 *buf_ptr = buf; + u32 i; + + /* check for offset error */ + if (offset + len > snet->cfg->cfg_size) + return; + + /* Write into buffer */ + for (i = 0; i < len; i++) + *buf_ptr++ = ioread8(cfg_ptr + i); +} + +static void snet_set_config(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + struct snet *snet = vdpa_to_snet(vdev); + void __iomem *cfg_ptr = snet->cfg->virtio_cfg + offset; + const u8 *buf_ptr = buf; + u32 i; + + /* check for offset error */ + if (offset + len > snet->cfg->cfg_size) + return; + + /* Write into PCI BAR */ + for (i = 0; i < len; i++) + iowrite8(*buf_ptr++, cfg_ptr + i); +} + +static int snet_suspend(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + int ret; + + ret = snet_suspend_dev(snet); + if (ret) + SNET_ERR(snet->pdev, "SNET[%u] suspend failed, err: %d\n", snet->sid, ret); + else + SNET_DBG(snet->pdev, "Suspend SNET[%u] device\n", snet->sid); + + return ret; +} + +static int snet_resume(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + int ret; + + ret = snet_resume_dev(snet); + if (ret) + SNET_ERR(snet->pdev, "SNET[%u] resume failed, err: %d\n", snet->sid, ret); + else + SNET_DBG(snet->pdev, "Resume SNET[%u] device\n", snet->sid); + + return ret; +} + +static const struct vdpa_config_ops snet_config_ops = { + .set_vq_address = snet_set_vq_address, + .set_vq_num = snet_set_vq_num, + .kick_vq = snet_kick_vq, + .kick_vq_with_data = snet_kick_vq_with_data, + .set_vq_cb = snet_set_vq_cb, + .set_vq_ready = snet_set_vq_ready, + .get_vq_ready = snet_get_vq_ready, + .set_vq_state = snet_set_vq_state, + .get_vq_state = snet_get_vq_state, + .get_vq_irq = snet_get_vq_irq, + .get_vq_align = snet_get_vq_align, + .reset = snet_reset, + .get_config_size = snet_get_config_size, + .get_device_features = snet_get_features, + .set_driver_features = snet_set_drv_features, + .get_driver_features = snet_get_drv_features, + .get_vq_num_min = snet_get_vq_num_max, + .get_vq_num_max = snet_get_vq_num_max, + .set_config_cb = snet_set_config_cb, + .get_device_id = snet_get_device_id, + .get_vendor_id = snet_get_vendor_id, + .get_status = snet_get_status, + .set_status = snet_set_status, + .get_config = snet_get_config, + .set_config = snet_set_config, + .suspend = snet_suspend, + .resume = snet_resume, +}; + +static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) +{ + char *name; + unsigned short i; + bool bars_found = false; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev)); + if (!name) + return -ENOMEM; + + /* We don't know which BAR will be used to communicate.. + * We will map every bar with len > 0. + * + * Later, we will discover the BAR and unmap all other BARs. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + void __iomem *io; + + if (pci_resource_len(pdev, i) == 0) + continue; + + io = pcim_iomap_region(pdev, i, name); + if (IS_ERR(io)) { + SNET_ERR(pdev, "Failed to request and map PCI BARs\n"); + return PTR_ERR(io); + } + + psnet->bars[i] = io; + bars_found = true; + } + + /* No BAR can be used.. */ + if (!bars_found) { + SNET_ERR(pdev, "Failed to find a PCI BAR\n"); + return -ENODEV; + } + + return 0; +} + +static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet) +{ + char *name; + void __iomem *io; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev)); + if (!name) + return -ENOMEM; + + /* Request and map BAR */ + io = pcim_iomap_region(pdev, snet->psnet->cfg.vf_bar, name); + if (IS_ERR(io)) { + SNET_ERR(pdev, "Failed to request and map PCI BAR for a VF\n"); + return PTR_ERR(io); + } + + snet->bar = io; + + return 0; +} + +static void snet_free_cfg(struct snet_cfg *cfg) +{ + u32 i; + + if (!cfg->devs) + return; + + /* Free devices */ + for (i = 0; i < cfg->devices_num; i++) { + if (!cfg->devs[i]) + break; + + kfree(cfg->devs[i]); + } + /* Free pointers to devices */ + kfree(cfg->devs); +} + +/* Detect which BAR is used for communication with the device. */ +static int psnet_detect_bar(struct psnet *psnet, u32 off) +{ + unsigned long exit_time; + int i; + + exit_time = jiffies + usecs_to_jiffies(SNET_DETECT_TIMEOUT); + + /* SNET DPU will write SNET's signature when the config is ready. */ + while (time_before(jiffies, exit_time)) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + /* Is this BAR mapped? */ + if (!psnet->bars[i]) + continue; + + if (ioread32(psnet->bars[i] + off) == SNET_SIGNATURE) + return i; + } + usleep_range(1000, 10000); + } + + return -ENODEV; +} + +static void psnet_unmap_unused_bars(struct pci_dev *pdev, struct psnet *psnet) +{ + unsigned short i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (psnet->bars[i] && i != psnet->barno) + pcim_iounmap_region(pdev, i); + } +} + +/* Read SNET config from PCI BAR */ +static int psnet_read_cfg(struct pci_dev *pdev, struct psnet *psnet) +{ + struct snet_cfg *cfg = &psnet->cfg; + u32 i, off; + int barno; + + /* Move to where the config starts */ + off = SNET_CONFIG_OFF; + + /* Find BAR used for communication */ + barno = psnet_detect_bar(psnet, off); + if (barno < 0) { + SNET_ERR(pdev, "SNET config is not ready.\n"); + return barno; + } + + /* Save used BAR number and unmap all other BARs */ + psnet->barno = barno; + SNET_DBG(pdev, "Using BAR number %d\n", barno); + + psnet_unmap_unused_bars(pdev, psnet); + + /* load config from BAR */ + cfg->key = psnet_read32(psnet, off); + off += 4; + cfg->cfg_size = psnet_read32(psnet, off); + off += 4; + cfg->cfg_ver = psnet_read32(psnet, off); + off += 4; + /* The negotiated config version is the lower one between this driver's config + * and the DPU's. + */ + psnet->negotiated_cfg_ver = min_t(u32, cfg->cfg_ver, SNET_CFG_VERSION); + SNET_DBG(pdev, "SNET config version %u\n", psnet->negotiated_cfg_ver); + + cfg->vf_num = psnet_read32(psnet, off); + off += 4; + cfg->vf_bar = psnet_read32(psnet, off); + off += 4; + cfg->host_cfg_off = psnet_read32(psnet, off); + off += 4; + cfg->max_size_host_cfg = psnet_read32(psnet, off); + off += 4; + cfg->virtio_cfg_off = psnet_read32(psnet, off); + off += 4; + cfg->kick_off = psnet_read32(psnet, off); + off += 4; + cfg->hwmon_off = psnet_read32(psnet, off); + off += 4; + cfg->ctrl_off = psnet_read32(psnet, off); + off += 4; + cfg->flags = psnet_read32(psnet, off); + off += 4; + /* Ignore Reserved */ + off += sizeof(cfg->rsvd); + + cfg->devices_num = psnet_read32(psnet, off); + off += 4; + /* Allocate memory to hold pointer to the devices */ + cfg->devs = kcalloc(cfg->devices_num, sizeof(void *), GFP_KERNEL); + if (!cfg->devs) + return -ENOMEM; + + /* Load device configuration from BAR */ + for (i = 0; i < cfg->devices_num; i++) { + cfg->devs[i] = kzalloc(sizeof(*cfg->devs[i]), GFP_KERNEL); + if (!cfg->devs[i]) { + snet_free_cfg(cfg); + return -ENOMEM; + } + /* Read device config */ + cfg->devs[i]->virtio_id = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vq_num = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vq_size = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->vfid = psnet_read32(psnet, off); + off += 4; + cfg->devs[i]->features = psnet_read64(psnet, off); + off += 8; + /* Ignore Reserved */ + off += sizeof(cfg->devs[i]->rsvd); + + cfg->devs[i]->cfg_size = psnet_read32(psnet, off); + off += 4; + + /* Is the config witten to the DPU going to be too big? */ + if (SNET_GENERAL_CFG_LEN + SNET_GENERAL_CFG_VQ_LEN * cfg->devs[i]->vq_num > + cfg->max_size_host_cfg) { + SNET_ERR(pdev, "Failed to read SNET config, the config is too big..\n"); + snet_free_cfg(cfg); + return -EINVAL; + } + } + return 0; +} + +static int psnet_alloc_irq_vector(struct pci_dev *pdev, struct psnet *psnet) +{ + int ret = 0; + u32 i, irq_num = 0; + + /* Let's count how many IRQs we need, 1 for every VQ + 1 for config change */ + for (i = 0; i < psnet->cfg.devices_num; i++) + irq_num += psnet->cfg.devs[i]->vq_num + 1; + + ret = pci_alloc_irq_vectors(pdev, irq_num, irq_num, PCI_IRQ_MSIX); + if (ret != irq_num) { + SNET_ERR(pdev, "Failed to allocate IRQ vectors\n"); + return ret; + } + SNET_DBG(pdev, "Allocated %u IRQ vectors from physical function\n", irq_num); + + return 0; +} + +static int snet_alloc_irq_vector(struct pci_dev *pdev, struct snet_dev_cfg *snet_cfg) +{ + int ret = 0; + u32 irq_num; + + /* We want 1 IRQ for every VQ + 1 for config change events */ + irq_num = snet_cfg->vq_num + 1; + + ret = pci_alloc_irq_vectors(pdev, irq_num, irq_num, PCI_IRQ_MSIX); + if (ret <= 0) { + SNET_ERR(pdev, "Failed to allocate IRQ vectors\n"); + return ret; + } + + return 0; +} + +static void snet_free_vqs(struct snet *snet) +{ + u32 i; + + if (!snet->vqs) + return; + + for (i = 0 ; i < snet->cfg->vq_num ; i++) { + if (!snet->vqs[i]) + break; + + kfree(snet->vqs[i]); + } + kfree(snet->vqs); +} + +static int snet_build_vqs(struct snet *snet) +{ + u32 i; + /* Allocate the VQ pointers array */ + snet->vqs = kcalloc(snet->cfg->vq_num, sizeof(void *), GFP_KERNEL); + if (!snet->vqs) + return -ENOMEM; + + /* Allocate the VQs */ + for (i = 0; i < snet->cfg->vq_num; i++) { + snet->vqs[i] = kzalloc(sizeof(*snet->vqs[i]), GFP_KERNEL); + if (!snet->vqs[i]) { + snet_free_vqs(snet); + return -ENOMEM; + } + /* Reset IRQ num */ + snet->vqs[i]->irq = -1; + /* VQ serial ID */ + snet->vqs[i]->sid = i; + /* Kick address - every VQ gets 4B */ + snet->vqs[i]->kick_ptr = snet->bar + snet->psnet->cfg.kick_off + + snet->vqs[i]->sid * 4; + /* Clear kick address for this VQ */ + iowrite32(0, snet->vqs[i]->kick_ptr); + } + return 0; +} + +static int psnet_get_next_irq_num(struct psnet *psnet) +{ + int irq; + + spin_lock(&psnet->lock); + irq = psnet->next_irq++; + spin_unlock(&psnet->lock); + + return irq; +} + +static void snet_reserve_irq_idx(struct pci_dev *pdev, struct snet *snet) +{ + struct psnet *psnet = snet->psnet; + int i; + + /* one IRQ for every VQ, and one for config changes */ + snet->cfg_irq_idx = psnet_get_next_irq_num(psnet); + snprintf(snet->cfg_irq_name, SNET_NAME_SIZE, "snet[%s]-cfg[%d]", + pci_name(pdev), snet->cfg_irq_idx); + + for (i = 0; i < snet->cfg->vq_num; i++) { + /* Get next free IRQ ID */ + snet->vqs[i]->irq_idx = psnet_get_next_irq_num(psnet); + /* Write IRQ name */ + snprintf(snet->vqs[i]->irq_name, SNET_NAME_SIZE, "snet[%s]-vq[%d]", + pci_name(pdev), snet->vqs[i]->irq_idx); + } +} + +/* Find a device config based on virtual function id */ +static struct snet_dev_cfg *snet_find_dev_cfg(struct snet_cfg *cfg, u32 vfid) +{ + u32 i; + + for (i = 0; i < cfg->devices_num; i++) { + if (cfg->devs[i]->vfid == vfid) + return cfg->devs[i]; + } + /* Oppss.. no config found.. */ + return NULL; +} + +/* Probe function for a physical PCI function */ +static int snet_vdpa_probe_pf(struct pci_dev *pdev) +{ + struct psnet *psnet; + int ret = 0; + bool pf_irqs = false; + + ret = pcim_enable_device(pdev); + if (ret) { + SNET_ERR(pdev, "Failed to enable PCI device\n"); + return ret; + } + + /* Allocate a PCI physical function device */ + psnet = kzalloc(sizeof(*psnet), GFP_KERNEL); + if (!psnet) + return -ENOMEM; + + /* Init PSNET spinlock */ + spin_lock_init(&psnet->lock); + + pci_set_master(pdev); + pci_set_drvdata(pdev, psnet); + + /* Open SNET MAIN BAR */ + ret = psnet_open_pf_bar(pdev, psnet); + if (ret) + goto free_psnet; + + /* Try to read SNET's config from PCI BAR */ + ret = psnet_read_cfg(pdev, psnet); + if (ret) + goto free_psnet; + + /* If SNET_CFG_FLAG_IRQ_PF flag is set, we should use + * PF MSI-X vectors + */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + + if (pf_irqs) { + ret = psnet_alloc_irq_vector(pdev, psnet); + if (ret) + goto free_cfg; + } + + SNET_DBG(pdev, "Enable %u virtual functions\n", psnet->cfg.vf_num); + ret = pci_enable_sriov(pdev, psnet->cfg.vf_num); + if (ret) { + SNET_ERR(pdev, "Failed to enable SR-IOV\n"); + goto free_irq; + } + + /* Create HW monitor device */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_HWMON)) { +#if IS_ENABLED(CONFIG_HWMON) + psnet_create_hwmon(pdev); +#else + SNET_WARN(pdev, "Can't start HWMON, CONFIG_HWMON is not enabled\n"); +#endif + } + + return 0; + +free_irq: + if (pf_irqs) + pci_free_irq_vectors(pdev); +free_cfg: + snet_free_cfg(&psnet->cfg); +free_psnet: + kfree(psnet); + return ret; +} + +/* Probe function for a virtual PCI function */ +static int snet_vdpa_probe_vf(struct pci_dev *pdev) +{ + struct pci_dev *pdev_pf = pdev->physfn; + struct psnet *psnet = pci_get_drvdata(pdev_pf); + struct snet_dev_cfg *dev_cfg; + struct snet *snet; + u32 vfid; + int ret; + bool pf_irqs = false; + + /* Get virtual function id. + * (the DPU counts the VFs from 1) + */ + ret = pci_iov_vf_id(pdev); + if (ret < 0) { + SNET_ERR(pdev, "Failed to find a VF id\n"); + return ret; + } + vfid = ret + 1; + + /* Find the snet_dev_cfg based on vfid */ + dev_cfg = snet_find_dev_cfg(&psnet->cfg, vfid); + if (!dev_cfg) { + SNET_WARN(pdev, "Failed to find a VF config..\n"); + return -ENODEV; + } + + /* Which PCI device should allocate the IRQs? + * If the SNET_CFG_FLAG_IRQ_PF flag set, the PF device allocates the IRQs + */ + pf_irqs = PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF); + + ret = pcim_enable_device(pdev); + if (ret) { + SNET_ERR(pdev, "Failed to enable PCI VF device\n"); + return ret; + } + + /* Request for MSI-X IRQs */ + if (!pf_irqs) { + ret = snet_alloc_irq_vector(pdev, dev_cfg); + if (ret) + return ret; + } + + /* Allocate vdpa device */ + snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, + NULL, 1, 1, NULL, false); + if (!snet) { + SNET_ERR(pdev, "Failed to allocate a vdpa device\n"); + ret = -ENOMEM; + goto free_irqs; + } + + /* Init control mutex and spinlock */ + mutex_init(&snet->ctrl_lock); + spin_lock_init(&snet->ctrl_spinlock); + + /* Save pci device pointer */ + snet->pdev = pdev; + snet->psnet = psnet; + snet->cfg = dev_cfg; + snet->dpu_ready = false; + snet->sid = vfid; + /* Reset IRQ value */ + snet->cfg_irq = -1; + + ret = snet_open_vf_bar(pdev, snet); + if (ret) + goto put_device; + + /* Create a VirtIO config pointer */ + snet->cfg->virtio_cfg = snet->bar + snet->psnet->cfg.virtio_cfg_off; + + /* Clear control registers */ + snet_ctrl_clear(snet); + + pci_set_master(pdev); + pci_set_drvdata(pdev, snet); + + ret = snet_build_vqs(snet); + if (ret) + goto put_device; + + /* Reserve IRQ indexes, + * The IRQs may be requested and freed multiple times, + * but the indexes won't change. + */ + snet_reserve_irq_idx(pf_irqs ? pdev_pf : pdev, snet); + + /* set map metadata */ + snet->vdpa.vmap.dma_dev = &pdev->dev; + + /* Register VDPA device */ + ret = vdpa_register_device(&snet->vdpa, snet->cfg->vq_num); + if (ret) { + SNET_ERR(pdev, "Failed to register vdpa device\n"); + goto free_vqs; + } + + return 0; + +free_vqs: + snet_free_vqs(snet); +put_device: + put_device(&snet->vdpa.dev); +free_irqs: + if (!pf_irqs) + pci_free_irq_vectors(pdev); + return ret; +} + +static int snet_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + if (pdev->is_virtfn) + return snet_vdpa_probe_vf(pdev); + else + return snet_vdpa_probe_pf(pdev); +} + +static void snet_vdpa_remove_pf(struct pci_dev *pdev) +{ + struct psnet *psnet = pci_get_drvdata(pdev); + + pci_disable_sriov(pdev); + /* If IRQs are allocated from the PF, we should free the IRQs */ + if (PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pci_free_irq_vectors(pdev); + + snet_free_cfg(&psnet->cfg); + kfree(psnet); +} + +static void snet_vdpa_remove_vf(struct pci_dev *pdev) +{ + struct snet *snet = pci_get_drvdata(pdev); + struct psnet *psnet = snet->psnet; + + vdpa_unregister_device(&snet->vdpa); + snet_free_vqs(snet); + /* If IRQs are allocated from the VF, we should free the IRQs */ + if (!PSNET_FLAG_ON(psnet, SNET_CFG_FLAG_IRQ_PF)) + pci_free_irq_vectors(pdev); +} + +static void snet_vdpa_remove(struct pci_dev *pdev) +{ + if (pdev->is_virtfn) + snet_vdpa_remove_vf(pdev); + else + snet_vdpa_remove_pf(pdev); +} + +static struct pci_device_id snet_driver_pci_ids[] = { + { PCI_DEVICE_SUB(PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID, + PCI_VENDOR_ID_SOLIDRUN, SNET_DEVICE_ID) }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, snet_driver_pci_ids); + +static struct pci_driver snet_vdpa_driver = { + .name = "snet-vdpa-driver", + .id_table = snet_driver_pci_ids, + .probe = snet_vdpa_probe, + .remove = snet_vdpa_remove, +}; + +module_pci_driver(snet_vdpa_driver); + +MODULE_AUTHOR("Alvaro Karsz <alvaro.karsz@solid-run.com>"); +MODULE_DESCRIPTION("SolidRun vDPA driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/solidrun/snet_vdpa.h b/drivers/vdpa/solidrun/snet_vdpa.h new file mode 100644 index 000000000000..36ac285835ea --- /dev/null +++ b/drivers/vdpa/solidrun/snet_vdpa.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ +#ifndef _SNET_VDPA_H_ +#define _SNET_VDPA_H_ + +#include <linux/vdpa.h> +#include <linux/pci.h> + +#define SNET_NAME_SIZE 256 + +#define SNET_ERR(pdev, fmt, ...) dev_err(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_WARN(pdev, fmt, ...) dev_warn(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_INFO(pdev, fmt, ...) dev_info(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_DBG(pdev, fmt, ...) dev_dbg(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) +#define SNET_HAS_FEATURE(s, f) ((s)->negotiated_features & BIT_ULL(f)) +/* Check if negotiated config version is at least @ver */ +#define SNET_CFG_VER(snet, ver) ((snet)->psnet->negotiated_cfg_ver >= (ver)) + +/* VQ struct */ +struct snet_vq { + /* VQ callback */ + struct vdpa_callback cb; + /* VQ state received from bus */ + struct vdpa_vq_state vq_state; + /* desc base address */ + u64 desc_area; + /* device base address */ + u64 device_area; + /* driver base address */ + u64 driver_area; + /* Queue size */ + u32 num; + /* Serial ID for VQ */ + u32 sid; + /* is ready flag */ + bool ready; + /* IRQ number */ + u32 irq; + /* IRQ index, DPU uses this to parse data from MSI-X table */ + u32 irq_idx; + /* IRQ name */ + char irq_name[SNET_NAME_SIZE]; + /* pointer to mapped PCI BAR register used by this VQ to kick */ + void __iomem *kick_ptr; +}; + +struct snet { + /* vdpa device */ + struct vdpa_device vdpa; + /* Config callback */ + struct vdpa_callback cb; + /* To lock the control mechanism */ + struct mutex ctrl_lock; + /* Spinlock to protect critical parts in the control mechanism */ + spinlock_t ctrl_spinlock; + /* array of virqueues */ + struct snet_vq **vqs; + /* Used features */ + u64 negotiated_features; + /* Device serial ID */ + u32 sid; + /* device status */ + u8 status; + /* boolean indicating if snet config was passed to the device */ + bool dpu_ready; + /* IRQ number */ + u32 cfg_irq; + /* IRQ index, DPU uses this to parse data from MSI-X table */ + u32 cfg_irq_idx; + /* IRQ name */ + char cfg_irq_name[SNET_NAME_SIZE]; + /* BAR to access the VF */ + void __iomem *bar; + /* PCI device */ + struct pci_dev *pdev; + /* Pointer to snet pdev parent device */ + struct psnet *psnet; + /* Pointer to snet config device */ + struct snet_dev_cfg *cfg; +}; + +struct snet_dev_cfg { + /* Device ID following VirtIO spec. */ + u32 virtio_id; + /* Number of VQs for this device */ + u32 vq_num; + /* Size of every VQ */ + u32 vq_size; + /* Virtual Function id */ + u32 vfid; + /* Device features, following VirtIO spec */ + u64 features; + /* Reserved for future usage */ + u32 rsvd[6]; + /* VirtIO device specific config size */ + u32 cfg_size; + /* VirtIO device specific config address */ + void __iomem *virtio_cfg; +} __packed; + +struct snet_cfg { + /* Magic key */ + u32 key; + /* Size of total config in bytes */ + u32 cfg_size; + /* Config version */ + u32 cfg_ver; + /* Number of Virtual Functions to create */ + u32 vf_num; + /* BAR to use for the VFs */ + u32 vf_bar; + /* Where should we write the SNET's config */ + u32 host_cfg_off; + /* Max. allowed size for a SNET's config */ + u32 max_size_host_cfg; + /* VirtIO config offset in BAR */ + u32 virtio_cfg_off; + /* Offset in PCI BAR for VQ kicks */ + u32 kick_off; + /* Offset in PCI BAR for HW monitoring */ + u32 hwmon_off; + /* Offset in PCI BAR for Control mechanism */ + u32 ctrl_off; + /* Config general flags - enum snet_cfg_flags */ + u32 flags; + /* Reserved for future usage */ + u32 rsvd[6]; + /* Number of snet devices */ + u32 devices_num; + /* The actual devices */ + struct snet_dev_cfg **devs; +} __packed; + +/* SolidNET PCIe device, one device per PCIe physical function */ +struct psnet { + /* PCI BARs */ + void __iomem *bars[PCI_STD_NUM_BARS]; + /* Negotiated config version */ + u32 negotiated_cfg_ver; + /* Next IRQ index to use in case when the IRQs are allocated from this device */ + u32 next_irq; + /* BAR number used to communicate with the device */ + u8 barno; + /* spinlock to protect data that can be changed by SNET devices */ + spinlock_t lock; + /* Pointer to the device's config read from BAR */ + struct snet_cfg cfg; + /* Name of monitor device */ + char hwmon_name[SNET_NAME_SIZE]; +}; + +enum snet_cfg_flags { + /* Create a HWMON device */ + SNET_CFG_FLAG_HWMON = BIT(0), + /* USE IRQs from the physical function */ + SNET_CFG_FLAG_IRQ_PF = BIT(1), +}; + +#define PSNET_FLAG_ON(p, f) ((p)->cfg.flags & (f)) + +static inline u32 psnet_read32(struct psnet *psnet, u32 off) +{ + return ioread32(psnet->bars[psnet->barno] + off); +} + +static inline u32 snet_read32(struct snet *snet, u32 off) +{ + return ioread32(snet->bar + off); +} + +static inline void snet_write32(struct snet *snet, u32 off, u32 val) +{ + iowrite32(val, snet->bar + off); +} + +static inline u64 psnet_read64(struct psnet *psnet, u32 off) +{ + u64 val; + /* 64bits are written in 2 halves, low part first */ + val = (u64)psnet_read32(psnet, off); + val |= ((u64)psnet_read32(psnet, off + 4) << 32); + return val; +} + +static inline void snet_write64(struct snet *snet, u32 off, u64 val) +{ + /* The DPU expects a 64bit integer in 2 halves, the low part first */ + snet_write32(snet, off, (u32)val); + snet_write32(snet, off + 4, (u32)(val >> 32)); +} + +#if IS_ENABLED(CONFIG_HWMON) +void psnet_create_hwmon(struct pci_dev *pdev); +#endif + +void snet_ctrl_clear(struct snet *snet); +int snet_destroy_dev(struct snet *snet); +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state); +int snet_suspend_dev(struct snet *snet); +int snet_resume_dev(struct snet *snet); + +#endif //_SNET_VDPA_H_ diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 8ef7aa1365cc..34874beb0152 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -39,6 +39,11 @@ static int vdpa_dev_probe(struct device *d) u32 max_num, min_num = 1; int ret = 0; + d->dma_mask = &d->coherent_dma_mask; + ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); + if (ret) + return ret; + max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); @@ -60,7 +65,7 @@ static void vdpa_dev_remove(struct device *d) drv->remove(vdev); } -static int vdpa_dev_match(struct device *dev, struct device_driver *drv) +static int vdpa_dev_match(struct device *dev, const struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); @@ -93,7 +98,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", vdev->driver_override); + len = sysfs_emit(buf, "%s\n", vdev->driver_override); device_unlock(dev); return len; @@ -110,7 +115,7 @@ static const struct attribute_group vdpa_dev_group = { }; __ATTRIBUTE_GROUPS(vdpa_dev); -static struct bus_type vdpa_bus = { +static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, @@ -126,7 +131,7 @@ static void vdpa_release_dev(struct device *d) if (ops->free) ops->free(vdev); - ida_simple_remove(&vdpa_index_ida, vdev->index); + ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } @@ -137,6 +142,7 @@ static void vdpa_release_dev(struct device *d) * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device + * @map: the map operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data @@ -146,11 +152,12 @@ static void vdpa_release_dev(struct device *d) * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * - * Return: Returns an error when parent/config/dma_dev is not set or fail to get + * Return: Returns an error when parent/config/map is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + const struct virtio_map_ops *map, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) @@ -182,6 +189,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; + vdev->map = map; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; @@ -200,7 +208,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, return vdev; err_name: - ida_simple_remove(&vdpa_index_ida, vdev->index); + ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: @@ -460,12 +468,28 @@ static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mg return 0; } +static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, + unsigned int *nclasses) +{ + u64 supported_classes = 0; + unsigned int n = 0; + + for (int i = 0; mdev->id_table[i].device; i++) { + if (mdev->id_table[i].device > 63) + continue; + supported_classes |= BIT_ULL(mdev->id_table[i].device); + n++; + } + if (nclasses) + *nclasses = n; + + return supported_classes; +} + static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { - u64 supported_classes = 0; void *hdr; - int i = 0; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); @@ -475,14 +499,9 @@ static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *m if (err) goto msg_err; - while (mdev->id_table[i].device) { - if (mdev->id_table[i].device <= 63) - supported_classes |= BIT_ULL(mdev->id_table[i].device); - i++; - } - if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, - supported_classes, VDPA_ATTR_UNSPEC)) { + vdpa_mgmtdev_get_classes(mdev, NULL), + VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } @@ -566,13 +585,25 @@ out: BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) +/* + * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START + * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for + * all 64bit features. If the features are extended beyond 64 bits, or new + * "holes" are reserved for other type of features than per-device, this + * macro would have to be updated. + */ +#define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ + ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) + static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; + unsigned int ncls = 0; const u8 *macaddr; const char *name; + u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) @@ -601,8 +632,26 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { + u64 missing = 0x0ULL; + config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) + missing |= BIT_ULL(VIRTIO_NET_F_MAC); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) + missing |= BIT_ULL(VIRTIO_NET_F_MTU); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && + config.net.max_vq_pairs > 1 && + !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) + missing |= BIT_ULL(VIRTIO_NET_F_MQ); + if (missing) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Missing features 0x%llx for provided attributes", + missing); + return -EINVAL; + } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } @@ -622,13 +671,33 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i err = PTR_ERR(mdev); goto err; } + if ((config.mask & mdev->config_attr_mask) != config.mask) { - NL_SET_ERR_MSG_MOD(info->extack, - "All provided attributes are not supported"); + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Some provided attributes are not supported: 0x%llx", + config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } + classes = vdpa_mgmtdev_get_classes(mdev, &ncls); + if (config.mask & VDPA_DEV_NET_ATTRS_MASK && + !(classes & BIT_ULL(VIRTIO_ID_NET))) { + NL_SET_ERR_MSG_MOD(info->extack, + "Network class attributes provided on unsupported management device"); + err = -EINVAL; + goto err; + } + if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && + config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && + classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && + config.device_features & VIRTIO_DEVICE_F_MASK) { + NL_SET_ERR_MSG_MOD(info->extack, + "Management device supports multi-class while device features specified are ambiguous"); + err = -EINVAL; + goto err; + } + err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); @@ -841,18 +910,25 @@ static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, sizeof(config->mac), config->mac); } +static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_net_config *config) +{ + u16 val_u16; + + if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) + return 0; + + val_u16 = __virtio16_to_cpu(true, config->status); + return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); +} + static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; - u16 val_u16; vdev->config->get_config(vdev, 0, &config, sizeof(config)); - val_u16 = __virtio16_to_cpu(true, config.status); - if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16)) - return -EMSGSIZE; - features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, @@ -865,10 +941,222 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; + if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int +vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, + const struct virtio_blk_config *config) +{ + u64 val_u64; + + val_u64 = __virtio64_to_cpu(true, config->capacity); + + return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, + val_u64, VDPA_ATTR_PAD); +} + +static int +vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u32 val_u32; + + if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) + return 0; + + val_u32 = __virtio32_to_cpu(true, config->size_max); + + return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, val_u32); +} + +/* fill the block size*/ +static int +vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u32 val_u32; + + if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) + return 0; + + val_u32 = __virtio32_to_cpu(true, config->blk_size); + + return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); +} + +static int +vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u32 val_u32; + + if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) + return 0; + + val_u32 = __virtio32_to_cpu(true, config->seg_max); + + return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); +} + +static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u16 val_u16; + + if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) + return 0; + + val_u16 = __virtio16_to_cpu(true, config->num_queues); + + return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); +} + +static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u16 min_io_size; + u32 opt_io_size; + + if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) + return 0; + + min_io_size = __virtio16_to_cpu(true, config->min_io_size); + opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); + + if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, + config->physical_block_exp)) + return -EMSGSIZE; + + if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, + config->alignment_offset)) + return -EMSGSIZE; + + if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) + return -EMSGSIZE; + + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) + return -EMSGSIZE; + + return 0; +} + +static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u32 val_u32; + + if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) + return 0; + + val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) + return -EMSGSIZE; + + val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) + return -EMSGSIZE; + + val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) + return -EMSGSIZE; + + return 0; +} + +static int +vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, + const struct virtio_blk_config *config) +{ + u32 val_u32; + + if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) + return 0; + + val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) + return -EMSGSIZE; + + val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); + if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) + return -EMSGSIZE; + + return 0; +} + +static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) +{ + u8 ro; + + ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 0 : 1; + if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_READ_ONLY, ro)) + return -EMSGSIZE; + + return 0; +} + +static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) +{ + u8 flush; + + flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; + if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_FLUSH, flush)) + return -EMSGSIZE; + + return 0; +} + +static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, + struct sk_buff *msg) +{ + struct virtio_blk_config config = {}; + u64 features_device; + + vdev->config->get_config(vdev, 0, &config, sizeof(config)); + + features_device = vdev->config->get_device_features(vdev); + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, + VDPA_ATTR_PAD)) + return -EMSGSIZE; + + if (vdpa_dev_blk_capacity_config_fill(msg, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) + return -EMSGSIZE; + + if (vdpa_dev_blk_ro_config_fill(msg, features_device)) + return -EMSGSIZE; + + if (vdpa_dev_blk_flush_config_fill(msg, features_device)) + return -EMSGSIZE; + + return 0; +} + +static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { @@ -912,6 +1200,9 @@ vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; + case VIRTIO_ID_BLOCK: + err = vdpa_dev_blk_config_fill(vdev, msg); + break; default: err = -EOPNOTSUPP; break; @@ -1011,7 +1302,7 @@ static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { - NL_SET_ERR_MSG_MOD(info->extack, "queue index excceeds max value"); + NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } @@ -1073,6 +1364,80 @@ dev_err: return err; } +static int vdpa_dev_net_device_attr_set(struct vdpa_device *vdev, + struct genl_info *info) +{ + struct vdpa_dev_set_config set_config = {}; + struct vdpa_mgmt_dev *mdev = vdev->mdev; + struct nlattr **nl_attrs = info->attrs; + const u8 *macaddr; + int err = -EOPNOTSUPP; + + down_write(&vdev->cf_lock); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { + set_config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); + macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); + + if (is_valid_ether_addr(macaddr)) { + ether_addr_copy(set_config.net.mac, macaddr); + if (mdev->ops->dev_set_attr) { + err = mdev->ops->dev_set_attr(mdev, vdev, + &set_config); + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Operation not supported by the device."); + } + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Invalid MAC address"); + } + } + up_write(&vdev->cf_lock); + return err; +} + +static int vdpa_nl_cmd_dev_attr_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct vdpa_device *vdev; + struct device *dev; + const char *name; + u64 classes; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + down_write(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + classes = vdpa_mgmtdev_get_classes(vdev->mdev, NULL); + if (classes & BIT_ULL(VIRTIO_ID_NET)) { + err = vdpa_dev_net_device_attr_set(vdev, info); + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, "%s device not supported", + name); + } + +mdev_err: + put_device(dev); +dev_err: + up_write(&vdpa_dev_lock); + return err; +} + static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); @@ -1171,47 +1536,49 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, + [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), + [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, + [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, + { + .cmd = VDPA_CMD_DEV_ATTR_SET, + .doit = vdpa_nl_cmd_dev_attr_set_doit, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { @@ -1253,4 +1620,5 @@ core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); +MODULE_DESCRIPTION("vDPA bus"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index cb88891b44a8..c1c6431950e1 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -11,14 +11,14 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/kthread.h> #include <linux/slab.h> -#include <linux/sched.h> #include <linux/dma-map-ops.h> #include <linux/vringh.h> #include <linux/vdpa.h> #include <linux/vhost_iotlb.h> -#include <linux/iova.h> #include <uapi/linux/vdpa.h> +#include <uapi/linux/vhost_types.h> #include "vdpa_sim.h" @@ -36,20 +36,47 @@ module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)"); +static bool use_va = true; +module_param(use_va, bool, 0444); +MODULE_PARM_DESC(use_va, "Enable/disable the device's ability to use VA"); + #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 #define VDPASIM_VENDOR_ID 0 -static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) +struct vdpasim_mm_work { + struct kthread_work work; + struct vdpasim *vdpasim; + struct mm_struct *mm_to_bind; + int ret; +}; + +static void vdpasim_mm_work_fn(struct kthread_work *work) { - return container_of(vdpa, struct vdpasim, vdpa); + struct vdpasim_mm_work *mm_work = + container_of(work, struct vdpasim_mm_work, work); + struct vdpasim *vdpasim = mm_work->vdpasim; + + mm_work->ret = 0; + + //TODO: should we attach the cgroup of the mm owner? + vdpasim->mm_bound = mm_work->mm_to_bind; } -static struct vdpasim *dev_to_sim(struct device *dev) +static void vdpasim_worker_change_mm_sync(struct vdpasim *vdpasim, + struct vdpasim_mm_work *mm_work) { - struct vdpa_device *vdpa = dev_to_vdpa(dev); + struct kthread_work *work = &mm_work->work; + + kthread_init_work(work, vdpasim_mm_work_fn); + kthread_queue_work(vdpasim->worker, work); - return vdpa_to_sim(vdpa); + kthread_flush_work(work); +} + +static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct vdpasim, vdpa); } static void vdpasim_vq_notify(struct vringh *vring) @@ -66,14 +93,34 @@ static void vdpasim_vq_notify(struct vringh *vring) static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + uint16_t last_avail_idx = vq->vring.last_avail_idx; + struct vring_desc *desc = (struct vring_desc *) + (uintptr_t)vq->desc_addr; + struct vring_avail *avail = (struct vring_avail *) + (uintptr_t)vq->driver_addr; + struct vring_used *used = (struct vring_used *) + (uintptr_t)vq->device_addr; + + if (use_va && vdpasim->mm_bound) { + vringh_init_iotlb_va(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } else { + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } - vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, - (struct vring_desc *)(uintptr_t)vq->desc_addr, - (struct vring_avail *) - (uintptr_t)vq->driver_addr, - (struct vring_used *) - (uintptr_t)vq->device_addr); - + vq->vring.last_avail_idx = last_avail_idx; + + /* + * Since vdpa_sim does not support receive inflight descriptors as a + * destination of a migration, let's set both avail_idx and used_idx + * the same at vq start. This is how vhost-user works in a + * VHOST_SET_VRING_BASE call. + * + * Although the simple fix is to set last_used_idx at + * vdpasim_set_vq_state, it would be reset at vdpasim_queue_ready. + */ + vq->vring.last_used_idx = last_avail_idx; vq->vring.notify = vdpasim_vq_notify; } @@ -92,7 +139,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->vring.notify = NULL; } -static void vdpasim_do_reset(struct vdpasim *vdpasim) +static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 flags) { int i; @@ -104,10 +151,16 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) &vdpasim->iommu_lock); } - for (i = 0; i < vdpasim->dev_attr.nas; i++) - vhost_iotlb_reset(&vdpasim->iommu[i]); + if (flags & VDPA_RESET_F_CLEAN_MAP) { + for (i = 0; i < vdpasim->dev_attr.nas; i++) { + vhost_iotlb_reset(&vdpasim->iommu[i]); + vhost_iotlb_add_range(&vdpasim->iommu[i], 0, ULONG_MAX, + 0, VHOST_MAP_RW); + vdpasim->iommu_pt[i] = true; + } + } - vdpasim->running = true; + vdpasim->running = false; spin_unlock(&vdpasim->iommu_lock); vdpasim->features = 0; @@ -115,144 +168,40 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) ++vdpasim->generation; } -static int dir_to_perm(enum dma_data_direction dir) -{ - int perm = -EFAULT; - - switch (dir) { - case DMA_FROM_DEVICE: - perm = VHOST_MAP_WO; - break; - case DMA_TO_DEVICE: - perm = VHOST_MAP_RO; - break; - case DMA_BIDIRECTIONAL: - perm = VHOST_MAP_RW; - break; - default: - break; - } - - return perm; -} - -static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr, - size_t size, unsigned int perm) -{ - struct iova *iova; - dma_addr_t dma_addr; - int ret; - - /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */ - iova = alloc_iova(&vdpasim->iova, size >> iova_shift(&vdpasim->iova), - ULONG_MAX - 1, true); - if (!iova) - return DMA_MAPPING_ERROR; - - dma_addr = iova_dma_addr(&vdpasim->iova, iova); - - spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range(&vdpasim->iommu[0], (u64)dma_addr, - (u64)dma_addr + size - 1, (u64)paddr, perm); - spin_unlock(&vdpasim->iommu_lock); - - if (ret) { - __free_iova(&vdpasim->iova, iova); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - -static void vdpasim_unmap_range(struct vdpasim *vdpasim, dma_addr_t dma_addr, - size_t size) -{ - spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_del_range(&vdpasim->iommu[0], (u64)dma_addr, - (u64)dma_addr + size - 1); - spin_unlock(&vdpasim->iommu_lock); - - free_iova(&vdpasim->iova, iova_pfn(&vdpasim->iova, dma_addr)); -} - -static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - struct vdpasim *vdpasim = dev_to_sim(dev); - phys_addr_t paddr = page_to_phys(page) + offset; - int perm = dir_to_perm(dir); - - if (perm < 0) - return DMA_MAPPING_ERROR; - - return vdpasim_map_range(vdpasim, paddr, size, perm); -} - -static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct vdpasim *vdpasim = dev_to_sim(dev); - - vdpasim_unmap_range(vdpasim, dma_addr, size); -} +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; -static void *vdpasim_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) +static void vdpasim_work_fn(struct kthread_work *work) { - struct vdpasim *vdpasim = dev_to_sim(dev); - phys_addr_t paddr; - void *addr; + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + struct mm_struct *mm = vdpasim->mm_bound; - addr = kmalloc(size, flag); - if (!addr) { - *dma_addr = DMA_MAPPING_ERROR; - return NULL; + if (use_va && mm) { + if (!mmget_not_zero(mm)) + return; + kthread_use_mm(mm); } - paddr = virt_to_phys(addr); + vdpasim->dev_attr.work_fn(vdpasim); - *dma_addr = vdpasim_map_range(vdpasim, paddr, size, VHOST_MAP_RW); - if (*dma_addr == DMA_MAPPING_ERROR) { - kfree(addr); - return NULL; + if (use_va && mm) { + kthread_unuse_mm(mm); + mmput(mm); } - - return addr; } -static void vdpasim_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct vdpasim *vdpasim = dev_to_sim(dev); - - vdpasim_unmap_range(vdpasim, dma_addr, size); - - kfree(vaddr); -} - -static const struct dma_map_ops vdpasim_dma_ops = { - .map_page = vdpasim_map_page, - .unmap_page = vdpasim_unmap_page, - .alloc = vdpasim_alloc_coherent, - .free = vdpasim_free_coherent, -}; - -static const struct vdpa_config_ops vdpasim_config_ops; -static const struct vdpa_config_ops vdpasim_batch_config_ops; - struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, const struct vdpa_dev_set_config *config) { const struct vdpa_config_ops *ops; + struct vdpa_device *vdpa; struct vdpasim *vdpasim; struct device *dev; int i, ret = -ENOMEM; + if (!dev_attr->alloc_size) + return ERR_PTR(-EINVAL); + if (config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { if (config->device_features & ~dev_attr->supported_features) @@ -266,24 +215,31 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, else ops = &vdpasim_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - dev_attr->ngroups, dev_attr->nas, - dev_attr->name, false); - if (IS_ERR(vdpasim)) { - ret = PTR_ERR(vdpasim); + vdpa = __vdpa_alloc_device(NULL, ops, NULL, + dev_attr->ngroups, dev_attr->nas, + dev_attr->alloc_size, + dev_attr->name, use_va); + if (IS_ERR(vdpa)) { + ret = PTR_ERR(vdpa); goto err_alloc; } + vdpasim = vdpa_to_sim(vdpa); vdpasim->dev_attr = *dev_attr; - INIT_WORK(&vdpasim->work, dev_attr->work_fn); - spin_lock_init(&vdpasim->lock); + dev = &vdpasim->vdpa.dev; + + kthread_init_work(&vdpasim->work, vdpasim_work_fn); + vdpasim->worker = kthread_run_worker(0, "vDPA sim worker: %s", + dev_attr->name); + if (IS_ERR(vdpasim->worker)) + goto err_iommu; + + mutex_init(&vdpasim->mutex); spin_lock_init(&vdpasim->iommu_lock); - dev = &vdpasim->vdpa.dev; dev->dma_mask = &dev->coherent_dma_mask; if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) goto err_iommu; - set_dma_ops(dev, &vdpasim_dma_ops); vdpasim->vdpa.mdev = dev_attr->mgmt_dev; vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); @@ -300,25 +256,23 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, if (!vdpasim->iommu) goto err_iommu; - for (i = 0; i < vdpasim->dev_attr.nas; i++) - vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); - - vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); - if (!vdpasim->buffer) + vdpasim->iommu_pt = kmalloc_array(vdpasim->dev_attr.nas, + sizeof(*vdpasim->iommu_pt), GFP_KERNEL); + if (!vdpasim->iommu_pt) goto err_iommu; + for (i = 0; i < vdpasim->dev_attr.nas; i++) { + vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); + vhost_iotlb_add_range(&vdpasim->iommu[i], 0, ULONG_MAX, 0, + VHOST_MAP_RW); + vdpasim->iommu_pt[i] = true; + } + for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], &vdpasim->iommu_lock); - ret = iova_cache_get(); - if (ret) - goto err_iommu; - - /* For simplicity we use an IOVA allocator with byte granularity */ - init_iova_domain(&vdpasim->iova, 1, 0); - - vdpasim->vdpa.dma_dev = dev; + vdpasim->vdpa.vmap.dma_dev = dev; return vdpasim; @@ -329,6 +283,12 @@ err_alloc: } EXPORT_SYMBOL_GPL(vdpasim_create); +void vdpasim_schedule_work(struct vdpasim *vdpasim) +{ + kthread_queue_work(vdpasim->worker, &vdpasim->work); +} +EXPORT_SYMBOL_GPL(vdpasim_schedule_work); + static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 desc_area, u64 driver_area, u64 device_area) @@ -351,13 +311,30 @@ static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) vq->num = num; } +static u16 vdpasim_get_vq_size(struct vdpa_device *vdpa, u16 idx) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + + if (vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK) + return vq->num; + else + return VDPASIM_QUEUE_MAX; +} + static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + if (!vdpasim->running && + (vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + vdpasim->pending_kick = true; + return; + } + if (vq->ready) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx, @@ -376,13 +353,13 @@ static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready) struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; bool old_ready; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); old_ready = vq->ready; vq->ready = ready; if (vq->ready && !old_ready) { vdpasim_queue_ready(vdpasim, idx); } - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); } static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx) @@ -400,9 +377,9 @@ static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; struct vringh *vrh = &vq->vring; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vrh->last_avail_idx = state->split.avail_index; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -418,6 +395,18 @@ static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx, return 0; } +static int vdpasim_get_vq_stats(struct vdpa_device *vdpa, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (vdpasim->dev_attr.get_stats) + return vdpasim->dev_attr.get_stats(vdpasim, idx, + msg, extack); + return -EOPNOTSUPP; +} + static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) { return VDPASIM_QUEUE_ALIGN; @@ -439,6 +428,11 @@ static u64 vdpasim_get_device_features(struct vdpa_device *vdpa) return vdpasim->dev_attr.supported_features; } +static u64 vdpasim_get_backend_features(const struct vdpa_device *vdpa) +{ + return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK); +} + static int vdpasim_set_driver_features(struct vdpa_device *vdpa, u64 features) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -487,9 +481,9 @@ static u8 vdpasim_get_status(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); u8 status; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); status = vdpasim->status; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return status; } @@ -498,30 +492,57 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->status = status; - spin_unlock(&vdpasim->lock); + vdpasim->running = (status & VIRTIO_CONFIG_S_DRIVER_OK) != 0; + mutex_unlock(&vdpasim->mutex); } -static int vdpasim_reset(struct vdpa_device *vdpa) +static int vdpasim_compat_reset(struct vdpa_device *vdpa, u32 flags) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->status = 0; - vdpasim_do_reset(vdpasim); - spin_unlock(&vdpasim->lock); + vdpasim_do_reset(vdpasim, flags); + mutex_unlock(&vdpasim->mutex); return 0; } +static int vdpasim_reset(struct vdpa_device *vdpa) +{ + return vdpasim_compat_reset(vdpa, 0); +} + static int vdpasim_suspend(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->running = false; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); + + return 0; +} + +static int vdpasim_resume(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int i; + + mutex_lock(&vdpasim->mutex); + vdpasim->running = true; + + if (vdpasim->pending_kick) { + /* Process pending descriptors */ + for (i = 0; i < vdpasim->dev_attr.nvqs; ++i) + vdpasim_kick_vq(vdpa, i); + + vdpasim->pending_kick = false; + } + + mutex_unlock(&vdpasim->mutex); return 0; } @@ -593,14 +614,14 @@ static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, iommu = &vdpasim->iommu[asid]; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) if (vdpasim_get_vq_group(vdpa, i) == group) vringh_set_iotlb(&vdpasim->vqs[i].vring, iommu, &vdpasim->iommu_lock); - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -621,6 +642,7 @@ static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, iommu = &vdpasim->iommu[asid]; vhost_iotlb_reset(iommu); + vdpasim->iommu_pt[asid] = false; for (map = vhost_iotlb_itree_first(iotlb, start, last); map; map = vhost_iotlb_itree_next(map, start, last)) { @@ -638,6 +660,49 @@ err: return ret; } +static int vdpasim_reset_map(struct vdpa_device *vdpa, unsigned int asid) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + + spin_lock(&vdpasim->iommu_lock); + if (vdpasim->iommu_pt[asid]) + goto out; + vhost_iotlb_reset(&vdpasim->iommu[asid]); + vhost_iotlb_add_range(&vdpasim->iommu[asid], 0, ULONG_MAX, + 0, VHOST_MAP_RW); + vdpasim->iommu_pt[asid] = true; +out: + spin_unlock(&vdpasim->iommu_lock); + return 0; +} + +static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = mm; + + vdpasim_worker_change_mm_sync(vdpasim, &mm_work); + + return mm_work.ret; +} + +static void vdpasim_unbind_mm(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = NULL; + + vdpasim_worker_change_mm_sync(vdpasim, &mm_work); +} + static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, u64 iova, u64 size, u64 pa, u32 perm, void *opaque) @@ -649,6 +714,10 @@ static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, return -EINVAL; spin_lock(&vdpasim->iommu_lock); + if (vdpasim->iommu_pt[asid]) { + vhost_iotlb_reset(&vdpasim->iommu[asid]); + vdpasim->iommu_pt[asid] = false; + } ret = vhost_iotlb_add_range_ctx(&vdpasim->iommu[asid], iova, iova + size - 1, pa, perm, opaque); spin_unlock(&vdpasim->iommu_lock); @@ -664,6 +733,11 @@ static int vdpasim_dma_unmap(struct vdpa_device *vdpa, unsigned int asid, if (asid >= vdpasim->dev_attr.nas) return -EINVAL; + if (vdpasim->iommu_pt[asid]) { + vhost_iotlb_reset(&vdpasim->iommu[asid]); + vdpasim->iommu_pt[asid] = false; + } + spin_lock(&vdpasim->iommu_lock); vhost_iotlb_del_range(&vdpasim->iommu[asid], iova, iova + size - 1); spin_unlock(&vdpasim->iommu_lock); @@ -676,22 +750,20 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int i; - cancel_work_sync(&vdpasim->work); + kthread_cancel_work_sync(&vdpasim->work); + kthread_destroy_worker(vdpasim->worker); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov); vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); } - if (vdpa_get_dma_dev(vdpa)) { - put_iova_domain(&vdpasim->iova); - iova_cache_put(); - } + vdpasim->dev_attr.free(vdpasim); - kvfree(vdpasim->buffer); for (i = 0; i < vdpasim->dev_attr.nas; i++) vhost_iotlb_reset(&vdpasim->iommu[i]); kfree(vdpasim->iommu); + kfree(vdpasim->iommu_pt); kfree(vdpasim->vqs); kfree(vdpasim->config); } @@ -704,20 +776,25 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_vq_ready = vdpasim_set_vq_ready, .get_vq_ready = vdpasim_get_vq_ready, .set_vq_state = vdpasim_set_vq_state, + .get_vendor_vq_stats = vdpasim_get_vq_stats, .get_vq_state = vdpasim_get_vq_state, .get_vq_align = vdpasim_get_vq_align, .get_vq_group = vdpasim_get_vq_group, .get_device_features = vdpasim_get_device_features, + .get_backend_features = vdpasim_get_backend_features, .set_driver_features = vdpasim_set_driver_features, .get_driver_features = vdpasim_get_driver_features, .set_config_cb = vdpasim_set_config_cb, .get_vq_num_max = vdpasim_get_vq_num_max, + .get_vq_size = vdpasim_get_vq_size, .get_device_id = vdpasim_get_device_id, .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = vdpasim_reset, + .compat_reset = vdpasim_compat_reset, .suspend = vdpasim_suspend, + .resume = vdpasim_resume, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -726,6 +803,9 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_group_asid = vdpasim_set_group_asid, .dma_map = vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, + .reset_map = vdpasim_reset_map, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, .free = vdpasim_free, }; @@ -737,10 +817,12 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_vq_ready = vdpasim_set_vq_ready, .get_vq_ready = vdpasim_get_vq_ready, .set_vq_state = vdpasim_set_vq_state, + .get_vendor_vq_stats = vdpasim_get_vq_stats, .get_vq_state = vdpasim_get_vq_state, .get_vq_align = vdpasim_get_vq_align, .get_vq_group = vdpasim_get_vq_group, .get_device_features = vdpasim_get_device_features, + .get_backend_features = vdpasim_get_backend_features, .set_driver_features = vdpasim_set_driver_features, .get_driver_features = vdpasim_get_driver_features, .set_config_cb = vdpasim_set_config_cb, @@ -750,7 +832,9 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = vdpasim_reset, + .compat_reset = vdpasim_compat_reset, .suspend = vdpasim_suspend, + .resume = vdpasim_resume, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -758,6 +842,9 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_iova_range = vdpasim_get_iova_range, .set_group_asid = vdpasim_set_group_asid, .set_map = vdpasim_set_map, + .reset_map = vdpasim_reset_map, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, .free = vdpasim_free, }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 0e78737dcc16..bb137e479763 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -37,42 +37,49 @@ struct vdpasim_dev_attr { struct vdpa_mgmt_dev *mgmt_dev; const char *name; u64 supported_features; + size_t alloc_size; size_t config_size; - size_t buffer_size; int nvqs; u32 id; u32 ngroups; u32 nas; - work_func_t work_fn; + void (*work_fn)(struct vdpasim *vdpasim); void (*get_config)(struct vdpasim *vdpasim, void *config); void (*set_config)(struct vdpasim *vdpasim, const void *config); + int (*get_stats)(struct vdpasim *vdpasim, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack); + void (*free)(struct vdpasim *vdpasim); }; /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; - struct work_struct work; + struct kthread_worker *worker; + struct kthread_work work; + struct mm_struct *mm_bound; struct vdpasim_dev_attr dev_attr; - /* spinlock to synchronize virtqueue state */ - spinlock_t lock; + /* mutex to synchronize virtqueue state */ + struct mutex mutex; /* virtio config according to device type */ void *config; struct vhost_iotlb *iommu; - struct iova_domain iova; - void *buffer; + bool *iommu_pt; u32 status; u32 generation; u64 features; u32 groups; bool running; + bool pending_kick; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr, const struct vdpa_dev_set_config *config); +void vdpasim_schedule_work(struct vdpasim *vdpasim); /* TODO: cross-endian support */ static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index f745926237a8..b137f3679343 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/blkdev.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -44,8 +43,39 @@ #define VDPASIM_BLK_AS_NUM 1 #define VDPASIM_BLK_GROUP_NUM 1 +struct vdpasim_blk { + struct vdpasim vdpasim; + void *buffer; + bool shared_backend; +}; + +static struct vdpasim_blk *sim_to_blk(struct vdpasim *vdpasim) +{ + return container_of(vdpasim, struct vdpasim_blk, vdpasim); +} + static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; +static bool shared_backend; +module_param(shared_backend, bool, 0444); +MODULE_PARM_DESC(shared_backend, "Enable the shared backend between virtio-blk devices"); + +static void *shared_buffer; +/* mutex to synchronize shared_buffer access */ +static DEFINE_MUTEX(shared_buffer_mutex); + +static void vdpasim_blk_buffer_lock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_lock(&shared_buffer_mutex); +} + +static void vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_unlock(&shared_buffer_mutex); +} + static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, u64 num_sectors, u64 max_sectors) { @@ -79,6 +109,7 @@ static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, struct vdpasim_virtqueue *vq) { + struct vdpasim_blk *blk = sim_to_blk(vdpasim); size_t pushed = 0, to_pull, to_push; struct virtio_blk_outhdr hdr; bool handled = false; @@ -144,9 +175,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, break; } + vdpasim_blk_buffer_lock(blk); bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, - vdpasim->buffer + offset, - to_push); + blk->buffer + offset, to_push); + vdpasim_blk_buffer_unlock(blk); if (bytes < 0) { dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", @@ -166,9 +198,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, break; } + vdpasim_blk_buffer_lock(blk); bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, - vdpasim->buffer + offset, - to_pull); + blk->buffer + offset, to_pull); + vdpasim_blk_buffer_unlock(blk); if (bytes < 0) { dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", @@ -248,8 +281,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, } if (type == VIRTIO_BLK_T_WRITE_ZEROES) { - memset(vdpasim->buffer + offset, 0, + vdpasim_blk_buffer_lock(blk); + memset(blk->buffer + offset, 0, num_sectors << SECTOR_SHIFT); + vdpasim_blk_buffer_unlock(blk); } break; @@ -286,13 +321,12 @@ err: return handled; } -static void vdpasim_blk_work(struct work_struct *work) +static void vdpasim_blk_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); bool reschedule = false; int i; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; @@ -323,10 +357,10 @@ static void vdpasim_blk_work(struct work_struct *work) } } out: - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); if (reschedule) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) @@ -355,6 +389,14 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) } +static void vdpasim_blk_free(struct vdpasim *vdpasim) +{ + struct vdpasim_blk *blk = sim_to_blk(vdpasim); + + if (!blk->shared_backend) + kvfree(blk->buffer); +} + static void vdpasim_blk_mgmtdev_release(struct device *dev) { } @@ -368,6 +410,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim_blk *blk; struct vdpasim *simdev; int ret; @@ -378,15 +421,30 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; dev_attr.nas = VDPASIM_BLK_AS_NUM; + dev_attr.alloc_size = sizeof(struct vdpasim_blk); dev_attr.config_size = sizeof(struct virtio_blk_config); dev_attr.get_config = vdpasim_blk_get_config; dev_attr.work_fn = vdpasim_blk_work; - dev_attr.buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT; + dev_attr.free = vdpasim_blk_free; simdev = vdpasim_create(&dev_attr, config); if (IS_ERR(simdev)) return PTR_ERR(simdev); + blk = sim_to_blk(simdev); + blk->shared_backend = shared_backend; + + if (blk->shared_backend) { + blk->buffer = shared_buffer; + } else { + blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!blk->buffer) { + ret = -ENOMEM; + goto put_dev; + } + } + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_BLK_VQ_NUM); if (ret) goto put_dev; @@ -436,8 +494,18 @@ static int __init vdpasim_blk_init(void) if (ret) goto parent_err; - return 0; + if (shared_backend) { + shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!shared_buffer) { + ret = -ENOMEM; + goto mgmt_dev_err; + } + } + return 0; +mgmt_dev_err: + vdpa_mgmtdev_unregister(&mgmt_dev); parent_err: device_unregister(&vdpasim_blk_mgmtdev); return ret; @@ -445,6 +513,7 @@ parent_err: static void __exit vdpasim_blk_exit(void) { + kvfree(shared_buffer); vdpa_mgmtdev_unregister(&mgmt_dev); device_unregister(&vdpasim_blk_mgmtdev); } diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 584b975a98a7..6caf09a1907b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -11,10 +11,10 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/etherdevice.h> #include <linux/vringh.h> #include <linux/vdpa.h> +#include <net/netlink.h> #include <uapi/linux/virtio_net.h> #include <uapi/linux/vdpa.h> @@ -27,6 +27,7 @@ #define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ (1ULL << VIRTIO_NET_F_MAC) | \ + (1ULL << VIRTIO_NET_F_STATUS) | \ (1ULL << VIRTIO_NET_F_MTU) | \ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ (1ULL << VIRTIO_NET_F_CTRL_MAC_ADDR)) @@ -36,6 +37,35 @@ #define VDPASIM_NET_AS_NUM 2 #define VDPASIM_NET_GROUP_NUM 2 +struct vdpasim_dataq_stats { + struct u64_stats_sync syncp; + u64 pkts; + u64 bytes; + u64 drops; + u64 errors; + u64 overruns; +}; + +struct vdpasim_cq_stats { + struct u64_stats_sync syncp; + u64 requests; + u64 successes; + u64 errors; +}; + +struct vdpasim_net{ + struct vdpasim vdpasim; + struct vdpasim_dataq_stats tx_stats; + struct vdpasim_dataq_stats rx_stats; + struct vdpasim_cq_stats cq_stats; + void *buffer; +}; + +static struct vdpasim_net *sim_to_net(struct vdpasim *vdpasim) +{ + return container_of(vdpasim, struct vdpasim_net, vdpasim); +} + static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len) { /* Make sure data is wrote before advancing index */ @@ -58,14 +88,15 @@ static bool receive_filter(struct vdpasim *vdpasim, size_t len) size_t hdr_len = modern ? sizeof(struct virtio_net_hdr_v1) : sizeof(struct virtio_net_hdr); struct virtio_net_config *vio_config = vdpasim->config; + struct vdpasim_net *net = sim_to_net(vdpasim); if (len < ETH_ALEN + hdr_len) return false; - if (is_broadcast_ether_addr(vdpasim->buffer + hdr_len) || - is_multicast_ether_addr(vdpasim->buffer + hdr_len)) + if (is_broadcast_ether_addr(net->buffer + hdr_len) || + is_multicast_ether_addr(net->buffer + hdr_len)) return true; - if (!strncmp(vdpasim->buffer + hdr_len, vio_config->mac, ETH_ALEN)) + if (!strncmp(net->buffer + hdr_len, vio_config->mac, ETH_ALEN)) return true; return false; @@ -96,9 +127,11 @@ static virtio_net_ctrl_ack vdpasim_handle_ctrl_mac(struct vdpasim *vdpasim, static void vdpasim_handle_cvq(struct vdpasim *vdpasim) { struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2]; + struct vdpasim_net *net = sim_to_net(vdpasim); virtio_net_ctrl_ack status = VIRTIO_NET_ERR; struct virtio_net_ctrl_hdr ctrl; size_t read, write; + u64 requests = 0, errors = 0, successes = 0; int err; if (!(vdpasim->features & (1ULL << VIRTIO_NET_F_CTRL_VQ))) @@ -114,10 +147,13 @@ static void vdpasim_handle_cvq(struct vdpasim *vdpasim) if (err <= 0) break; + ++requests; read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, &ctrl, sizeof(ctrl)); - if (read != sizeof(ctrl)) + if (read != sizeof(ctrl)) { + ++errors; break; + } switch (ctrl.class) { case VIRTIO_NET_CTRL_MAC: @@ -127,6 +163,11 @@ static void vdpasim_handle_cvq(struct vdpasim *vdpasim) break; } + if (status == VIRTIO_NET_OK) + ++successes; + else + ++errors; + /* Make sure data is wrote before advancing index */ smp_wmb(); @@ -144,18 +185,25 @@ static void vdpasim_handle_cvq(struct vdpasim *vdpasim) cvq->cb(cvq->private); local_bh_enable(); } + + u64_stats_update_begin(&net->cq_stats.syncp); + net->cq_stats.requests += requests; + net->cq_stats.errors += errors; + net->cq_stats.successes += successes; + u64_stats_update_end(&net->cq_stats.syncp); } -static void vdpasim_net_work(struct work_struct *work) +static void vdpasim_net_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; + struct vdpasim_net *net = sim_to_net(vdpasim); ssize_t read, write; - int pkts = 0; + u64 tx_pkts = 0, rx_pkts = 0, tx_bytes = 0, rx_bytes = 0; + u64 rx_drops = 0, rx_overruns = 0, rx_errors = 0, tx_errors = 0; int err; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); if (!vdpasim->running) goto out; @@ -171,14 +219,20 @@ static void vdpasim_net_work(struct work_struct *work) while (true) { err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, &txq->head, GFP_ATOMIC); - if (err <= 0) + if (err <= 0) { + if (err) + ++tx_errors; break; + } + ++tx_pkts; read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, - vdpasim->buffer, - PAGE_SIZE); + net->buffer, PAGE_SIZE); + + tx_bytes += read; if (!receive_filter(vdpasim, read)) { + ++rx_drops; vdpasim_net_complete(txq, 0); continue; } @@ -186,26 +240,171 @@ static void vdpasim_net_work(struct work_struct *work) err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, &rxq->head, GFP_ATOMIC); if (err <= 0) { + ++rx_overruns; vdpasim_net_complete(txq, 0); break; } write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, - vdpasim->buffer, read); - if (write <= 0) + net->buffer, read); + if (write <= 0) { + ++rx_errors; break; + } + + ++rx_pkts; + rx_bytes += write; vdpasim_net_complete(txq, 0); vdpasim_net_complete(rxq, write); - if (++pkts > 4) { - schedule_work(&vdpasim->work); + if (tx_pkts > 4) { + vdpasim_schedule_work(vdpasim); goto out; } } out: - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); + + u64_stats_update_begin(&net->tx_stats.syncp); + net->tx_stats.pkts += tx_pkts; + net->tx_stats.bytes += tx_bytes; + net->tx_stats.errors += tx_errors; + u64_stats_update_end(&net->tx_stats.syncp); + + u64_stats_update_begin(&net->rx_stats.syncp); + net->rx_stats.pkts += rx_pkts; + net->rx_stats.bytes += rx_bytes; + net->rx_stats.drops += rx_drops; + net->rx_stats.errors += rx_errors; + net->rx_stats.overruns += rx_overruns; + u64_stats_update_end(&net->rx_stats.syncp); +} + +static int vdpasim_net_get_stats(struct vdpasim *vdpasim, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct vdpasim_net *net = sim_to_net(vdpasim); + u64 rx_pkts, rx_bytes, rx_errors, rx_overruns, rx_drops; + u64 tx_pkts, tx_bytes, tx_errors, tx_drops; + u64 cq_requests, cq_successes, cq_errors; + unsigned int start; + int err = -EMSGSIZE; + + switch(idx) { + case 0: + do { + start = u64_stats_fetch_begin(&net->rx_stats.syncp); + rx_pkts = net->rx_stats.pkts; + rx_bytes = net->rx_stats.bytes; + rx_errors = net->rx_stats.errors; + rx_overruns = net->rx_stats.overruns; + rx_drops = net->rx_stats.drops; + } while (u64_stats_fetch_retry(&net->rx_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx packets")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_pkts, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx bytes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_bytes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_errors, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx overruns")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_overruns, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "rx drops")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + rx_drops, VDPA_ATTR_PAD)) + break; + err = 0; + break; + case 1: + do { + start = u64_stats_fetch_begin(&net->tx_stats.syncp); + tx_pkts = net->tx_stats.pkts; + tx_bytes = net->tx_stats.bytes; + tx_errors = net->tx_stats.errors; + tx_drops = net->tx_stats.drops; + } while (u64_stats_fetch_retry(&net->tx_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx packets")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_pkts, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx bytes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_bytes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_errors, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "tx drops")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + tx_drops, VDPA_ATTR_PAD)) + break; + err = 0; + break; + case 2: + do { + start = u64_stats_fetch_begin(&net->cq_stats.syncp); + cq_requests = net->cq_stats.requests; + cq_successes = net->cq_stats.successes; + cq_errors = net->cq_stats.errors; + } while (u64_stats_fetch_retry(&net->cq_stats.syncp, start)); + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq requests")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_requests, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq successes")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_successes, VDPA_ATTR_PAD)) + break; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, + "cvq errors")) + break; + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, + cq_errors, VDPA_ATTR_PAD)) + break; + err = 0; + break; + default: + err = -EINVAL; + break; + } + + return err; } static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) @@ -215,6 +414,24 @@ static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); } +static int vdpasim_net_set_attr(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *config) +{ + struct vdpasim *vdpasim = container_of(dev, struct vdpasim, vdpa); + struct virtio_net_config *vio_config = vdpasim->config; + + mutex_lock(&vdpasim->mutex); + + if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + ether_addr_copy(vio_config->mac, config->net.mac); + mutex_unlock(&vdpasim->mutex); + return 0; + } + + mutex_unlock(&vdpasim->mutex); + return -EOPNOTSUPP; +} + static void vdpasim_net_setup_config(struct vdpasim *vdpasim, const struct vdpa_dev_set_config *config) { @@ -229,6 +446,13 @@ static void vdpasim_net_setup_config(struct vdpasim *vdpasim, vio_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); } +static void vdpasim_net_free(struct vdpasim *vdpasim) +{ + struct vdpasim_net *net = sim_to_net(vdpasim); + + kvfree(net->buffer); +} + static void vdpasim_net_mgmtdev_release(struct device *dev) { } @@ -242,6 +466,7 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim_net *net; struct vdpasim *simdev; int ret; @@ -252,10 +477,12 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.nvqs = VDPASIM_NET_VQ_NUM; dev_attr.ngroups = VDPASIM_NET_GROUP_NUM; dev_attr.nas = VDPASIM_NET_AS_NUM; + dev_attr.alloc_size = sizeof(struct vdpasim_net); dev_attr.config_size = sizeof(struct virtio_net_config); dev_attr.get_config = vdpasim_net_get_config; dev_attr.work_fn = vdpasim_net_work; - dev_attr.buffer_size = PAGE_SIZE; + dev_attr.get_stats = vdpasim_net_get_stats; + dev_attr.free = vdpasim_net_free; simdev = vdpasim_create(&dev_attr, config); if (IS_ERR(simdev)) @@ -263,6 +490,23 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, vdpasim_net_setup_config(simdev, config); + net = sim_to_net(simdev); + + u64_stats_init(&net->tx_stats.syncp); + u64_stats_init(&net->rx_stats.syncp); + u64_stats_init(&net->cq_stats.syncp); + + net->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); + if (!net->buffer) { + ret = -ENOMEM; + goto reg_err; + } + + /* + * Initialization must be completed before this call, since it can + * connect the device to the vDPA bus, so requests can arrive after + * this call. + */ ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM); if (ret) goto reg_err; @@ -284,7 +528,8 @@ static void vdpasim_net_dev_del(struct vdpa_mgmt_dev *mdev, static const struct vdpa_mgmtdev_ops vdpasim_net_mgmtdev_ops = { .dev_add = vdpasim_net_dev_add, - .dev_del = vdpasim_net_dev_del + .dev_del = vdpasim_net_dev_del, + .dev_set_attr = vdpasim_net_set_attr }; static struct virtio_device_id id_table[] = { diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index e682bc7ee6c9..4352b5cf74f0 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -103,19 +103,38 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain, static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain, u64 iova, u64 size, u64 paddr) { - struct vduse_bounce_map *map; + struct vduse_bounce_map *map, *head_map; + struct page *tmp_page; u64 last = iova + size - 1; while (iova <= last) { - map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + /* + * When PAGE_SIZE is larger than 4KB, multiple adjacent bounce_maps will + * point to the same memory page of PAGE_SIZE. Since bounce_maps originate + * from IO requests, we may not be able to guarantee that the orig_phys + * values of all IO requests within the same 64KB memory page are contiguous. + * Therefore, we need to store them separately. + * + * Bounce pages are allocated on demand. As a result, it may occur that + * multiple bounce pages corresponding to the same 64KB memory page attempt + * to allocate memory simultaneously, so we use cmpxchg to handle this + * concurrency. + */ + map = &domain->bounce_maps[iova >> BOUNCE_MAP_SHIFT]; if (!map->bounce_page) { - map->bounce_page = alloc_page(GFP_ATOMIC); - if (!map->bounce_page) - return -ENOMEM; + head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_MAP_SHIFT]; + if (!head_map->bounce_page) { + tmp_page = alloc_page(GFP_ATOMIC); + if (!tmp_page) + return -ENOMEM; + if (cmpxchg(&head_map->bounce_page, NULL, tmp_page)) + __free_page(tmp_page); + } + map->bounce_page = head_map->bounce_page; } map->orig_phys = paddr; - paddr += PAGE_SIZE; - iova += PAGE_SIZE; + paddr += BOUNCE_MAP_SIZE; + iova += BOUNCE_MAP_SIZE; } return 0; } @@ -127,12 +146,17 @@ static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain, u64 last = iova + size - 1; while (iova <= last) { - map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + map = &domain->bounce_maps[iova >> BOUNCE_MAP_SHIFT]; map->orig_phys = INVALID_PHYS_ADDR; - iova += PAGE_SIZE; + iova += BOUNCE_MAP_SIZE; } } +static unsigned int offset_in_bounce_page(dma_addr_t addr) +{ + return (addr & ~BOUNCE_MAP_MASK); +} + static void do_bounce(phys_addr_t orig, void *addr, size_t size, enum dma_data_direction dir) { @@ -162,7 +186,8 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain, enum dma_data_direction dir) { struct vduse_bounce_map *map; - unsigned int offset; + struct page *page; + unsigned int offset, head_offset; void *addr; size_t sz; @@ -170,16 +195,20 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain, return; while (size) { - map = &domain->bounce_maps[iova >> PAGE_SHIFT]; - offset = offset_in_page(iova); - sz = min_t(size_t, PAGE_SIZE - offset, size); + map = &domain->bounce_maps[iova >> BOUNCE_MAP_SHIFT]; + head_offset = offset_in_page(iova); + offset = offset_in_bounce_page(iova); + sz = min_t(size_t, BOUNCE_MAP_SIZE - offset, size); if (WARN_ON(!map->bounce_page || map->orig_phys == INVALID_PHYS_ADDR)) return; - addr = kmap_local_page(map->bounce_page); - do_bounce(map->orig_phys + offset, addr + offset, sz, dir); + page = domain->user_bounce_pages ? + map->user_bounce_page : map->bounce_page; + + addr = kmap_local_page(page); + do_bounce(map->orig_phys + offset, addr + head_offset, sz, dir); kunmap_local(addr); size -= sz; iova += sz; @@ -214,7 +243,7 @@ vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova) struct page *page = NULL; read_lock(&domain->bounce_lock); - map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + map = &domain->bounce_maps[iova >> BOUNCE_MAP_SHIFT]; if (domain->user_bounce_pages || !map->bounce_page) goto out; @@ -232,7 +261,7 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) struct vduse_bounce_map *map; unsigned long pfn, bounce_pfns; - bounce_pfns = domain->bounce_size >> PAGE_SHIFT; + bounce_pfns = domain->bounce_size >> BOUNCE_MAP_SHIFT; for (pfn = 0; pfn < bounce_pfns; pfn++) { map = &domain->bounce_maps[pfn]; @@ -242,7 +271,8 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) if (!map->bounce_page) continue; - __free_page(map->bounce_page); + if (!((pfn << BOUNCE_MAP_SHIFT) & ~PAGE_MASK)) + __free_page(map->bounce_page); map->bounce_page = NULL; } } @@ -250,8 +280,12 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, struct page **pages, int count) { - struct vduse_bounce_map *map; - int i, ret; + struct vduse_bounce_map *map, *head_map; + int i, j, ret; + int inner_pages = PAGE_SIZE / BOUNCE_MAP_SIZE; + int bounce_pfns = domain->bounce_size >> BOUNCE_MAP_SHIFT; + struct page *head_page = NULL; + bool need_copy; /* Now we don't support partial mapping */ if (count != (domain->bounce_size >> PAGE_SHIFT)) @@ -263,17 +297,23 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, goto out; for (i = 0; i < count; i++) { - map = &domain->bounce_maps[i]; - if (map->bounce_page) { + need_copy = false; + head_map = &domain->bounce_maps[(i * inner_pages)]; + head_page = head_map->bounce_page; + for (j = 0; j < inner_pages; j++) { + if ((i * inner_pages + j) >= bounce_pfns) + break; + map = &domain->bounce_maps[(i * inner_pages + j)]; /* Copy kernel page to user page if it's in use */ - if (map->orig_phys != INVALID_PHYS_ADDR) - memcpy_to_page(pages[i], 0, - page_address(map->bounce_page), - PAGE_SIZE); - __free_page(map->bounce_page); + if ((head_page) && (map->orig_phys != INVALID_PHYS_ADDR)) + need_copy = true; + map->user_bounce_page = pages[i]; } - map->bounce_page = pages[i]; get_page(pages[i]); + if ((head_page) && (need_copy)) + memcpy_to_page(pages[i], 0, + page_address(head_page), + PAGE_SIZE); } domain->user_bounce_pages = true; ret = 0; @@ -285,8 +325,12 @@ out: void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) { - struct vduse_bounce_map *map; - unsigned long i, count; + struct vduse_bounce_map *map, *head_map; + unsigned long i, j, count; + int inner_pages = PAGE_SIZE / BOUNCE_MAP_SIZE; + int bounce_pfns = domain->bounce_size >> BOUNCE_MAP_SHIFT; + struct page *head_page = NULL; + bool need_copy; write_lock(&domain->bounce_lock); if (!domain->user_bounce_pages) @@ -294,20 +338,27 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) count = domain->bounce_size >> PAGE_SHIFT; for (i = 0; i < count; i++) { - struct page *page = NULL; - - map = &domain->bounce_maps[i]; - if (WARN_ON(!map->bounce_page)) + need_copy = false; + head_map = &domain->bounce_maps[(i * inner_pages)]; + if (WARN_ON(!head_map->user_bounce_page)) continue; - - /* Copy user page to kernel page if it's in use */ - if (map->orig_phys != INVALID_PHYS_ADDR) { - page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL); - memcpy_from_page(page_address(page), - map->bounce_page, 0, PAGE_SIZE); + head_page = head_map->user_bounce_page; + + for (j = 0; j < inner_pages; j++) { + if ((i * inner_pages + j) >= bounce_pfns) + break; + map = &domain->bounce_maps[(i * inner_pages + j)]; + if (WARN_ON(!map->user_bounce_page)) + continue; + /* Copy user page to kernel page if it's in use */ + if ((map->orig_phys != INVALID_PHYS_ADDR) && (head_map->bounce_page)) + need_copy = true; + map->user_bounce_page = NULL; } - put_page(map->bounce_page); - map->bounce_page = page; + if (need_copy) + memcpy_from_page(page_address(head_map->bounce_page), + head_page, 0, PAGE_SIZE); + put_page(head_page); } domain->user_bounce_pages = false; out: @@ -373,6 +424,26 @@ static void vduse_domain_free_iova(struct iova_domain *iovad, free_iova_fast(iovad, iova >> shift, iova_len); } +void vduse_domain_sync_single_for_device(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir) +{ + read_lock(&domain->bounce_lock); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, dma_addr, size, DMA_TO_DEVICE); + read_unlock(&domain->bounce_lock); +} + +void vduse_domain_sync_single_for_cpu(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir) +{ + read_lock(&domain->bounce_lock); + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); + read_unlock(&domain->bounce_lock); +} + dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, @@ -393,7 +464,8 @@ dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) goto err_unlock; - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE); read_unlock(&domain->bounce_lock); @@ -411,9 +483,9 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, enum dma_data_direction dir, unsigned long attrs) { struct iova_domain *iovad = &domain->stream_iovad; - read_lock(&domain->bounce_lock); - if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size); @@ -423,7 +495,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs) + gfp_t flag) { struct iova_domain *iovad = &domain->consistent_iovad; unsigned long limit = domain->iova_limit; @@ -512,7 +584,7 @@ static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) { struct vduse_iova_domain *domain = file->private_data; - vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); vma->vm_private_data = domain; vma->vm_ops = &vduse_domain_mmap_ops; @@ -557,7 +629,7 @@ vduse_domain_create(unsigned long iova_limit, size_t bounce_size) unsigned long pfn, bounce_pfns; int ret; - bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT; + bounce_pfns = PAGE_ALIGN(bounce_size) >> BOUNCE_MAP_SHIFT; if (iova_limit <= bounce_size) return NULL; @@ -589,7 +661,7 @@ vduse_domain_create(unsigned long iova_limit, size_t bounce_size) rwlock_init(&domain->bounce_lock); spin_lock_init(&domain->iotlb_lock); init_iova_domain(&domain->stream_iovad, - PAGE_SIZE, IOVA_START_PFN); + BOUNCE_MAP_SIZE, IOVA_START_PFN); ret = iova_domain_init_rcaches(&domain->stream_iovad); if (ret) goto err_iovad_stream; diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 4e0e50e7ac15..a923971a64f5 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -14,14 +14,17 @@ #include <linux/iova.h> #include <linux/dma-mapping.h> #include <linux/vhost_iotlb.h> -#include <linux/rwlock.h> #define IOVA_START_PFN 1 -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) +#define BOUNCE_MAP_SHIFT 12 +#define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT) +#define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1)) +#define BOUNCE_MAP_ALIGN(addr) (((addr) + BOUNCE_MAP_SIZE - 1) & ~(BOUNCE_MAP_SIZE - 1)) struct vduse_bounce_map { struct page *bounce_page; + struct page *user_bounce_page; u64 orig_phys; }; @@ -45,6 +48,14 @@ int vduse_domain_set_map(struct vduse_iova_domain *domain, void vduse_domain_clear_map(struct vduse_iova_domain *domain, struct vhost_iotlb *iotlb); +void vduse_domain_sync_single_for_device(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir); + +void vduse_domain_sync_single_for_cpu(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir); + dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, @@ -56,7 +67,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs); + gfp_t flag); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, void *vaddr, dma_addr_t dma_addr, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0c3b48616a9f..ae357d014564 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -8,6 +8,7 @@ * */ +#include "linux/virtio_net.h" #include <linux/init.h> #include <linux/module.h> #include <linux/cdev.h> @@ -28,6 +29,7 @@ #include <uapi/linux/virtio_config.h> #include <uapi/linux/virtio_ids.h> #include <uapi/linux/virtio_blk.h> +#include <uapi/linux/virtio_ring.h> #include <linux/mod_devicetable.h> #include "iova_domain.h" @@ -37,10 +39,15 @@ #define DRV_LICENSE "GPL v2" #define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) +#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) -#define VDUSE_IOVA_SIZE (128 * 1024 * 1024) +/* 128 MB reserved for virtqueue creation */ +#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) #define VDUSE_MSG_DEFAULT_TIMEOUT 30 +#define IRQ_UNBOUND -1 + struct vduse_virtqueue { u16 index; u16 num_max; @@ -57,6 +64,9 @@ struct vduse_virtqueue { struct vdpa_callback cb; struct work_struct inject; struct work_struct kick; + int irq_effective_cpu; + struct cpumask irq_affinity; + struct kobject kobj; }; struct vduse_dev; @@ -76,7 +86,7 @@ struct vduse_umem { struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; - struct vduse_virtqueue *vqs; + struct vduse_virtqueue **vqs; struct vduse_iova_domain *domain; char *name; struct mutex lock; @@ -106,6 +116,8 @@ struct vduse_dev { u32 vq_align; struct vduse_umem *umem; struct mutex mem_lock; + unsigned int bounce_size; + struct mutex domain_lock; }; struct vduse_dev_msg { @@ -124,13 +136,15 @@ static DEFINE_MUTEX(vduse_lock); static DEFINE_IDR(vduse_idr); static dev_t vduse_major; -static struct class *vduse_class; static struct cdev vduse_ctrl_cdev; static struct cdev vduse_cdev; static struct workqueue_struct *vduse_irq_wq; +static struct workqueue_struct *vduse_irq_bound_wq; static u32 allowed_device_id[] = { VIRTIO_ID_BLOCK, + VIRTIO_ID_NET, + VIRTIO_ID_FS, }; static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa) @@ -419,7 +433,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) struct vduse_iova_domain *domain = dev->domain; /* The coherent mappings are handled in vduse_dev_free_coherent() */ - if (domain->bounce_map) + if (domain && domain->bounce_map) vduse_domain_reset_bounce_map(domain); down_write(&dev->rwsem); @@ -434,7 +448,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) flush_work(&dev->inject); for (i = 0; i < dev->vq_num; i++) { - struct vduse_virtqueue *vq = &dev->vqs[i]; + struct vduse_virtqueue *vq = dev->vqs[i]; vq->ready = false; vq->desc_addr = 0; @@ -453,6 +467,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) spin_lock(&vq->irq_lock); vq->cb.callback = NULL; vq->cb.private = NULL; + vq->cb.trigger = NULL; spin_unlock(&vq->irq_lock); flush_work(&vq->inject); flush_work(&vq->kick); @@ -466,7 +481,7 @@ static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 device_area) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->desc_addr = desc_area; vq->driver_addr = driver_area; @@ -482,7 +497,7 @@ static void vduse_vq_kick(struct vduse_virtqueue *vq) goto unlock; if (vq->kickfd) - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); else vq->kicked = true; unlock: @@ -500,7 +515,7 @@ static void vduse_vq_kick_work(struct work_struct *work) static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (!eventfd_signal_allowed()) { schedule_work(&vq->kick); @@ -513,27 +528,39 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, struct vdpa_callback *cb) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; spin_lock(&vq->irq_lock); vq->cb.callback = cb->callback; vq->cb.private = cb->private; + vq->cb.trigger = cb->trigger; spin_unlock(&vq->irq_lock); } static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->num = num; } +static u16 vduse_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = dev->vqs[idx]; + + if (vq->num) + return vq->num; + else + return vq->num_max; +} + static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->ready = ready; } @@ -541,7 +568,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; return vq->ready; } @@ -550,7 +577,7 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, const struct vdpa_vq_state *state) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { vq->state.packed.last_avail_counter = @@ -569,7 +596,7 @@ static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) return vduse_dev_get_vq_state_packed(dev, vq, &state->packed); @@ -624,8 +651,8 @@ static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa) int i; for (i = 0; i < dev->vq_num; i++) - if (num_max < dev->vqs[i].num_max) - num_max = dev->vqs[i].num_max; + if (num_max < dev->vqs[i]->num_max) + num_max = dev->vqs[i]->num_max; return num_max; } @@ -708,6 +735,27 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) return dev->generation; } +static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx, + const struct cpumask *cpu_mask) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (cpu_mask) + cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask); + else + cpumask_setall(&dev->vqs[idx]->irq_affinity); + + return 0; +} + +static const struct cpumask * +vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return &dev->vqs[idx]->irq_affinity; +} + static int vduse_vdpa_set_map(struct vdpa_device *vdpa, unsigned int asid, struct vhost_iotlb *iotlb) @@ -740,6 +788,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .kick_vq = vduse_vdpa_kick_vq, .set_vq_cb = vduse_vdpa_set_vq_cb, .set_vq_num = vduse_vdpa_set_vq_num, + .get_vq_size = vduse_vdpa_get_vq_size, .set_vq_ready = vduse_vdpa_set_vq_ready, .get_vq_ready = vduse_vdpa_get_vq_ready, .set_vq_state = vduse_vdpa_set_vq_state, @@ -758,44 +807,60 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_config = vduse_vdpa_get_config, .set_config = vduse_vdpa_set_config, .get_generation = vduse_vdpa_get_generation, + .set_vq_affinity = vduse_vdpa_set_vq_affinity, + .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, .free = vduse_vdpa_free, }; -static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, +static void vduse_dev_sync_single_for_device(union virtio_map token, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir) +{ + struct vduse_iova_domain *domain = token.iova_domain; + + vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); +} + +static void vduse_dev_sync_single_for_cpu(union virtio_map token, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir) +{ + struct vduse_iova_domain *domain = token.iova_domain; + + vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); +} + +static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } -static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } -static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) +static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, + dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; addr = vduse_domain_alloc_coherent(domain, size, - (dma_addr_t *)&iova, flag, attrs); + (dma_addr_t *)&iova, flag); if (!addr) return NULL; @@ -804,29 +869,45 @@ static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, return addr; } -static void vduse_dev_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) +static void vduse_dev_free_coherent(union virtio_map token, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); } -static size_t vduse_dev_max_mapping_size(struct device *dev) +static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; + + return dma_addr < domain->bounce_size; +} + +static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) +{ + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) + return -ENOMEM; + return 0; +} + +static size_t vduse_dev_max_mapping_size(union virtio_map token) +{ + struct vduse_iova_domain *domain = token.iova_domain; return domain->bounce_size; } -static const struct dma_map_ops vduse_dev_dma_ops = { +static const struct virtio_map_ops vduse_map_ops = { + .sync_single_for_device = vduse_dev_sync_single_for_device, + .sync_single_for_cpu = vduse_dev_sync_single_for_cpu, .map_page = vduse_dev_map_page, .unmap_page = vduse_dev_unmap_page, .alloc = vduse_dev_alloc_coherent, .free = vduse_dev_free_coherent, + .need_sync = vduse_dev_need_sync, + .mapping_error = vduse_dev_mapping_error, .max_mapping_size = vduse_dev_max_mapping_size, }; @@ -863,7 +944,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev, return -EINVAL; index = array_index_nospec(eventfd->index, dev->vq_num); - vq = &dev->vqs[index]; + vq = dev->vqs[index]; if (eventfd->fd >= 0) { ctx = eventfd_ctx_fdget(eventfd->fd); if (IS_ERR(ctx)) @@ -876,7 +957,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev, eventfd_ctx_put(vq->kickfd); vq->kickfd = ctx; if (vq->ready && vq->kicked && vq->kickfd) { - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); vq->kicked = false; } spin_unlock(&vq->kick_lock); @@ -889,7 +970,7 @@ static bool vduse_dev_is_ready(struct vduse_dev *dev) int i; for (i = 0; i < dev->vq_num; i++) - if (!dev->vqs[i].num_max) + if (!dev->vqs[i]->num_max) return false; return true; @@ -899,10 +980,10 @@ static void vduse_dev_irq_inject(struct work_struct *work) { struct vduse_dev *dev = container_of(work, struct vduse_dev, inject); - spin_lock_irq(&dev->irq_lock); + spin_lock_bh(&dev->irq_lock); if (dev->config_cb.callback) dev->config_cb.callback(dev->config_cb.private); - spin_unlock_irq(&dev->irq_lock); + spin_unlock_bh(&dev->irq_lock); } static void vduse_vq_irq_inject(struct work_struct *work) @@ -910,14 +991,32 @@ static void vduse_vq_irq_inject(struct work_struct *work) struct vduse_virtqueue *vq = container_of(work, struct vduse_virtqueue, inject); - spin_lock_irq(&vq->irq_lock); + spin_lock_bh(&vq->irq_lock); if (vq->ready && vq->cb.callback) vq->cb.callback(vq->cb.private); + spin_unlock_bh(&vq->irq_lock); +} + +static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq) +{ + bool signal = false; + + if (!vq->cb.trigger) + return false; + + spin_lock_irq(&vq->irq_lock); + if (vq->ready && vq->cb.trigger) { + eventfd_signal(vq->cb.trigger); + signal = true; + } spin_unlock_irq(&vq->irq_lock); + + return signal; } static int vduse_dev_queue_irq_work(struct vduse_dev *dev, - struct work_struct *irq_work) + struct work_struct *irq_work, + int irq_effective_cpu) { int ret = -EINVAL; @@ -926,7 +1025,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev, goto unlock; ret = 0; - queue_work(vduse_irq_wq, irq_work); + if (irq_effective_cpu == IRQ_UNBOUND) + queue_work(vduse_irq_wq, irq_work); + else + queue_work_on(irq_effective_cpu, + vduse_irq_bound_wq, irq_work); unlock: up_read(&dev->rwsem); @@ -944,6 +1047,9 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev, goto unlock; ret = -EINVAL; + if (!dev->domain) + goto unlock; + if (dev->umem->iova != iova || size != dev->domain->bounce_size) goto unlock; @@ -970,7 +1076,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, unsigned long npages, lock_limit; int ret; - if (!dev->domain->bounce_map || + if (!dev->domain || !dev->domain->bounce_map || size != dev->domain->bounce_size || iova != 0 || uaddr & ~PAGE_MASK) return -EINVAL; @@ -995,7 +1101,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, - page_list, NULL); + page_list); if (pinned != npages) { ret = pinned < 0 ? pinned : -ENOMEM; goto out; @@ -1029,6 +1135,22 @@ unlock: return ret; } +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) +{ + int curr_cpu = vq->irq_effective_cpu; + + while (true) { + curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity); + if (cpu_online(curr_cpu)) + break; + + if (curr_cpu >= nr_cpu_ids) + curr_cpu = IRQ_UNBOUND; + } + + vq->irq_effective_cpu = curr_cpu; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1044,7 +1166,6 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, struct vduse_iotlb_entry entry; struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; - struct vduse_iova_domain *domain = dev->domain; struct file *f = NULL; ret = -EFAULT; @@ -1055,8 +1176,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (entry.start > entry.last) break; - spin_lock(&domain->iotlb_lock); - map = vhost_iotlb_itree_first(domain->iotlb, + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, entry.start, entry.last); if (map) { map_file = (struct vdpa_map_file *)map->opaque; @@ -1066,7 +1192,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, entry.last = map->last; entry.perm = map->perm; } - spin_unlock(&domain->iotlb_lock); + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); ret = -EINVAL; if (!f) break; @@ -1076,7 +1203,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, fput(f); break; } - ret = receive_fd(f, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } @@ -1111,7 +1238,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; } case VDUSE_DEV_INJECT_CONFIG_IRQ: - ret = vduse_dev_queue_irq_work(dev, &dev->inject); + ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND); break; case VDUSE_VQ_SETUP: { struct vduse_vq_config config; @@ -1130,7 +1257,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; index = array_index_nospec(config.index, dev->vq_num); - dev->vqs[index].num_max = config.max_size; + dev->vqs[index]->num_max = config.max_size; ret = 0; break; } @@ -1148,7 +1275,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; index = array_index_nospec(vq_info.index, dev->vq_num); - vq = &dev->vqs[index]; + vq = dev->vqs[index]; vq_info.desc_addr = vq->desc_addr; vq_info.driver_addr = vq->driver_addr; vq_info.device_addr = vq->device_addr; @@ -1197,8 +1324,14 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (index >= dev->vq_num) break; + ret = 0; index = array_index_nospec(index, dev->vq_num); - ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); + if (!vduse_vq_signal_irqfd(dev->vqs[index])) { + vduse_vq_update_effective_cpu(dev->vqs[index]); + ret = vduse_dev_queue_irq_work(dev, + &dev->vqs[index]->inject, + dev->vqs[index]->irq_effective_cpu); + } break; } case VDUSE_IOTLB_REG_UMEM: { @@ -1213,8 +1346,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(umem.reserved))) break; + mutex_lock(&dev->domain_lock); ret = vduse_dev_reg_umem(dev, umem.iova, umem.uaddr, umem.size); + mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_DEREG_UMEM: { @@ -1228,15 +1363,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!is_mem_zero((const char *)umem.reserved, sizeof(umem.reserved))) break; - + mutex_lock(&dev->domain_lock); ret = vduse_dev_dereg_umem(dev, umem.iova, umem.size); + mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; struct vhost_iotlb_map *map; - struct vduse_iova_domain *domain = dev->domain; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) @@ -1250,18 +1385,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(info.reserved))) break; - spin_lock(&domain->iotlb_lock); - map = vhost_iotlb_itree_first(domain->iotlb, + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, info.start, info.last); if (map) { info.start = map->start; info.last = map->last; info.capability = 0; - if (domain->bounce_map && map->start == 0 && - map->last == domain->bounce_size - 1) + if (dev->domain->bounce_map && map->start == 0 && + map->last == dev->domain->bounce_size - 1) info.capability |= VDUSE_IOVA_CAP_UMEM; } - spin_unlock(&domain->iotlb_lock); + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); if (!map) break; @@ -1284,7 +1425,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file) { struct vduse_dev *dev = file->private_data; - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + mutex_lock(&dev->domain_lock); + if (dev->domain) + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + mutex_unlock(&dev->domain_lock); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ list_splice_init(&dev->recv_list, &dev->send_list); @@ -1339,6 +1483,161 @@ static const struct file_operations vduse_dev_fops = { .llseek = noop_llseek, }; +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf) +{ + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity)); +} + +static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq, + const char *buf, size_t count) +{ + cpumask_var_t new_value; + int ret; + + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, new_value); + if (ret) + goto free_mask; + + ret = -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) + goto free_mask; + + cpumask_copy(&vq->irq_affinity, new_value); + ret = count; +free_mask: + free_cpumask_var(new_value); + return ret; +} + +struct vq_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct vduse_virtqueue *vq, char *buf); + ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf, + size_t count); +}; + +static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity); + +static struct attribute *vq_attrs[] = { + &irq_cb_affinity_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vq); + +static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->show) + return -EIO; + + return entry->show(vq, buf); +} + +static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->store) + return -EIO; + + return entry->store(vq, buf, count); +} + +static const struct sysfs_ops vq_sysfs_ops = { + .show = vq_attr_show, + .store = vq_attr_store, +}; + +static void vq_release(struct kobject *kobj) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + kfree(vq); +} + +static const struct kobj_type vq_type = { + .release = vq_release, + .sysfs_ops = &vq_sysfs_ops, + .default_groups = vq_groups, +}; + +static char *vduse_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev)); +} + +static const struct class vduse_class = { + .name = "vduse", + .devnode = vduse_devnode, +}; + +static void vduse_dev_deinit_vqs(struct vduse_dev *dev) +{ + int i; + + if (!dev->vqs) + return; + + for (i = 0; i < dev->vq_num; i++) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); +} + +static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num) +{ + int ret, i; + + dev->vq_align = vq_align; + dev->vq_num = vq_num; + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); + if (!dev->vqs) + return -ENOMEM; + + for (i = 0; i < vq_num; i++) { + dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL); + if (!dev->vqs[i]) { + ret = -ENOMEM; + goto err; + } + + dev->vqs[i]->index = i; + dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND; + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work); + spin_lock_init(&dev->vqs[i]->kick_lock); + spin_lock_init(&dev->vqs[i]->irq_lock); + cpumask_setall(&dev->vqs[i]->irq_affinity); + + kobject_init(&dev->vqs[i]->kobj, &vq_type); + ret = kobject_add(&dev->vqs[i]->kobj, + &dev->dev->kobj, "vq%d", i); + if (ret) { + kfree(dev->vqs[i]); + goto err; + } + } + + return 0; +err: + while (i--) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); + dev->vqs = NULL; + return ret; +} + static struct vduse_dev *vduse_dev_create(void) { struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1348,6 +1647,7 @@ static struct vduse_dev *vduse_dev_create(void) mutex_init(&dev->lock); mutex_init(&dev->mem_lock); + mutex_init(&dev->domain_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); INIT_LIST_HEAD(&dev->recv_list); @@ -1393,11 +1693,12 @@ static int vduse_destroy_dev(char *name) mutex_unlock(&dev->lock); vduse_dev_reset(dev); - device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); + device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); idr_remove(&vduse_idr, dev->minor); kvfree(dev->config); - kfree(dev->vqs); - vduse_domain_destroy(dev->domain); + vduse_dev_deinit_vqs(dev); + if (dev->domain) + vduse_domain_destroy(dev->domain); kfree(dev->name); vduse_dev_destroy(dev); module_put(THIS_MODULE); @@ -1416,13 +1717,21 @@ static bool device_is_allowed(u32 device_id) return false; } -static bool features_is_valid(u64 features) +static bool features_is_valid(struct vduse_dev_config *config) { - if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) + if (!(config->features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) return false; /* Now we only support read-only configuration space */ - if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)) + if ((config->device_id == VIRTIO_ID_BLOCK) && + (config->features & BIT_ULL(VIRTIO_BLK_F_CONFIG_WCE))) + return false; + else if ((config->device_id == VIRTIO_ID_NET) && + (config->features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + return false; + + if ((config->device_id == VIRTIO_ID_NET) && + !(config->features & BIT_ULL(VIRTIO_F_VERSION_1))) return false; return true; @@ -1443,10 +1752,13 @@ static bool vduse_validate_config(struct vduse_dev_config *config) if (config->vq_num > 0xffff) return false; + if (!config->name[0]) + return false; + if (!device_is_allowed(config->device_id)) return false; - if (!features_is_valid(config->features)) + if (!features_is_valid(config)) return false; return true; @@ -1476,8 +1788,48 @@ static ssize_t msg_timeout_store(struct device *device, static DEVICE_ATTR_RW(msg_timeout); +static ssize_t bounce_size_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->bounce_size); +} + +static ssize_t bounce_size_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + unsigned int bounce_size; + int ret; + + ret = -EPERM; + mutex_lock(&dev->domain_lock); + if (dev->domain) + goto unlock; + + ret = kstrtouint(buf, 10, &bounce_size); + if (ret < 0) + goto unlock; + + ret = -EINVAL; + if (bounce_size > VDUSE_MAX_BOUNCE_SIZE || + bounce_size < VDUSE_MIN_BOUNCE_SIZE) + goto unlock; + + dev->bounce_size = bounce_size & PAGE_MASK; + ret = count; +unlock: + mutex_unlock(&dev->domain_lock); + return ret; +} + +static DEVICE_ATTR_RW(bounce_size); + static struct attribute *vduse_dev_attrs[] = { &dev_attr_msg_timeout.attr, + &dev_attr_bounce_size.attr, NULL }; @@ -1486,9 +1838,13 @@ ATTRIBUTE_GROUPS(vduse_dev); static int vduse_create_dev(struct vduse_dev_config *config, void *config_buf, u64 api_version) { - int i, ret; + int ret; struct vduse_dev *dev; + ret = -EPERM; + if ((config->device_id == VIRTIO_ID_NET) && !capable(CAP_NET_ADMIN)) + goto err; + ret = -EEXIST; if (vduse_find_dev(config->name)) goto err; @@ -1506,26 +1862,9 @@ static int vduse_create_dev(struct vduse_dev_config *config, if (!dev->name) goto err_str; - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, - VDUSE_BOUNCE_SIZE); - if (!dev->domain) - goto err_domain; - + dev->bounce_size = VDUSE_BOUNCE_SIZE; dev->config = config_buf; dev->config_size = config->config_size; - dev->vq_align = config->vq_align; - dev->vq_num = config->vq_num; - dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); - if (!dev->vqs) - goto err_vqs; - - for (i = 0; i < dev->vq_num; i++) { - dev->vqs[i].index = i; - INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject); - INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work); - spin_lock_init(&dev->vqs[i].kick_lock); - spin_lock_init(&dev->vqs[i].irq_lock); - } ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL); if (ret < 0) @@ -1533,23 +1872,26 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->minor = ret; dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT; - dev->dev = device_create_with_groups(vduse_class, NULL, + dev->dev = device_create_with_groups(&vduse_class, NULL, MKDEV(MAJOR(vduse_major), dev->minor), dev, vduse_dev_groups, "%s", config->name); if (IS_ERR(dev->dev)) { ret = PTR_ERR(dev->dev); goto err_dev; } + + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); + if (ret) + goto err_vqs; + __module_get(THIS_MODULE); return 0; +err_vqs: + device_destroy(&vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); err_dev: idr_remove(&vduse_idr, dev->minor); err_idr: - kfree(dev->vqs); -err_vqs: - vduse_domain_destroy(dev->domain); -err_domain: kfree(dev->name); err_str: vduse_dev_destroy(dev); @@ -1659,11 +2001,6 @@ static const struct file_operations vduse_ctrl_fops = { .llseek = noop_llseek, }; -static char *vduse_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev)); -} - struct vduse_mgmt_dev { struct vdpa_mgmt_dev mgmt_dev; struct device dev; @@ -1674,26 +2011,18 @@ static struct vduse_mgmt_dev *vduse_mgmt; static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) { struct vduse_vdpa *vdev; - int ret; if (dev->vdev) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, 1, 1, name, true); + &vduse_vdpa_config_ops, &vduse_map_ops, + 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); dev->vdev = vdev; vdev->dev = dev; - vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; - ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); - if (ret) { - put_device(&vdev->vdpa.dev); - return ret; - } - set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); - vdev->vdpa.dma_dev = &vdev->vdpa.dev; vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; return 0; @@ -1716,9 +2045,24 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (ret) return ret; + mutex_lock(&dev->domain_lock); + if (!dev->domain) + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + dev->bounce_size); + mutex_unlock(&dev->domain_lock); + if (!dev->domain) { + put_device(&dev->vdev->vdpa.dev); + return -ENOMEM; + } + + dev->vdev->vdpa.vmap.iova_domain = dev->domain; ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { put_device(&dev->vdev->vdpa.dev); + mutex_lock(&dev->domain_lock); + vduse_domain_destroy(dev->domain); + dev->domain = NULL; + mutex_unlock(&dev->domain_lock); return ret; } @@ -1737,6 +2081,7 @@ static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = { static struct virtio_device_id id_table[] = { { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, { 0 }, }; @@ -1793,11 +2138,9 @@ static int vduse_init(void) int ret; struct device *dev; - vduse_class = class_create(THIS_MODULE, "vduse"); - if (IS_ERR(vduse_class)) - return PTR_ERR(vduse_class); - - vduse_class->devnode = vduse_devnode; + ret = class_register(&vduse_class); + if (ret) + return ret; ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse"); if (ret) @@ -1810,7 +2153,7 @@ static int vduse_init(void) if (ret) goto err_ctrl_cdev; - dev = device_create(vduse_class, NULL, vduse_major, NULL, "control"); + dev = device_create(&vduse_class, NULL, vduse_major, NULL, "control"); if (IS_ERR(dev)) { ret = PTR_ERR(dev); goto err_device; @@ -1824,12 +2167,16 @@ static int vduse_init(void) if (ret) goto err_cdev; + ret = -ENOMEM; vduse_irq_wq = alloc_workqueue("vduse-irq", WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); - if (!vduse_irq_wq) { - ret = -ENOMEM; + if (!vduse_irq_wq) goto err_wq; - } + + vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", + WQ_HIGHPRI | WQ_PERCPU, 0); + if (!vduse_irq_bound_wq) + goto err_bound_wq; ret = vduse_domain_init(); if (ret) @@ -1843,17 +2190,19 @@ static int vduse_init(void) err_mgmtdev: vduse_domain_exit(); err_domain: + destroy_workqueue(vduse_irq_bound_wq); +err_bound_wq: destroy_workqueue(vduse_irq_wq); err_wq: cdev_del(&vduse_cdev); err_cdev: - device_destroy(vduse_class, vduse_major); + device_destroy(&vduse_class, vduse_major); err_device: cdev_del(&vduse_ctrl_cdev); err_ctrl_cdev: unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); err_chardev_region: - class_destroy(vduse_class); + class_unregister(&vduse_class); return ret; } module_init(vduse_init); @@ -1862,12 +2211,14 @@ static void vduse_exit(void) { vduse_mgmtdev_exit(); vduse_domain_exit(); + destroy_workqueue(vduse_irq_bound_wq); destroy_workqueue(vduse_irq_wq); cdev_del(&vduse_cdev); - device_destroy(vduse_class, vduse_major); + device_destroy(&vduse_class, vduse_major); cdev_del(&vduse_ctrl_cdev); unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); - class_destroy(vduse_class); + class_unregister(&vduse_class); + idr_destroy(&vduse_idr); } module_exit(vduse_exit); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 8fe267ca3e76..17a19a728c9c 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -160,7 +160,13 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) struct pci_dev *pdev = mdev->pci_dev; int i, ret, irq; int queues = vp_vdpa->queues; - int vectors = queues + 1; + int vectors = 1; + int msix_vec = 0; + + for (i = 0; i < queues; i++) { + if (vp_vdpa->vring[i].cb.callback) + vectors++; + } ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); if (ret != vectors) { @@ -173,9 +179,12 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) vp_vdpa->vectors = vectors; for (i = 0; i < queues; i++) { + if (!vp_vdpa->vring[i].cb.callback) + continue; + snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-%d\n", pci_name(pdev), i); - irq = pci_irq_vector(pdev, i); + irq = pci_irq_vector(pdev, msix_vec); ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_vq_handler, 0, vp_vdpa->vring[i].msix_name, @@ -185,21 +194,22 @@ static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) "vp_vdpa: fail to request irq for vq %d\n", i); goto err; } - vp_modern_queue_vector(mdev, i, i); + vp_modern_queue_vector(mdev, i, msix_vec); vp_vdpa->vring[i].irq = irq; + msix_vec++; } snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config\n", pci_name(pdev)); - irq = pci_irq_vector(pdev, queues); + irq = pci_irq_vector(pdev, msix_vec); ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_config_handler, 0, vp_vdpa->msix_name, vp_vdpa); if (ret) { dev_err(&pdev->dev, - "vp_vdpa: fail to request irq for vq %d\n", i); + "vp_vdpa: fail to request irq for config: %d\n", ret); goto err; } - vp_modern_config_vector(mdev, queues); + vp_modern_config_vector(mdev, msix_vec); vp_vdpa->config_irq = irq; return 0; @@ -216,7 +226,10 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status) if (status & VIRTIO_CONFIG_S_DRIVER_OK && !(s & VIRTIO_CONFIG_S_DRIVER_OK)) { - vp_vdpa_request_irq(vp_vdpa); + if (vp_vdpa_request_irq(vp_vdpa)) { + WARN_ON(1); + return; + } } vp_modern_set_status(mdev, status); @@ -328,6 +341,13 @@ static void vp_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid, vp_modern_set_queue_size(mdev, qid, num); } +static u16 vp_vdpa_get_vq_size(struct vdpa_device *vdpa, u16 qid) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_queue_size(mdev, qid); +} + static int vp_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid, u64 desc_area, u64 driver_area, u64 device_area) @@ -347,6 +367,14 @@ static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid) vp_iowrite16(qid, vp_vdpa->vring[qid].notify); } +static void vp_vdpa_kick_vq_with_data(struct vdpa_device *vdpa, u32 data) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + u16 qid = data & 0xFFFF; + + vp_iowrite32(data, vp_vdpa->vring[qid].notify); +} + static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa) { struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); @@ -449,8 +477,10 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .set_vq_ready = vp_vdpa_set_vq_ready, .get_vq_ready = vp_vdpa_get_vq_ready, .set_vq_num = vp_vdpa_set_vq_num, + .get_vq_size = vp_vdpa_get_vq_size, .set_vq_address = vp_vdpa_set_vq_address, .kick_vq = vp_vdpa_kick_vq, + .kick_vq_with_data = vp_vdpa_kick_vq_with_data, .get_generation = vp_vdpa_get_generation, .get_device_id = vp_vdpa_get_device_id, .get_vendor_id = vp_vdpa_get_vendor_id, @@ -481,7 +511,8 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, int ret, i; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, 1, 1, name, false); + dev, &vp_vdpa_ops, NULL, + 1, 1, name, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); @@ -490,7 +521,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; - vp_vdpa->vdpa.dma_dev = &pdev->dev; + vp_vdpa->vdpa.vmap.dma_dev = &pdev->dev; vp_vdpa->queues = vp_modern_get_num_queues(mdev); vp_vdpa->mdev = mdev; @@ -591,7 +622,11 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto mdev_err; } - mdev_id = kzalloc(sizeof(struct virtio_device_id), GFP_KERNEL); + /* + * id_table should be a null terminated array, so allocate one additional + * entry here, see vdpa_mgmtdev_get_classes(). + */ + mdev_id = kcalloc(2, sizeof(struct virtio_device_id), GFP_KERNEL); if (!mdev_id) { err = -ENOMEM; goto mdev_id_err; @@ -611,8 +646,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto probe_err; } - mdev_id->device = mdev->id.device; - mdev_id->vendor = mdev->id.vendor; + mdev_id[0].device = mdev->id.device; + mdev_id[0].vendor = mdev->id.vendor; mgtdev->id_table = mdev_id; mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev); mgtdev->supported_features = vp_modern_get_features(mdev); @@ -645,8 +680,8 @@ static void vp_vdpa_remove(struct pci_dev *pdev) struct virtio_pci_modern_device *mdev = NULL; mdev = vp_vdpa_mgtdev->mdev; - vp_modern_remove(mdev); vdpa_mgmtdev_unregister(&vp_vdpa_mgtdev->mgtdev); + vp_modern_remove(mdev); kfree(vp_vdpa_mgtdev->mgtdev.id_table); kfree(mdev); kfree(vp_vdpa_mgtdev); |
