diff options
Diffstat (limited to 'drivers/vdpa')
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.c | 14 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.h | 2 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_main.c | 144 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/mlx5_vdpa.h | 11 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/net/mlx5_vnet.c | 173 | ||||
-rw-r--r-- | drivers/vdpa/vdpa.c | 14 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 176 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 3 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/iova_domain.c | 102 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/iova_domain.h | 8 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/vduse_dev.c | 180 |
13 files changed, 706 insertions, 140 deletions
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index 48c4dadb0c7c..75a703b803a2 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -29,7 +29,6 @@ u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector) { struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; - cfg = hw->common_cfg; vp_iowrite16(vector, &cfg->msix_config); return vp_ioread16(&cfg->msix_config); @@ -128,6 +127,7 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) break; case VIRTIO_PCI_CAP_DEVICE_CFG: hw->dev_cfg = get_cap_addr(hw, &cap); + hw->cap_dev_config_size = le32_to_cpu(cap.length); IFCVF_DBG(pdev, "hw->dev_cfg = %p\n", hw->dev_cfg); break; } @@ -233,15 +233,23 @@ int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) u32 ifcvf_get_config_size(struct ifcvf_hw *hw) { struct ifcvf_adapter *adapter; + u32 net_config_size = sizeof(struct virtio_net_config); + u32 blk_config_size = sizeof(struct virtio_blk_config); + u32 cap_size = hw->cap_dev_config_size; u32 config_size; adapter = vf_to_adapter(hw); + /* If the onboard device config space size is greater than + * the size of struct virtio_net/blk_config, only the spec + * implementing contents size is returned, this is very + * unlikely, defensive programming. 
+ */ switch (hw->dev_type) { case VIRTIO_ID_NET: - config_size = sizeof(struct virtio_net_config); + config_size = min(cap_size, net_config_size); break; case VIRTIO_ID_BLOCK: - config_size = sizeof(struct virtio_blk_config); + config_size = min(cap_size, blk_config_size); break; default: config_size = 0; diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 115b61f4924b..f5563f665cc6 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -87,6 +87,8 @@ struct ifcvf_hw { int config_irq; int vqs_reused_irq; u16 nr_vring; + /* VIRTIO_PCI_CAP_DEVICE_CFG size */ + u32 cap_dev_config_size; }; struct ifcvf_adapter { diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 0a5670729412..f9c0044c6442 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -685,7 +685,7 @@ static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_devic } /* - * IFCVF currently does't have on-chip IOMMU, so not + * IFCVF currently doesn't have on-chip IOMMU, so not * implemented set_map()/dma_map()/dma_unmap() */ static const struct vdpa_config_ops ifc_vdpa_ops = { @@ -752,59 +752,36 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, { struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; struct ifcvf_adapter *adapter; + struct vdpa_device *vdpa_dev; struct pci_dev *pdev; struct ifcvf_hw *vf; - struct device *dev; - int ret, i; + int ret; ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); - if (ifcvf_mgmt_dev->adapter) + if (!ifcvf_mgmt_dev->adapter) return -EOPNOTSUPP; - pdev = ifcvf_mgmt_dev->pdev; - dev = &pdev->dev; - adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, 1, 1, name, false); - if (IS_ERR(adapter)) { - IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); - return PTR_ERR(adapter); - } - - ifcvf_mgmt_dev->adapter = adapter; - + adapter = ifcvf_mgmt_dev->adapter; vf = 
&adapter->vf; - vf->dev_type = get_dev_type(pdev); - vf->base = pcim_iomap_table(pdev); + pdev = adapter->pdev; + vdpa_dev = &adapter->vdpa; - adapter->pdev = pdev; - adapter->vdpa.dma_dev = &pdev->dev; - - ret = ifcvf_init_hw(vf, pdev); - if (ret) { - IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); - goto err; - } - - for (i = 0; i < vf->nr_vring; i++) - vf->vring[i].irq = -EINVAL; - - vf->hw_features = ifcvf_get_hw_features(vf); - vf->config_size = ifcvf_get_config_size(vf); + if (name) + ret = dev_set_name(&vdpa_dev->dev, "%s", name); + else + ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index); - adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring); if (ret) { + put_device(&adapter->vdpa.dev); IFCVF_ERR(pdev, "Failed to register to vDPA bus"); - goto err; + return ret; } return 0; - -err: - put_device(&adapter->vdpa.dev); - return ret; } + static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) { struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; @@ -823,61 +800,94 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; struct device *dev = &pdev->dev; + struct ifcvf_adapter *adapter; + struct ifcvf_hw *vf; u32 dev_type; - int ret; - - ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); - if (!ifcvf_mgmt_dev) { - IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); - return -ENOMEM; - } - - dev_type = get_dev_type(pdev); - switch (dev_type) { - case VIRTIO_ID_NET: - ifcvf_mgmt_dev->mdev.id_table = id_table_net; - break; - case VIRTIO_ID_BLOCK: - ifcvf_mgmt_dev->mdev.id_table = id_table_blk; - break; - default: - IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type); - ret = -EOPNOTSUPP; - goto err; - } - - ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; - ifcvf_mgmt_dev->mdev.device = dev; - ifcvf_mgmt_dev->pdev = pdev; + int ret, i; ret = 
pcim_enable_device(pdev); if (ret) { IFCVF_ERR(pdev, "Failed to enable device\n"); - goto err; + return ret; } - ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), IFCVF_DRIVER_NAME); if (ret) { IFCVF_ERR(pdev, "Failed to request MMIO region\n"); - goto err; + return ret; } ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (ret) { IFCVF_ERR(pdev, "No usable DMA configuration\n"); - goto err; + return ret; } ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); if (ret) { IFCVF_ERR(pdev, "Failed for adding devres for freeing irq vectors\n"); - goto err; + return ret; } pci_set_master(pdev); + adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, + dev, &ifc_vdpa_ops, 1, 1, NULL, false); + if (IS_ERR(adapter)) { + IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); + return PTR_ERR(adapter); + } + + vf = &adapter->vf; + vf->dev_type = get_dev_type(pdev); + vf->base = pcim_iomap_table(pdev); + + adapter->pdev = pdev; + adapter->vdpa.dma_dev = &pdev->dev; + + ret = ifcvf_init_hw(vf, pdev); + if (ret) { + IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); + return ret; + } + + for (i = 0; i < vf->nr_vring; i++) + vf->vring[i].irq = -EINVAL; + + vf->hw_features = ifcvf_get_hw_features(vf); + vf->config_size = ifcvf_get_config_size(vf); + + ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); + if (!ifcvf_mgmt_dev) { + IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); + return -ENOMEM; + } + + ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; + ifcvf_mgmt_dev->mdev.device = dev; + ifcvf_mgmt_dev->adapter = adapter; + + dev_type = get_dev_type(pdev); + switch (dev_type) { + case VIRTIO_ID_NET: + ifcvf_mgmt_dev->mdev.id_table = id_table_net; + break; + case VIRTIO_ID_BLOCK: + ifcvf_mgmt_dev->mdev.id_table = id_table_blk; + break; + default: + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type); + ret = -EOPNOTSUPP; + goto err; + } + + ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring; + 
ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features; + + adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; + + ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev); if (ret) { IFCVF_ERR(pdev, diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 44104093163b..6af9fdbb86b7 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -70,6 +70,16 @@ struct mlx5_vdpa_wq_ent { struct mlx5_vdpa_dev *mvdev; }; +enum { + MLX5_VDPA_DATAVQ_GROUP, + MLX5_VDPA_CVQ_GROUP, + MLX5_VDPA_NUMVQ_GROUPS +}; + +enum { + MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS +}; + struct mlx5_vdpa_dev { struct vdpa_device vdev; struct mlx5_core_dev *mdev; @@ -85,6 +95,7 @@ struct mlx5_vdpa_dev { struct mlx5_vdpa_mr mr; struct mlx5_control_vq cvq; struct workqueue_struct *wq; + unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; }; int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index e85c1d71f4ed..ed100a35e596 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -164,6 +164,7 @@ struct mlx5_vdpa_net { bool setup; u32 cur_num_vqs; u32 rqt_size; + bool nb_registered; struct notifier_block nb; struct vdpa_callback config_cb; struct mlx5_vdpa_wq_ent cvq_ent; @@ -895,6 +896,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque if (err) goto err_cmd; + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT; kfree(in); mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); @@ -922,6 +924,7 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id); return; } + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; umems_destroy(ndev, mvq); } @@ -1121,6 +1124,20 @@ err_cmd: return err; } +static bool is_valid_state_change(int oldstate, int newstate) +{ + switch (oldstate) { + case 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR: + default: + return false; + } +} + static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) { int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); @@ -1130,6 +1147,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque void *in; int err; + if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) + return 0; + + if (!is_valid_state_change(mvq->fw_state, state)) + return -EINVAL; + in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; @@ -1440,7 +1463,7 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16); dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16); - memset(dmac_c, 0xff, ETH_ALEN); + eth_broadcast_addr(dmac_c); ether_addr_copy(dmac_v, mac); MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); if (tagged) { @@ -1992,6 +2015,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; + int err; if (!mvdev->actual_features) return; @@ -2005,8 +2029,16 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready } mvq = &ndev->vqs[idx]; - if (!ready) + if (!ready) { suspend_vq(ndev, mvq); + } else { + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + if (err) { + mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err); + ready = false; + } + } + mvq->ready = ready; } @@ -2095,9 
+2127,14 @@ static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev) return PAGE_SIZE; } -static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx) { - return 0; + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (is_ctrl_vq_idx(mvdev, idx)) + return MLX5_VDPA_CVQ_GROUP; + + return MLX5_VDPA_DATAVQ_GROUP; } enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, @@ -2511,6 +2548,15 @@ err_clear: up_write(&ndev->reslock); } +static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev) +{ + int i; + + /* default mapping all groups are mapped to asid 0 */ + for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++) + mvdev->group2asid[i] = 0; +} + static int mlx5_vdpa_reset(struct vdpa_device *vdev) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -2529,7 +2575,9 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) ndev->mvdev.cvq.completed_desc = 0; memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1)); ndev->mvdev.actual_features = 0; + init_group_to_asid_map(mvdev); ++mvdev->generation; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { if (mlx5_vdpa_create_mr(mvdev, NULL)) mlx5_vdpa_warn(mvdev, "create MR failed\n"); @@ -2567,26 +2615,63 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) return mvdev->generation; } -static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, - struct vhost_iotlb *iotlb) +static int set_map_control(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +{ + u64 start = 0ULL, last = 0ULL - 1; + struct vhost_iotlb_map *map; + int err = 0; + + spin_lock(&mvdev->cvq.iommu_lock); + vhost_iotlb_reset(mvdev->cvq.iotlb); + + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, + map->last, map->addr, map->perm); + if (err) + goto out; + } + +out: + spin_unlock(&mvdev->cvq.iommu_lock); + return err; +} + 
+static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) { - struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); - struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); bool change_map; int err; - down_write(&ndev->reslock); - err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); if (err) { mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); - goto err; + return err; } if (change_map) err = mlx5_vdpa_change_map(mvdev, iotlb); -err: + return err; +} + +static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, + struct vhost_iotlb *iotlb) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err = -EINVAL; + + down_write(&ndev->reslock); + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { + err = set_map_data(mvdev, iotlb); + if (err) + goto out; + } + + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) + err = set_map_control(mvdev, iotlb); + +out: up_write(&ndev->reslock); return err; } @@ -2733,6 +2818,49 @@ out_err: return err; } +static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_control_vq *cvq; + + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + return; + + cvq = &mvdev->cvq; + cvq->ready = false; +} + +static int mlx5_vdpa_suspend(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + int i; + + down_write(&ndev->reslock); + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); + ndev->nb_registered = false; + flush_workqueue(ndev->mvdev.wq); + for (i = 0; i < ndev->cur_num_vqs; i++) { + mvq = &ndev->vqs[i]; + suspend_vq(ndev, mvq); + } + mlx5_vdpa_cvq_suspend(mvdev); + up_write(&ndev->reslock); + return 0; +} + +static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, + unsigned int asid) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + + if (group >= MLX5_VDPA_NUMVQ_GROUPS) + 
return -EINVAL; + + mvdev->group2asid[group] = asid; + return 0; +} + static const struct vdpa_config_ops mlx5_vdpa_ops = { .set_vq_address = mlx5_vdpa_set_vq_address, .set_vq_num = mlx5_vdpa_set_vq_num, @@ -2762,7 +2890,9 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, .set_map = mlx5_vdpa_set_map, + .set_group_asid = mlx5_set_group_asid, .free = mlx5_vdpa_free, + .suspend = mlx5_vdpa_suspend, }; static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) @@ -2828,6 +2958,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev) mvq->index = i; mvq->ndev = ndev; mvq->fwqp.fw = true; + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; } for (; i < ndev->mvdev.max_vqs; i++) { mvq = &ndev->vqs[i]; @@ -2902,13 +3033,21 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p switch (eqe->sub_type) { case MLX5_PORT_CHANGE_SUBTYPE_DOWN: case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + down_read(&ndev->reslock); + if (!ndev->nb_registered) { + up_read(&ndev->reslock); + return NOTIFY_DONE; + } wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); - if (!wqent) + if (!wqent) { + up_read(&ndev->reslock); return NOTIFY_DONE; + } wqent->mvdev = &ndev->mvdev; INIT_WORK(&wqent->work, update_carrier); queue_work(ndev->mvdev.wq, &wqent->work); + up_read(&ndev->reslock); ret = NOTIFY_OK; break; default: @@ -2982,7 +3121,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - 1, 1, name, false); + MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); @@ -3062,6 +3201,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, ndev->nb.notifier_call = event_handler; mlx5_notifier_register(mdev, &ndev->nb); + ndev->nb_registered = true; mvdev->vdev.mdev = &mgtdev->mgtdev; err = _vdpa_register_device(&mvdev->vdev, 
max_vqs + 1); if (err) @@ -3093,7 +3233,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct workqueue_struct *wq; - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); + if (ndev->nb_registered) { + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); + ndev->nb_registered = false; + } wq = mvdev->wq; mvdev->wq = NULL; destroy_workqueue(wq); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index ebf2f363fbe7..c06c02704461 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -824,11 +824,11 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms config.mac)) return -EMSGSIZE; - val_u16 = le16_to_cpu(config.status); + val_u16 = __virtio16_to_cpu(true, config.status); if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16)) return -EMSGSIZE; - val_u16 = le16_to_cpu(config.mtu); + val_u16 = __virtio16_to_cpu(true, config.mtu); if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16)) return -EMSGSIZE; @@ -846,17 +846,9 @@ vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, { u32 device_id; void *hdr; - u8 status; int err; down_read(&vdev->cf_lock); - status = vdev->config->get_status(vdev); - if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { - NL_SET_ERR_MSG_MOD(extack, "Features negotiation not completed"); - err = -EAGAIN; - goto out; - } - hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { @@ -913,7 +905,7 @@ static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); - max_vqp = le16_to_cpu(config.max_virtqueue_pairs); + max_vqp = __virtio16_to_cpu(true, config.max_virtqueue_pairs); if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp)) return -EMSGSIZE; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 0f2865899647..225b7f5d8be3 100644 --- 
a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -33,7 +33,7 @@ MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable"); static int max_iotlb_entries = 2048; module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, - "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)"); + "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)"); #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 @@ -107,6 +107,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) for (i = 0; i < vdpasim->dev_attr.nas; i++) vhost_iotlb_reset(&vdpasim->iommu[i]); + vdpasim->running = true; spin_unlock(&vdpasim->iommu_lock); vdpasim->features = 0; @@ -291,7 +292,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_iommu; for (i = 0; i < vdpasim->dev_attr.nas; i++) - vhost_iotlb_init(&vdpasim->iommu[i], 0, 0); + vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); if (!vdpasim->buffer) @@ -505,6 +506,17 @@ static int vdpasim_reset(struct vdpa_device *vdpa) return 0; } +static int vdpasim_suspend(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + spin_lock(&vdpasim->lock); + vdpasim->running = false; + spin_unlock(&vdpasim->lock); + + return 0; +} + static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -694,6 +706,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = vdpasim_reset, + .suspend = vdpasim_suspend, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -726,6 +739,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = 
vdpasim_reset, + .suspend = vdpasim_suspend, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 622782e92239..061986f30911 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -66,6 +66,7 @@ struct vdpasim { u32 generation; u64 features; u32 groups; + bool running; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 42d401d43911..c8bfea3b7db2 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -25,31 +25,49 @@ #define DRV_LICENSE "GPL v2" #define VDPASIM_BLK_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_BLK_F_FLUSH) | \ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ - (1ULL << VIRTIO_BLK_F_MQ)) + (1ULL << VIRTIO_BLK_F_MQ) | \ + (1ULL << VIRTIO_BLK_F_DISCARD) | \ + (1ULL << VIRTIO_BLK_F_WRITE_ZEROES)) #define VDPASIM_BLK_CAPACITY 0x40000 #define VDPASIM_BLK_SIZE_MAX 0x1000 #define VDPASIM_BLK_SEG_MAX 32 +#define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX + +/* 1 virtqueue, 1 address space, 1 virtqueue group */ #define VDPASIM_BLK_VQ_NUM 1 +#define VDPASIM_BLK_AS_NUM 1 +#define VDPASIM_BLK_GROUP_NUM 1 static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; -static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size) +static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, + u64 num_sectors, u64 max_sectors) { - u64 range_sectors = range_size >> SECTOR_SHIFT; - - if (range_size > VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX) - return false; + if (start_sector > VDPASIM_BLK_CAPACITY) { + dev_dbg(&vdpasim->vdpa.dev, + "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n", + 
start_sector, VDPASIM_BLK_CAPACITY); + } - if (start_sector > VDPASIM_BLK_CAPACITY) + if (num_sectors > max_sectors) { + dev_dbg(&vdpasim->vdpa.dev, + "number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n", + num_sectors, max_sectors); return false; + } - if (range_sectors > VDPASIM_BLK_CAPACITY - start_sector) + if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) { + dev_dbg(&vdpasim->vdpa.dev, + "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n", + start_sector, num_sectors, VDPASIM_BLK_CAPACITY); return false; + } return true; } @@ -63,6 +81,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, { size_t pushed = 0, to_pull, to_push; struct virtio_blk_outhdr hdr; + bool handled = false; ssize_t bytes; loff_t offset; u64 sector; @@ -76,14 +95,14 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, return false; if (vq->out_iov.used < 1 || vq->in_iov.used < 1) { - dev_err(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", + dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", vq->out_iov.used, vq->in_iov.used); - return false; + goto err; } if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) { - dev_err(&vdpasim->vdpa.dev, "request in header too short\n"); - return false; + dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n"); + goto err; } /* The last byte is the status and we checked if the last iov has @@ -96,8 +115,8 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr, sizeof(hdr)); if (bytes != sizeof(hdr)) { - dev_err(&vdpasim->vdpa.dev, "request out header too short\n"); - return false; + dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n"); + goto err; } to_pull -= bytes; @@ -107,12 +126,20 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, offset = sector << SECTOR_SHIFT; status = VIRTIO_BLK_S_OK; + if (type != VIRTIO_BLK_T_IN && type != 
VIRTIO_BLK_T_OUT && + sector != 0) { + dev_dbg(&vdpasim->vdpa.dev, + "sector must be 0 for %u request - sector: 0x%llx\n", + type, sector); + status = VIRTIO_BLK_S_IOERR; + goto err_status; + } + switch (type) { case VIRTIO_BLK_T_IN: - if (!vdpasim_blk_check_range(sector, to_push)) { - dev_err(&vdpasim->vdpa.dev, - "reading over the capacity - offset: 0x%llx len: 0x%zx\n", - offset, to_push); + if (!vdpasim_blk_check_range(vdpasim, sector, + to_push >> SECTOR_SHIFT, + VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { status = VIRTIO_BLK_S_IOERR; break; } @@ -121,7 +148,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, vdpasim->buffer + offset, to_push); if (bytes < 0) { - dev_err(&vdpasim->vdpa.dev, + dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", bytes, offset, to_push); status = VIRTIO_BLK_S_IOERR; @@ -132,10 +159,9 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, break; case VIRTIO_BLK_T_OUT: - if (!vdpasim_blk_check_range(sector, to_pull)) { - dev_err(&vdpasim->vdpa.dev, - "writing over the capacity - offset: 0x%llx len: 0x%zx\n", - offset, to_pull); + if (!vdpasim_blk_check_range(vdpasim, sector, + to_pull >> SECTOR_SHIFT, + VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { status = VIRTIO_BLK_S_IOERR; break; } @@ -144,7 +170,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, vdpasim->buffer + offset, to_pull); if (bytes < 0) { - dev_err(&vdpasim->vdpa.dev, + dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", bytes, offset, to_pull); status = VIRTIO_BLK_S_IOERR; @@ -157,7 +183,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, vdpasim_blk_id, VIRTIO_BLK_ID_BYTES); if (bytes < 0) { - dev_err(&vdpasim->vdpa.dev, + dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_push_iotlb() error: %zd\n", bytes); status = VIRTIO_BLK_S_IOERR; break; @@ -166,13 +192,76 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, pushed += bytes; 
break; + case VIRTIO_BLK_T_FLUSH: + /* nothing to do */ + break; + + case VIRTIO_BLK_T_DISCARD: + case VIRTIO_BLK_T_WRITE_ZEROES: { + struct virtio_blk_discard_write_zeroes range; + u32 num_sectors, flags; + + if (to_pull != sizeof(range)) { + dev_dbg(&vdpasim->vdpa.dev, + "discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n", + to_pull, sizeof(range)); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &range, + to_pull); + if (bytes < 0) { + dev_dbg(&vdpasim->vdpa.dev, + "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_pull); + status = VIRTIO_BLK_S_IOERR; + break; + } + + sector = le64_to_cpu(range.sector); + offset = sector << SECTOR_SHIFT; + num_sectors = le32_to_cpu(range.num_sectors); + flags = le32_to_cpu(range.flags); + + if (type == VIRTIO_BLK_T_DISCARD && flags != 0) { + dev_dbg(&vdpasim->vdpa.dev, + "discard unexpected flags set - flags: 0x%x\n", + flags); + status = VIRTIO_BLK_S_UNSUPP; + break; + } + + if (type == VIRTIO_BLK_T_WRITE_ZEROES && + flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { + dev_dbg(&vdpasim->vdpa.dev, + "write_zeroes unexpected flags set - flags: 0x%x\n", + flags); + status = VIRTIO_BLK_S_UNSUPP; + break; + } + + if (!vdpasim_blk_check_range(vdpasim, sector, num_sectors, + VDPASIM_BLK_DWZ_MAX_SECTORS)) { + status = VIRTIO_BLK_S_IOERR; + break; + } + + if (type == VIRTIO_BLK_T_WRITE_ZEROES) { + memset(vdpasim->buffer + offset, 0, + num_sectors << SECTOR_SHIFT); + } + + break; + } default: - dev_warn(&vdpasim->vdpa.dev, - "Unsupported request type %d\n", type); + dev_dbg(&vdpasim->vdpa.dev, + "Unsupported request type %d\n", type); status = VIRTIO_BLK_S_IOERR; break; } +err_status: /* If some operations fail, we need to skip the remaining bytes * to put the status in the last byte */ @@ -182,21 +271,25 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, /* Last byte is the status */ bytes = vringh_iov_push_iotlb(&vq->vring, 
&vq->in_iov, &status, 1); if (bytes != 1) - return false; + goto err; pushed += bytes; /* Make sure data is wrote before advancing index */ smp_wmb(); + handled = true; + +err: vringh_complete_iotlb(&vq->vring, vq->head, pushed); - return true; + return handled; } static void vdpasim_blk_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + bool reschedule = false; int i; spin_lock(&vdpasim->lock); @@ -204,8 +297,12 @@ static void vdpasim_blk_work(struct work_struct *work) if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; + if (!vdpasim->running) + goto out; + for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; + int reqs = 0; if (!vq->ready) continue; @@ -218,10 +315,18 @@ static void vdpasim_blk_work(struct work_struct *work) if (vringh_need_notify_iotlb(&vq->vring) > 0) vringh_notify(&vq->vring); local_bh_enable(); + + if (++reqs > 4) { + reschedule = true; + break; + } } } out: spin_unlock(&vdpasim->lock); + + if (reschedule) + schedule_work(&vdpasim->work); } static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) @@ -237,6 +342,17 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, 1); blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, 1); blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); + /* VIRTIO_BLK_F_DISCARD */ + blk_config->discard_sector_alignment = + cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); + blk_config->max_discard_sectors = + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); + blk_config->max_discard_seg = cpu_to_vdpasim32(vdpasim, 1); + /* VIRTIO_BLK_F_WRITE_ZEROES */ + blk_config->max_write_zeroes_sectors = + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); + blk_config->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, 1); + } static void vdpasim_blk_mgmtdev_release(struct device *dev) @@ -260,6 +376,8 @@ static int 
vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.id = VIRTIO_ID_BLOCK; dev_attr.supported_features = VDPASIM_BLK_FEATURES; dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; + dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; + dev_attr.nas = VDPASIM_BLK_AS_NUM; dev_attr.config_size = sizeof(struct virtio_blk_config); dev_attr.get_config = vdpasim_blk_get_config; dev_attr.work_fn = vdpasim_blk_work; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 5125976a4df8..886449e88502 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -154,6 +154,9 @@ static void vdpasim_net_work(struct work_struct *work) spin_lock(&vdpasim->lock); + if (!vdpasim->running) + goto out; + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 6daa3978d290..e682bc7ee6c9 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -138,18 +138,17 @@ static void do_bounce(phys_addr_t orig, void *addr, size_t size, { unsigned long pfn = PFN_DOWN(orig); unsigned int offset = offset_in_page(orig); - char *buffer; + struct page *page; unsigned int sz = 0; while (size) { sz = min_t(size_t, PAGE_SIZE - offset, size); - buffer = kmap_atomic(pfn_to_page(pfn)); + page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) - memcpy(addr, buffer + offset, sz); + memcpy_from_page(addr, page, offset, sz); else - memcpy(buffer + offset, addr, sz); - kunmap_atomic(buffer); + memcpy_to_page(page, offset, addr, sz); size -= sz; pfn++; @@ -179,8 +178,9 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain, map->orig_phys == INVALID_PHYS_ADDR)) return; - addr = page_address(map->bounce_page) + offset; - do_bounce(map->orig_phys + offset, addr, sz, dir); + addr = kmap_local_page(map->bounce_page); + do_bounce(map->orig_phys + offset, addr + offset, sz, dir); + kunmap_local(addr); size 
-= sz; iova += sz; } @@ -213,21 +213,21 @@ vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova) struct vduse_bounce_map *map; struct page *page = NULL; - spin_lock(&domain->iotlb_lock); + read_lock(&domain->bounce_lock); map = &domain->bounce_maps[iova >> PAGE_SHIFT]; - if (!map->bounce_page) + if (domain->user_bounce_pages || !map->bounce_page) goto out; page = map->bounce_page; get_page(page); out: - spin_unlock(&domain->iotlb_lock); + read_unlock(&domain->bounce_lock); return page; } static void -vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain) +vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) { struct vduse_bounce_map *map; unsigned long pfn, bounce_pfns; @@ -247,6 +247,73 @@ vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain) } } +int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, + struct page **pages, int count) +{ + struct vduse_bounce_map *map; + int i, ret; + + /* Now we don't support partial mapping */ + if (count != (domain->bounce_size >> PAGE_SHIFT)) + return -EINVAL; + + write_lock(&domain->bounce_lock); + ret = -EEXIST; + if (domain->user_bounce_pages) + goto out; + + for (i = 0; i < count; i++) { + map = &domain->bounce_maps[i]; + if (map->bounce_page) { + /* Copy kernel page to user page if it's in use */ + if (map->orig_phys != INVALID_PHYS_ADDR) + memcpy_to_page(pages[i], 0, + page_address(map->bounce_page), + PAGE_SIZE); + __free_page(map->bounce_page); + } + map->bounce_page = pages[i]; + get_page(pages[i]); + } + domain->user_bounce_pages = true; + ret = 0; +out: + write_unlock(&domain->bounce_lock); + + return ret; +} + +void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) +{ + struct vduse_bounce_map *map; + unsigned long i, count; + + write_lock(&domain->bounce_lock); + if (!domain->user_bounce_pages) + goto out; + + count = domain->bounce_size >> PAGE_SHIFT; + for (i = 0; i < count; i++) { + struct page *page = NULL; + + 
map = &domain->bounce_maps[i]; + if (WARN_ON(!map->bounce_page)) + continue; + + /* Copy user page to kernel page if it's in use */ + if (map->orig_phys != INVALID_PHYS_ADDR) { + page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL); + memcpy_from_page(page_address(page), + map->bounce_page, 0, PAGE_SIZE); + } + put_page(map->bounce_page); + map->bounce_page = page; + } + domain->user_bounce_pages = false; +out: + write_unlock(&domain->bounce_lock); +} + void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain) { if (!domain->bounce_map) @@ -322,13 +389,18 @@ dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, if (vduse_domain_init_bounce_map(domain)) goto err; + read_lock(&domain->bounce_lock); if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) - goto err; + goto err_unlock; if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE); + read_unlock(&domain->bounce_lock); + return iova; +err_unlock: + read_unlock(&domain->bounce_lock); err: vduse_domain_free_iova(iovad, iova, size); return DMA_MAPPING_ERROR; @@ -340,10 +412,12 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, { struct iova_domain *iovad = &domain->stream_iovad; + read_lock(&domain->bounce_lock); if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size); + read_unlock(&domain->bounce_lock); vduse_domain_free_iova(iovad, dma_addr, size); } @@ -451,7 +525,8 @@ static int vduse_domain_release(struct inode *inode, struct file *file) spin_lock(&domain->iotlb_lock); vduse_iotlb_del_range(domain, 0, ULLONG_MAX); - vduse_domain_free_bounce_pages(domain); + vduse_domain_remove_user_bounce_pages(domain); + vduse_domain_free_kernel_bounce_pages(domain); spin_unlock(&domain->iotlb_lock); put_iova_domain(&domain->stream_iovad); put_iova_domain(&domain->consistent_iovad); @@ -511,6 +586,7 @@ 
vduse_domain_create(unsigned long iova_limit, size_t bounce_size) goto err_file; domain->file = file; + rwlock_init(&domain->bounce_lock); spin_lock_init(&domain->iotlb_lock); init_iova_domain(&domain->stream_iovad, PAGE_SIZE, IOVA_START_PFN); diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 2722d9b8e21a..4e0e50e7ac15 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -14,6 +14,7 @@ #include <linux/iova.h> #include <linux/dma-mapping.h> #include <linux/vhost_iotlb.h> +#include <linux/rwlock.h> #define IOVA_START_PFN 1 @@ -34,6 +35,8 @@ struct vduse_iova_domain { struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; struct file *file; + bool user_bounce_pages; + rwlock_t bounce_lock; }; int vduse_domain_set_map(struct vduse_iova_domain *domain, @@ -61,6 +64,11 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); +int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, + struct page **pages, int count); + +void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain); + void vduse_domain_destroy(struct vduse_iova_domain *domain); struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 3bc27de58f46..41c0b29739f1 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -21,6 +21,8 @@ #include <linux/uio.h> #include <linux/vdpa.h> #include <linux/nospec.h> +#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include <uapi/linux/vduse.h> #include <uapi/linux/vdpa.h> #include <uapi/linux/virtio_config.h> @@ -64,6 +66,13 @@ struct vduse_vdpa { struct vduse_dev *dev; }; +struct vduse_umem { + unsigned long iova; + unsigned long npages; + struct page **pages; + struct mm_struct *mm; +}; + struct vduse_dev { struct 
vduse_vdpa *vdev; struct device *dev; @@ -95,6 +104,8 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; + struct vduse_umem *umem; + struct mutex mem_lock; }; struct vduse_dev_msg { @@ -917,6 +928,102 @@ unlock: return ret; } +static int vduse_dev_dereg_umem(struct vduse_dev *dev, + u64 iova, u64 size) +{ + int ret; + + mutex_lock(&dev->mem_lock); + ret = -ENOENT; + if (!dev->umem) + goto unlock; + + ret = -EINVAL; + if (dev->umem->iova != iova || size != dev->domain->bounce_size) + goto unlock; + + vduse_domain_remove_user_bounce_pages(dev->domain); + unpin_user_pages_dirty_lock(dev->umem->pages, + dev->umem->npages, true); + atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); + mmdrop(dev->umem->mm); + vfree(dev->umem->pages); + kfree(dev->umem); + dev->umem = NULL; + ret = 0; +unlock: + mutex_unlock(&dev->mem_lock); + return ret; +} + +static int vduse_dev_reg_umem(struct vduse_dev *dev, + u64 iova, u64 uaddr, u64 size) +{ + struct page **page_list = NULL; + struct vduse_umem *umem = NULL; + long pinned = 0; + unsigned long npages, lock_limit; + int ret; + + if (!dev->domain->bounce_map || + size != dev->domain->bounce_size || + iova != 0 || uaddr & ~PAGE_MASK) + return -EINVAL; + + mutex_lock(&dev->mem_lock); + ret = -EEXIST; + if (dev->umem) + goto unlock; + + ret = -ENOMEM; + npages = size >> PAGE_SHIFT; + page_list = __vmalloc(array_size(npages, sizeof(struct page *)), + GFP_KERNEL_ACCOUNT); + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!page_list || !umem) + goto unlock; + + mmap_read_lock(current->mm); + + lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK)); + if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit) + goto out; + + pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, + page_list, NULL); + if (pinned != npages) { + ret = pinned < 0 ? 
pinned : -ENOMEM; + goto out; + } + + ret = vduse_domain_add_user_bounce_pages(dev->domain, + page_list, pinned); + if (ret) + goto out; + + atomic64_add(npages, &current->mm->pinned_vm); + + umem->pages = page_list; + umem->npages = pinned; + umem->iova = iova; + umem->mm = current->mm; + mmgrab(current->mm); + + dev->umem = umem; +out: + if (ret && pinned > 0) + unpin_user_pages(page_list, pinned); + + mmap_read_unlock(current->mm); +unlock: + if (ret) { + vfree(page_list); + kfree(umem); + } + mutex_unlock(&dev->mem_lock); + return ret; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1089,6 +1196,77 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); break; } + case VDUSE_IOTLB_REG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_reg_umem(dev, umem.iova, + umem.uaddr, umem.size); + break; + } + case VDUSE_IOTLB_DEREG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_dereg_umem(dev, umem.iova, + umem.size); + break; + } + case VDUSE_IOTLB_GET_INFO: { + struct vduse_iova_info info; + struct vhost_iotlb_map *map; + struct vduse_iova_domain *domain = dev->domain; + + ret = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info))) + break; + + ret = -EINVAL; + if (info.start > info.last) + break; + + if (!is_mem_zero((const char *)info.reserved, + sizeof(info.reserved))) + break; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, + info.start, info.last); + if (map) { + info.start = map->start; + info.last = map->last; + info.capability = 0; + if 
(domain->bounce_map && map->start == 0 && + map->last == domain->bounce_size - 1) + info.capability |= VDUSE_IOVA_CAP_UMEM; + } + spin_unlock(&domain->iotlb_lock); + if (!map) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &info, sizeof(info))) + break; + + ret = 0; + break; + } default: ret = -ENOIOCTLCMD; break; @@ -1101,6 +1279,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file) { struct vduse_dev *dev = file->private_data; + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ list_splice_init(&dev->recv_list, &dev->send_list); @@ -1163,6 +1342,7 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); + mutex_init(&dev->mem_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); INIT_LIST_HEAD(&dev->recv_list); |