summaryrefslogtreecommitdiff
path: root/drivers/vhost
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vhost')
-rw-r--r--drivers/vhost/Kconfig45
-rw-r--r--drivers/vhost/Kconfig.vringh6
-rw-r--r--drivers/vhost/Makefile6
-rw-r--r--drivers/vhost/iotlb.c177
-rw-r--r--drivers/vhost/net.c5
-rw-r--r--drivers/vhost/scsi.c2
-rw-r--r--drivers/vhost/vdpa.c883
-rw-r--r--drivers/vhost/vhost.c233
-rw-r--r--drivers/vhost/vhost.h45
-rw-r--r--drivers/vhost/vringh.c421
-rw-r--r--drivers/vhost/vsock.c2
11 files changed, 1603 insertions, 222 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 3d03ccbd1adc..362b832f5338 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,4 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
+config VHOST_IOTLB
+ tristate
+ help
+ Generic IOTLB implementation for vhost and vringh.
+
+config VHOST_RING
+ tristate
+ select VHOST_IOTLB
+ help
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
+
+config VHOST
+ tristate
+ select VHOST_IOTLB
+ help
+ This option is selected by any driver which needs to access
+ the core of vhost.
+
+menuconfig VHOST_MENU
+ bool "VHOST drivers"
+ default y
+
+if VHOST_MENU
+
config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP)
@@ -23,8 +48,8 @@ config VHOST_SCSI
config VHOST_VSOCK
tristate "vhost virtio-vsock driver"
depends on VSOCKETS && EVENTFD
- select VIRTIO_VSOCKETS_COMMON
select VHOST
+ select VIRTIO_VSOCKETS_COMMON
default n
---help---
This kernel module can be loaded in the host kernel to provide AF_VSOCK
@@ -34,11 +59,17 @@ config VHOST_VSOCK
To compile this driver as a module, choose M here: the module will be called
vhost_vsock.
-config VHOST
- tristate
- ---help---
- This option is selected by any driver which needs to access
- the core of vhost.
+config VHOST_VDPA
+ tristate "Vhost driver for vDPA-based backend"
+ depends on EVENTFD
+ select VHOST
+ select VDPA
+ help
+ This kernel module can be loaded in host kernel to accelerate
+ guest virtio devices with the vDPA-based backends.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vhost_vdpa.
config VHOST_CROSS_ENDIAN_LEGACY
bool "Cross-endian support for vhost"
@@ -54,3 +85,5 @@ config VHOST_CROSS_ENDIAN_LEGACY
adds some overhead, it is disabled by default.
If unsure, say "N".
+
+endif
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh
deleted file mode 100644
index c1fe36a9b8d4..000000000000
--- a/drivers/vhost/Kconfig.vringh
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config VHOST_RING
- tristate
- ---help---
- This option is selected by any driver which needs to access
- the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..f3e1897cce85 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -10,4 +10,10 @@ vhost_vsock-y := vsock.o
obj-$(CONFIG_VHOST_RING) += vringh.o
+obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
+vhost_vdpa-y := vdpa.o
+
obj-$(CONFIG_VHOST) += vhost.o
+
+obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
+vhost_iotlb-y := iotlb.o
diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c
new file mode 100644
index 000000000000..1f0ca6e44410
--- /dev/null
+++ b/drivers/vhost/iotlb.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2020 Red Hat, Inc.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ * IOTLB implementation for vhost.
+ */
+#include <linux/slab.h>
+#include <linux/vhost_iotlb.h>
+#include <linux/module.h>
+
+#define MOD_VERSION "0.1"
+#define MOD_DESC "VHOST IOTLB"
+#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define MOD_LICENSE "GPL v2"
+
+#define START(map) ((map)->start)
+#define LAST(map) ((map)->last)
+
+INTERVAL_TREE_DEFINE(struct vhost_iotlb_map,
+ rb, __u64, __subtree_last,
+ START, LAST, static inline, vhost_iotlb_itree);
+
+/**
+ * vhost_iotlb_map_free - remove a map node and free it
+ * @iotlb: the IOTLB
+ * @map: the map that want to be remove and freed
+ */
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_map *map)
+{
+ vhost_iotlb_itree_remove(map, &iotlb->root);
+ list_del(&map->link);
+ kfree(map);
+ iotlb->nmaps--;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_map_free);
+
+/**
+ * vhost_iotlb_add_range - add a new range to vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ * @addr: the address that is mapped to @start
+ * @perm: access permission of this range
+ *
+ * Returns an error last is smaller than start or memory allocation
+ * fails
+ */
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
+ u64 start, u64 last,
+ u64 addr, unsigned int perm)
+{
+ struct vhost_iotlb_map *map;
+
+ if (last < start)
+ return -EFAULT;
+
+ if (iotlb->limit &&
+ iotlb->nmaps == iotlb->limit &&
+ iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) {
+ map = list_first_entry(&iotlb->list, typeof(*map), link);
+ vhost_iotlb_map_free(iotlb, map);
+ }
+
+ map = kmalloc(sizeof(*map), GFP_ATOMIC);
+ if (!map)
+ return -ENOMEM;
+
+ map->start = start;
+ map->size = last - start + 1;
+ map->last = last;
+ map->addr = addr;
+ map->perm = perm;
+
+ iotlb->nmaps++;
+ vhost_iotlb_itree_insert(map, &iotlb->root);
+
+ INIT_LIST_HEAD(&map->link);
+ list_add_tail(&map->link, &iotlb->list);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_add_range);
+
+/**
+ * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ */
+void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+ struct vhost_iotlb_map *map;
+
+ while ((map = vhost_iotlb_itree_iter_first(&iotlb->root,
+ start, last)))
+ vhost_iotlb_map_free(iotlb, map);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_del_range);
+
+/**
+ * vhost_iotlb_alloc - add a new vhost IOTLB
+ * @limit: maximum number of IOTLB entries
+ * @flags: VHOST_IOTLB_FLAG_XXX
+ *
+ * Returns an error is memory allocation fails
+ */
+struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags)
+{
+ struct vhost_iotlb *iotlb = kzalloc(sizeof(*iotlb), GFP_KERNEL);
+
+ if (!iotlb)
+ return NULL;
+
+ iotlb->root = RB_ROOT_CACHED;
+ iotlb->limit = limit;
+ iotlb->nmaps = 0;
+ iotlb->flags = flags;
+ INIT_LIST_HEAD(&iotlb->list);
+
+ return iotlb;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_alloc);
+
+/**
+ * vhost_iotlb_reset - reset vhost IOTLB (free all IOTLB entries)
+ * @iotlb: the IOTLB to be reset
+ */
+void vhost_iotlb_reset(struct vhost_iotlb *iotlb)
+{
+ vhost_iotlb_del_range(iotlb, 0ULL, 0ULL - 1);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_reset);
+
+/**
+ * vhost_iotlb_free - reset and free vhost IOTLB
+ * @iotlb: the IOTLB to be freed
+ */
+void vhost_iotlb_free(struct vhost_iotlb *iotlb)
+{
+ if (iotlb) {
+ vhost_iotlb_reset(iotlb);
+ kfree(iotlb);
+ }
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_free);
+
+/**
+ * vhost_iotlb_itree_first - return the first overlapped range
+ * @iotlb: the IOTLB
+ * @start: start of IOVA range
+ * @end: end of IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+ return vhost_iotlb_itree_iter_first(&iotlb->root, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_first);
+
+/**
+ * vhost_iotlb_itree_first - return the next overlapped range
+ * @iotlb: the IOTLB
+ * @start: start of IOVA range
+ * @end: end of IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last)
+{
+ return vhost_iotlb_itree_iter_next(map, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_next);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_DESCRIPTION(MOD_DESC);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_LICENSE(MOD_LICENSE);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 18e205eeb9af..87469d67ede8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1324,7 +1324,8 @@ static int vhost_net_open(struct inode *inode, struct file *f)
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
UIO_MAXIOV + VHOST_NET_BATCH,
- VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT,
+ NULL);
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
@@ -1586,7 +1587,7 @@ static long vhost_net_reset_owner(struct vhost_net *n)
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
- struct vhost_umem *umem;
+ struct vhost_iotlb *umem;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 0b949a14bce3..7653667a8cdc 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1628,7 +1628,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
}
vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
- VHOST_SCSI_WEIGHT, 0);
+ VHOST_SCSI_WEIGHT, 0, NULL);
vhost_scsi_init_inflight(vs, NULL);
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
new file mode 100644
index 000000000000..421f02a8530a
--- /dev/null
+++ b/drivers/vhost/vdpa.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2020 Intel Corporation.
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Author: Tiwei Bie <tiwei.bie@intel.com>
+ * Jason Wang <jasowang@redhat.com>
+ *
+ * Thanks Michael S. Tsirkin for the valuable comments and
+ * suggestions. And thanks to Cunming Liang and Zhihong Wang for all
+ * their supports.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/uuid.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+
+#include "vhost.h"
+
+enum {
+ VHOST_VDPA_FEATURES =
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+ (1ULL << VIRTIO_F_ANY_LAYOUT) |
+ (1ULL << VIRTIO_F_VERSION_1) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_F_RING_PACKED) |
+ (1ULL << VIRTIO_F_ORDER_PLATFORM) |
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX),
+
+ VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
+ (1ULL << VIRTIO_NET_F_CSUM) |
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
+ (1ULL << VIRTIO_NET_F_MTU) |
+ (1ULL << VIRTIO_NET_F_MAC) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
+ (1ULL << VIRTIO_NET_F_GUEST_ECN) |
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+ (1ULL << VIRTIO_NET_F_HOST_ECN) |
+ (1ULL << VIRTIO_NET_F_HOST_UFO) |
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_NET_F_STATUS) |
+ (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
+};
+
+/* Currently, only network backend w/o multiqueue is supported. */
+#define VHOST_VDPA_VQ_MAX 2
+
+#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)
+
+struct vhost_vdpa {
+ struct vhost_dev vdev;
+ struct iommu_domain *domain;
+ struct vhost_virtqueue *vqs;
+ struct completion completion;
+ struct vdpa_device *vdpa;
+ struct device dev;
+ struct cdev cdev;
+ atomic_t opened;
+ int nvqs;
+ int virtio_id;
+ int minor;
+};
+
+static DEFINE_IDA(vhost_vdpa_ida);
+
+static dev_t vhost_vdpa_major;
+
+static const u64 vhost_vdpa_features[] = {
+ [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
+};
+
+static void handle_vq_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
+ const struct vdpa_config_ops *ops = v->vdpa->config;
+
+ ops->kick_vq(v->vdpa, vq - v->vqs);
+}
+
+static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
+{
+ struct vhost_virtqueue *vq = private;
+ struct eventfd_ctx *call_ctx = vq->call_ctx;
+
+ if (call_ctx)
+ eventfd_signal(call_ctx, 1);
+
+ return IRQ_HANDLED;
+}
+
+static void vhost_vdpa_reset(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->set_status(vdpa, 0);
+}
+
+static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u32 device_id;
+
+ device_id = ops->get_device_id(vdpa);
+
+ if (copy_to_user(argp, &device_id, sizeof(device_id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ status = ops->get_status(vdpa);
+
+ if (copy_to_user(statusp, &status, sizeof(status)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ if (copy_from_user(&status, statusp, sizeof(status)))
+ return -EFAULT;
+
+ /*
+ * Userspace shouldn't remove status bits unless reset the
+ * status to 0.
+ */
+ if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
+ return -EINVAL;
+
+ ops->set_status(vdpa, status);
+
+ return 0;
+}
+
+static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
+ struct vhost_vdpa_config *c)
+{
+ long size = 0;
+
+ switch (v->virtio_id) {
+ case VIRTIO_ID_NET:
+ size = sizeof(struct virtio_net_config);
+ break;
+ }
+
+ if (c->len == 0)
+ return -EINVAL;
+
+ if (c->len > size - c->off)
+ return -E2BIG;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ ops->get_config(vdpa, config.off, buf, config.len);
+
+ if (copy_to_user(c->buf, buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_set_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, c->buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ ops->set_config(vdpa, config.off, buf, config.len);
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ features = ops->get_features(vdpa);
+ features &= vhost_vdpa_features[v->virtio_id];
+
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ /*
+ * It's not allowed to change the features after they have
+ * been negotiated.
+ */
+ if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
+ return -EBUSY;
+
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+
+ if (features & ~vhost_vdpa_features[v->virtio_id])
+ return -EINVAL;
+
+ if (ops->set_features(vdpa, features))
+ return -EINVAL;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u16 num;
+
+ num = ops->get_vq_num_max(vdpa);
+
+ if (copy_to_user(argp, &num, sizeof(num)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
+ void __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vdpa_callback cb;
+ struct vhost_virtqueue *vq;
+ struct vhost_vring_state s;
+ u8 status;
+ u32 idx;
+ long r;
+
+ r = get_user(idx, (u32 __user *)argp);
+ if (r < 0)
+ return r;
+
+ if (idx >= v->nvqs)
+ return -ENOBUFS;
+
+ idx = array_index_nospec(idx, v->nvqs);
+ vq = &v->vqs[idx];
+
+ status = ops->get_status(vdpa);
+
+ if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
+ if (copy_from_user(&s, argp, sizeof(s)))
+ return -EFAULT;
+ ops->set_vq_ready(vdpa, idx, s.num);
+ return 0;
+ }
+
+ if (cmd == VHOST_GET_VRING_BASE)
+ vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
+
+ r = vhost_vring_ioctl(&v->vdev, cmd, argp);
+ if (r)
+ return r;
+
+ switch (cmd) {
+ case VHOST_SET_VRING_ADDR:
+ if (ops->set_vq_address(vdpa, idx,
+ (u64)(uintptr_t)vq->desc,
+ (u64)(uintptr_t)vq->avail,
+ (u64)(uintptr_t)vq->used))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_BASE:
+ if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_CALL:
+ if (vq->call_ctx) {
+ cb.callback = vhost_vdpa_virtqueue_cb;
+ cb.private = vq;
+ } else {
+ cb.callback = NULL;
+ cb.private = NULL;
+ }
+ ops->set_vq_cb(vdpa, idx, &cb);
+ break;
+
+ case VHOST_SET_VRING_NUM:
+ ops->set_vq_num(vdpa, idx, vq->num);
+ break;
+ }
+
+ return r;
+}
+
+static long vhost_vdpa_unlocked_ioctl(struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ struct vhost_vdpa *v = filep->private_data;
+ struct vhost_dev *d = &v->vdev;
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ mutex_lock(&d->mutex);
+
+ switch (cmd) {
+ case VHOST_VDPA_GET_DEVICE_ID:
+ r = vhost_vdpa_get_device_id(v, argp);
+ break;
+ case VHOST_VDPA_GET_STATUS:
+ r = vhost_vdpa_get_status(v, argp);
+ break;
+ case VHOST_VDPA_SET_STATUS:
+ r = vhost_vdpa_set_status(v, argp);
+ break;
+ case VHOST_VDPA_GET_CONFIG:
+ r = vhost_vdpa_get_config(v, argp);
+ break;
+ case VHOST_VDPA_SET_CONFIG:
+ r = vhost_vdpa_set_config(v, argp);
+ break;
+ case VHOST_GET_FEATURES:
+ r = vhost_vdpa_get_features(v, argp);
+ break;
+ case VHOST_SET_FEATURES:
+ r = vhost_vdpa_set_features(v, argp);
+ break;
+ case VHOST_VDPA_GET_VRING_NUM:
+ r = vhost_vdpa_get_vring_num(v, argp);
+ break;
+ case VHOST_SET_LOG_BASE:
+ case VHOST_SET_LOG_FD:
+ r = -ENOIOCTLCMD;
+ break;
+ default:
+ r = vhost_dev_ioctl(&v->vdev, cmd, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vdpa_vring_ioctl(v, cmd, argp);
+ break;
+ }
+
+ mutex_unlock(&d->mutex);
+ return r;
+}
+
+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vhost_iotlb *iotlb = dev->iotlb;
+ struct vhost_iotlb_map *map;
+ struct page *page;
+ unsigned long pfn, pinned;
+
+ while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
+ pinned = map->size >> PAGE_SHIFT;
+ for (pfn = map->addr >> PAGE_SHIFT;
+ pinned > 0; pfn++, pinned--) {
+ page = pfn_to_page(pfn);
+ if (map->perm & VHOST_ACCESS_WO)
+ set_page_dirty_lock(page);
+ unpin_user_page(page);
+ }
+ atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+ vhost_iotlb_map_free(iotlb, map);
+ }
+}
+
+static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
+{
+ struct vhost_dev *dev = &v->vdev;
+
+ vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);
+ kfree(dev->iotlb);
+ dev->iotlb = NULL;
+}
+
+static int perm_to_iommu_flags(u32 perm)
+{
+ int flags = 0;
+
+ switch (perm) {
+ case VHOST_ACCESS_WO:
+ flags |= IOMMU_WRITE;
+ break;
+ case VHOST_ACCESS_RO:
+ flags |= IOMMU_READ;
+ break;
+ case VHOST_ACCESS_RW:
+ flags |= (IOMMU_WRITE | IOMMU_READ);
+ break;
+ default:
+ WARN(1, "invalidate vhost IOTLB permission\n");
+ break;
+ }
+
+ return flags | IOMMU_CACHE;
+}
+
+static int vhost_vdpa_map(struct vhost_vdpa *v,
+ u64 iova, u64 size, u64 pa, u32 perm)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ int r = 0;
+
+ r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
+ pa, perm);
+ if (r)
+ return r;
+
+ if (ops->dma_map)
+ r = ops->dma_map(vdpa, iova, size, pa, perm);
+ else if (ops->set_map)
+ r = ops->set_map(vdpa, dev->iotlb);
+ else
+ r = iommu_map(v->domain, iova, pa, size,
+ perm_to_iommu_flags(perm));
+
+ return r;
+}
+
+static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);
+
+ if (ops->dma_map)
+ ops->dma_unmap(vdpa, iova, size);
+ else if (ops->set_map)
+ ops->set_map(vdpa, dev->iotlb);
+ else
+ iommu_unmap(v->domain, iova, size);
+}
+
+static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vhost_iotlb *iotlb = dev->iotlb;
+ struct page **page_list;
+ unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+ unsigned int gup_flags = FOLL_LONGTERM;
+ unsigned long npages, cur_base, map_pfn, last_pfn = 0;
+ unsigned long locked, lock_limit, pinned, i;
+ u64 iova = msg->iova;
+ int ret = 0;
+
+ if (vhost_iotlb_itree_first(iotlb, msg->iova,
+ msg->iova + msg->size - 1))
+ return -EEXIST;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list)
+ return -ENOMEM;
+
+ if (msg->perm & VHOST_ACCESS_WO)
+ gup_flags |= FOLL_WRITE;
+
+ npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
+ if (!npages)
+ return -EINVAL;
+
+ down_read(&dev->mm->mmap_sem);
+
+ locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (locked > lock_limit) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cur_base = msg->uaddr & PAGE_MASK;
+ iova &= PAGE_MASK;
+
+ while (npages) {
+ pinned = min_t(unsigned long, npages, list_size);
+ ret = pin_user_pages(cur_base, pinned,
+ gup_flags, page_list, NULL);
+ if (ret != pinned)
+ goto out;
+
+ if (!last_pfn)
+ map_pfn = page_to_pfn(page_list[0]);
+
+ for (i = 0; i < ret; i++) {
+ unsigned long this_pfn = page_to_pfn(page_list[i]);
+ u64 csize;
+
+ if (last_pfn && (this_pfn != last_pfn + 1)) {
+ /* Pin a contiguous chunk of memory */
+ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
+ if (vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm))
+ goto out;
+ map_pfn = this_pfn;
+ iova += csize;
+ }
+
+ last_pfn = this_pfn;
+ }
+
+ cur_base += ret << PAGE_SHIFT;
+ npages -= ret;
+ }
+
+ /* Pin the rest chunk */
+ ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
+ map_pfn << PAGE_SHIFT, msg->perm);
+out:
+ if (ret) {
+ vhost_vdpa_unmap(v, msg->iova, msg->size);
+ atomic64_sub(npages, &dev->mm->pinned_vm);
+ }
+ up_read(&dev->mm->mmap_sem);
+ free_page((unsigned long)page_list);
+ return ret;
+}
+
+static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
+ int r = 0;
+
+ r = vhost_dev_check_owner(dev);
+ if (r)
+ return r;
+
+ switch (msg->type) {
+ case VHOST_IOTLB_UPDATE:
+ r = vhost_vdpa_process_iotlb_update(v, msg);
+ break;
+ case VHOST_IOTLB_INVALIDATE:
+ vhost_vdpa_unmap(v, msg->iova, msg->size);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct vhost_vdpa *v = file->private_data;
+ struct vhost_dev *dev = &v->vdev;
+
+ return vhost_chr_write_iter(dev, from);
+}
+
+static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+ struct bus_type *bus;
+ int ret;
+
+ /* Device want to do DMA by itself */
+ if (ops->set_map || ops->dma_map)
+ return 0;
+
+ bus = dma_dev->bus;
+ if (!bus)
+ return -EFAULT;
+
+ if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
+ return -ENOTSUPP;
+
+ v->domain = iommu_domain_alloc(bus);
+ if (!v->domain)
+ return -EIO;
+
+ ret = iommu_attach_device(v->domain, dma_dev);
+ if (ret)
+ goto err_attach;
+
+ return 0;
+
+err_attach:
+ iommu_domain_free(v->domain);
+ return ret;
+}
+
+static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+
+ if (v->domain) {
+ iommu_detach_device(v->domain, dma_dev);
+ iommu_domain_free(v->domain);
+ }
+
+ v->domain = NULL;
+}
+
+static int vhost_vdpa_open(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v;
+ struct vhost_dev *dev;
+ struct vhost_virtqueue **vqs;
+ int nvqs, i, r, opened;
+
+ v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);
+ if (!v)
+ return -ENODEV;
+
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (opened)
+ return -EBUSY;
+
+ nvqs = v->nvqs;
+ vhost_vdpa_reset(v);
+
+ vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ dev = &v->vdev;
+ for (i = 0; i < nvqs; i++) {
+ vqs[i] = &v->vqs[i];
+ vqs[i]->handle_kick = handle_vq_kick;
+ }
+ vhost_dev_init(dev, vqs, nvqs, 0, 0, 0,
+ vhost_vdpa_process_iotlb_msg);
+
+ dev->iotlb = vhost_iotlb_alloc(0, 0);
+ if (!dev->iotlb) {
+ r = -ENOMEM;
+ goto err_init_iotlb;
+ }
+
+ r = vhost_vdpa_alloc_domain(v);
+ if (r)
+ goto err_init_iotlb;
+
+ filep->private_data = v;
+
+ return 0;
+
+err_init_iotlb:
+ vhost_dev_cleanup(&v->vdev);
+err:
+ atomic_dec(&v->opened);
+ return r;
+}
+
+static int vhost_vdpa_release(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v = filep->private_data;
+ struct vhost_dev *d = &v->vdev;
+
+ mutex_lock(&d->mutex);
+ filep->private_data = NULL;
+ vhost_vdpa_reset(v);
+ vhost_dev_stop(&v->vdev);
+ vhost_vdpa_iotlb_free(v);
+ vhost_vdpa_free_domain(v);
+ vhost_dev_cleanup(&v->vdev);
+ kfree(v->vdev.vqs);
+ mutex_unlock(&d->mutex);
+
+ atomic_dec(&v->opened);
+ complete(&v->completion);
+
+ return 0;
+}
+
+static const struct file_operations vhost_vdpa_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_vdpa_open,
+ .release = vhost_vdpa_release,
+ .write_iter = vhost_vdpa_chr_write_iter,
+ .unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static void vhost_vdpa_release_dev(struct device *device)
+{
+ struct vhost_vdpa *v =
+ container_of(device, struct vhost_vdpa, dev);
+
+ ida_simple_remove(&vhost_vdpa_ida, v->minor);
+ kfree(v->vqs);
+ kfree(v);
+}
+
+static int vhost_vdpa_probe(struct vdpa_device *vdpa)
+{
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa *v;
+ int minor, nvqs = VHOST_VDPA_VQ_MAX;
+ int r;
+
+ /* Currently, we only accept the network devices. */
+ if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
+ return -ENOTSUPP;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!v)
+ return -ENOMEM;
+
+ minor = ida_simple_get(&vhost_vdpa_ida, 0,
+ VHOST_VDPA_DEV_MAX, GFP_KERNEL);
+ if (minor < 0) {
+ kfree(v);
+ return minor;
+ }
+
+ atomic_set(&v->opened, 0);
+ v->minor = minor;
+ v->vdpa = vdpa;
+ v->nvqs = nvqs;
+ v->virtio_id = ops->get_device_id(vdpa);
+
+ device_initialize(&v->dev);
+ v->dev.release = vhost_vdpa_release_dev;
+ v->dev.parent = &vdpa->dev;
+ v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
+ v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
+ GFP_KERNEL);
+ if (!v->vqs) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
+ if (r)
+ goto err;
+
+ cdev_init(&v->cdev, &vhost_vdpa_fops);
+ v->cdev.owner = THIS_MODULE;
+
+ r = cdev_device_add(&v->cdev, &v->dev);
+ if (r)
+ goto err;
+
+ init_completion(&v->completion);
+ vdpa_set_drvdata(vdpa, v);
+
+ return 0;
+
+err:
+ put_device(&v->dev);
+ return r;
+}
+
+static void vhost_vdpa_remove(struct vdpa_device *vdpa)
+{
+ struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
+ int opened;
+
+ cdev_device_del(&v->cdev, &v->dev);
+
+ do {
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (!opened)
+ break;
+ wait_for_completion(&v->completion);
+ } while (1);
+
+ put_device(&v->dev);
+}
+
+static struct vdpa_driver vhost_vdpa_driver = {
+ .driver = {
+ .name = "vhost_vdpa",
+ },
+ .probe = vhost_vdpa_probe,
+ .remove = vhost_vdpa_remove,
+};
+
+static int __init vhost_vdpa_init(void)
+{
+ int r;
+
+ r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
+ "vhost-vdpa");
+ if (r)
+ goto err_alloc_chrdev;
+
+ r = vdpa_register_driver(&vhost_vdpa_driver);
+ if (r)
+ goto err_vdpa_register_driver;
+
+ return 0;
+
+err_vdpa_register_driver:
+ unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+err_alloc_chrdev:
+ return r;
+}
+module_init(vhost_vdpa_init);
+
+static void __exit vhost_vdpa_exit(void)
+{
+ vdpa_unregister_driver(&vhost_vdpa_driver);
+ unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+}
+module_exit(vhost_vdpa_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f44340b41494..d450e16c5c25 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -50,10 +50,6 @@ enum {
#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
-INTERVAL_TREE_DEFINE(struct vhost_umem_node,
- rb, __u64, __subtree_last,
- START, LAST, static inline, vhost_umem_interval_tree);
-
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
@@ -457,7 +453,9 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs,
- int iov_limit, int weight, int byte_weight)
+ int iov_limit, int weight, int byte_weight,
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg))
{
struct vhost_virtqueue *vq;
int i;
@@ -473,6 +471,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
+ dev->msg_handler = msg_handler;
init_llist_head(&dev->work_list);
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
@@ -581,21 +580,25 @@ err_mm:
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
-struct vhost_umem *vhost_dev_reset_owner_prepare(void)
+static struct vhost_iotlb *iotlb_alloc(void)
+{
+ return vhost_iotlb_alloc(max_iotlb_entries,
+ VHOST_IOTLB_FLAG_RETIRE);
+}
+
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
{
- return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);
+ return iotlb_alloc();
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
/* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
{
int i;
vhost_dev_cleanup(dev);
- /* Restore memory to default empty mapping. */
- INIT_LIST_HEAD(&umem->umem_list);
dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
@@ -618,28 +621,6 @@ void vhost_dev_stop(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);
-static void vhost_umem_free(struct vhost_umem *umem,
- struct vhost_umem_node *node)
-{
- vhost_umem_interval_tree_remove(node, &umem->umem_tree);
- list_del(&node->link);
- kfree(node);
- umem->numem--;
-}
-
-static void vhost_umem_clean(struct vhost_umem *umem)
-{
- struct vhost_umem_node *node, *tmp;
-
- if (!umem)
- return;
-
- list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
- vhost_umem_free(umem, node);
-
- kvfree(umem);
-}
-
static void vhost_clear_msg(struct vhost_dev *dev)
{
struct vhost_msg_node *node, *n;
@@ -677,9 +658,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
eventfd_ctx_put(dev->log_ctx);
dev->log_ctx = NULL;
/* No one will access memory at this point */
- vhost_umem_clean(dev->umem);
+ vhost_iotlb_free(dev->umem);
dev->umem = NULL;
- vhost_umem_clean(dev->iotlb);
+ vhost_iotlb_free(dev->iotlb);
dev->iotlb = NULL;
vhost_clear_msg(dev);
wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
@@ -715,27 +696,26 @@ static bool vhost_overflow(u64 uaddr, u64 size)
}
/* Caller should have vq mutex and device mutex. */
-static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
+static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
int log_all)
{
- struct vhost_umem_node *node;
+ struct vhost_iotlb_map *map;
if (!umem)
return false;
- list_for_each_entry(node, &umem->umem_list, link) {
- unsigned long a = node->userspace_addr;
+ list_for_each_entry(map, &umem->list, link) {
+ unsigned long a = map->addr;
- if (vhost_overflow(node->userspace_addr, node->size))
+ if (vhost_overflow(map->addr, map->size))
return false;
- if (!access_ok((void __user *)a,
- node->size))
+ if (!access_ok((void __user *)a, map->size))
return false;
else if (log_all && !log_access_ok(log_base,
- node->start,
- node->size))
+ map->start,
+ map->size))
return false;
}
return true;
@@ -745,17 +725,17 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
u64 addr, unsigned int size,
int type)
{
- const struct vhost_umem_node *node = vq->meta_iotlb[type];
+ const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
- if (!node)
+ if (!map)
return NULL;
- return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);
+ return (void *)(uintptr_t)(map->addr + addr - map->start);
}
/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
-static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
+static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
int log_all)
{
int i;
@@ -1020,47 +1000,6 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
}
-static int vhost_new_umem_range(struct vhost_umem *umem,
- u64 start, u64 size, u64 end,
- u64 userspace_addr, int perm)
-{
- struct vhost_umem_node *tmp, *node;
-
- if (!size)
- return -EFAULT;
-
- node = kmalloc(sizeof(*node), GFP_ATOMIC);
- if (!node)
- return -ENOMEM;
-
- if (umem->numem == max_iotlb_entries) {
- tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
- vhost_umem_free(umem, tmp);
- }
-
- node->start = start;
- node->size = size;
- node->last = end;
- node->userspace_addr = userspace_addr;
- node->perm = perm;
- INIT_LIST_HEAD(&node->link);
- list_add_tail(&node->link, &umem->umem_list);
- vhost_umem_interval_tree_insert(node, &umem->umem_tree);
- umem->numem++;
-
- return 0;
-}
-
-static void vhost_del_umem_range(struct vhost_umem *umem,
- u64 start, u64 end)
-{
- struct vhost_umem_node *node;
-
- while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- start, end)))
- vhost_umem_free(umem, node);
-}
-
static void vhost_iotlb_notify_vq(struct vhost_dev *d,
struct vhost_iotlb_msg *msg)
{
@@ -1117,9 +1056,9 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
vhost_vq_meta_reset(dev);
- if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
- msg->iova + msg->size - 1,
- msg->uaddr, msg->perm)) {
+ if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
+ msg->iova + msg->size - 1,
+ msg->uaddr, msg->perm)) {
ret = -ENOMEM;
break;
}
@@ -1131,8 +1070,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
vhost_vq_meta_reset(dev);
- vhost_del_umem_range(dev->iotlb, msg->iova,
- msg->iova + msg->size - 1);
+ vhost_iotlb_del_range(dev->iotlb, msg->iova,
+ msg->iova + msg->size - 1);
break;
default:
ret = -EINVAL;
@@ -1178,7 +1117,12 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
ret = -EINVAL;
goto done;
}
- if (vhost_process_iotlb_msg(dev, &msg)) {
+
+ if (dev->msg_handler)
+ ret = dev->msg_handler(dev, &msg);
+ else
+ ret = vhost_process_iotlb_msg(dev, &msg);
+ if (ret) {
ret = -EFAULT;
goto done;
}
@@ -1311,44 +1255,42 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
}
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
- const struct vhost_umem_node *node,
+ const struct vhost_iotlb_map *map,
int type)
{
int access = (type == VHOST_ADDR_USED) ?
VHOST_ACCESS_WO : VHOST_ACCESS_RO;
- if (likely(node->perm & access))
- vq->meta_iotlb[type] = node;
+ if (likely(map->perm & access))
+ vq->meta_iotlb[type] = map;
}
static bool iotlb_access_ok(struct vhost_virtqueue *vq,
int access, u64 addr, u64 len, int type)
{
- const struct vhost_umem_node *node;
- struct vhost_umem *umem = vq->iotlb;
+ const struct vhost_iotlb_map *map;
+ struct vhost_iotlb *umem = vq->iotlb;
u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
if (vhost_vq_meta_fetch(vq, addr, len, type))
return true;
while (len > s) {
- node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- addr,
- last);
- if (node == NULL || node->start > addr) {
+ map = vhost_iotlb_itree_first(umem, addr, last);
+ if (map == NULL || map->start > addr) {
vhost_iotlb_miss(vq, addr, access);
return false;
- } else if (!(node->perm & access)) {
+ } else if (!(map->perm & access)) {
/* Report the possible access violation by
* request another translation from userspace.
*/
return false;
}
- size = node->size - addr + node->start;
+ size = map->size - addr + map->start;
if (orig_addr == addr && size >= len)
- vhost_vq_meta_update(vq, node, type);
+ vhost_vq_meta_update(vq, map, type);
s += size;
addr += size;
@@ -1364,12 +1306,12 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq)
if (!vq->iotlb)
return 1;
- return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+ return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
- iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+ iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
vhost_get_avail_size(vq, num),
VHOST_ADDR_AVAIL) &&
- iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+ iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
vhost_get_used_size(vq, num), VHOST_ADDR_USED);
}
EXPORT_SYMBOL_GPL(vq_meta_prefetch);
@@ -1408,25 +1350,11 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
-static struct vhost_umem *vhost_umem_alloc(void)
-{
- struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);
-
- if (!umem)
- return NULL;
-
- umem->umem_tree = RB_ROOT_CACHED;
- umem->numem = 0;
- INIT_LIST_HEAD(&umem->umem_list);
-
- return umem;
-}
-
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
struct vhost_memory mem, *newmem;
struct vhost_memory_region *region;
- struct vhost_umem *newumem, *oldumem;
+ struct vhost_iotlb *newumem, *oldumem;
unsigned long size = offsetof(struct vhost_memory, regions);
int i;
@@ -1448,7 +1376,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return -EFAULT;
}
- newumem = vhost_umem_alloc();
+ newumem = iotlb_alloc();
if (!newumem) {
kvfree(newmem);
return -ENOMEM;
@@ -1457,13 +1385,12 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
for (region = newmem->regions;
region < newmem->regions + mem.nregions;
region++) {
- if (vhost_new_umem_range(newumem,
- region->guest_phys_addr,
- region->memory_size,
- region->guest_phys_addr +
- region->memory_size - 1,
- region->userspace_addr,
- VHOST_ACCESS_RW))
+ if (vhost_iotlb_add_range(newumem,
+ region->guest_phys_addr,
+ region->guest_phys_addr +
+ region->memory_size - 1,
+ region->userspace_addr,
+ VHOST_MAP_RW))
goto err;
}
@@ -1481,11 +1408,11 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
}
kvfree(newmem);
- vhost_umem_clean(oldumem);
+ vhost_iotlb_free(oldumem);
return 0;
err:
- vhost_umem_clean(newumem);
+ vhost_iotlb_free(newumem);
kvfree(newmem);
return -EFAULT;
}
@@ -1726,10 +1653,10 @@ EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
{
- struct vhost_umem *niotlb, *oiotlb;
+ struct vhost_iotlb *niotlb, *oiotlb;
int i;
- niotlb = vhost_umem_alloc();
+ niotlb = iotlb_alloc();
if (!niotlb)
return -ENOMEM;
@@ -1745,7 +1672,7 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
mutex_unlock(&vq->mutex);
}
- vhost_umem_clean(oiotlb);
+ vhost_iotlb_free(oiotlb);
return 0;
}
@@ -1875,8 +1802,8 @@ static int log_write(void __user *log_base,
static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
{
- struct vhost_umem *umem = vq->umem;
- struct vhost_umem_node *u;
+ struct vhost_iotlb *umem = vq->umem;
+ struct vhost_iotlb_map *u;
u64 start, end, l, min;
int r;
bool hit = false;
@@ -1886,16 +1813,15 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
/* More than one GPAs can be mapped into a single HVA. So
* iterate all possible umems here to be safe.
*/
- list_for_each_entry(u, &umem->umem_list, link) {
- if (u->userspace_addr > hva - 1 + len ||
- u->userspace_addr - 1 + u->size < hva)
+ list_for_each_entry(u, &umem->list, link) {
+ if (u->addr > hva - 1 + len ||
+ u->addr - 1 + u->size < hva)
continue;
- start = max(u->userspace_addr, hva);
- end = min(u->userspace_addr - 1 + u->size,
- hva - 1 + len);
+ start = max(u->addr, hva);
+ end = min(u->addr - 1 + u->size, hva - 1 + len);
l = end - start + 1;
r = log_write(vq->log_base,
- u->start + start - u->userspace_addr,
+ u->start + start - u->addr,
l);
if (r < 0)
return r;
@@ -2046,9 +1972,9 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access);
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
struct iovec iov[], int iov_size, int access)
{
- const struct vhost_umem_node *node;
+ const struct vhost_iotlb_map *map;
struct vhost_dev *dev = vq->dev;
- struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
+ struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
struct iovec *_iov;
u64 s = 0;
int ret = 0;
@@ -2060,25 +1986,24 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
break;
}
- node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- addr, addr + len - 1);
- if (node == NULL || node->start > addr) {
+ map = vhost_iotlb_itree_first(umem, addr, addr + len - 1);
+ if (map == NULL || map->start > addr) {
if (umem != dev->iotlb) {
ret = -EFAULT;
break;
}
ret = -EAGAIN;
break;
- } else if (!(node->perm & access)) {
+ } else if (!(map->perm & access)) {
ret = -EPERM;
break;
}
_iov = iov + ret;
- size = node->size - addr + node->start;
+ size = map->size - addr + map->start;
_iov->iov_len = min((u64)len - s, size);
_iov->iov_base = (void __user *)(unsigned long)
- (node->userspace_addr + addr - node->start);
+ (map->addr + addr - map->start);
s += size;
addr += size;
++ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a123fd70847e..181382185bbc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,7 @@
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
#include <linux/atomic.h>
+#include <linux/vhost_iotlb.h>
struct vhost_work;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -52,27 +53,6 @@ struct vhost_log {
u64 len;
};
-#define START(node) ((node)->start)
-#define LAST(node) ((node)->last)
-
-struct vhost_umem_node {
- struct rb_node rb;
- struct list_head link;
- __u64 start;
- __u64 last;
- __u64 size;
- __u64 userspace_addr;
- __u32 perm;
- __u32 flags_padding;
- __u64 __subtree_last;
-};
-
-struct vhost_umem {
- struct rb_root_cached umem_tree;
- struct list_head umem_list;
- int numem;
-};
-
enum vhost_uaddr_type {
VHOST_ADDR_DESC = 0,
VHOST_ADDR_AVAIL = 1,
@@ -90,7 +70,7 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_avail __user *avail;
struct vring_used __user *used;
- const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+ const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
@@ -128,8 +108,8 @@ struct vhost_virtqueue {
struct iovec *indirect;
struct vring_used_elem *heads;
/* Protected by virtqueue mutex. */
- struct vhost_umem *umem;
- struct vhost_umem *iotlb;
+ struct vhost_iotlb *umem;
+ struct vhost_iotlb *iotlb;
void *private_data;
u64 acked_features;
u64 acked_backend_features;
@@ -164,8 +144,8 @@ struct vhost_dev {
struct eventfd_ctx *log_ctx;
struct llist_head work_list;
struct task_struct *worker;
- struct vhost_umem *umem;
- struct vhost_umem *iotlb;
+ struct vhost_iotlb *umem;
+ struct vhost_iotlb *iotlb;
spinlock_t iotlb_lock;
struct list_head read_list;
struct list_head pending_list;
@@ -174,16 +154,20 @@ struct vhost_dev {
int weight;
int byte_weight;
u64 kcov_handle;
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg);
};
bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
- int nvqs, int iov_limit, int weight, int byte_weight);
+ int nvqs, int iov_limit, int weight, int byte_weight,
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg));
long vhost_dev_set_owner(struct vhost_dev *dev);
bool vhost_dev_has_owner(struct vhost_dev *dev);
long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_umem *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb);
void vhost_dev_cleanup(struct vhost_dev *);
void vhost_dev_stop(struct vhost_dev *);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
@@ -229,6 +213,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
struct iov_iter *from);
int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_map *map);
+
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
if ((vq)->error_ctx) \
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index a0a2d74967ef..ee0491f579ac 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -13,6 +13,9 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/bvec.h>
+#include <linux/highmem.h>
+#include <linux/vhost_iotlb.h>
#include <uapi/linux/virtio_config.h>
static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
@@ -71,9 +74,11 @@ static inline int __vringh_get_head(const struct vringh *vrh,
}
/* Copy some bytes to/from the iovec. Returns num copied. */
-static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
+ struct vringh_kiov *iov,
void *ptr, size_t len,
- int (*xfer)(void *addr, void *ptr,
+ int (*xfer)(const struct vringh *vrh,
+ void *addr, void *ptr,
size_t len))
{
int err, done = 0;
@@ -82,7 +87,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
size_t partlen;
partlen = min(iov->iov[iov->i].iov_len, len);
- err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
+ err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen);
if (err)
return err;
done += partlen;
@@ -96,6 +101,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
/* Fix up old iov element then increment. */
iov->iov[iov->i].iov_len = iov->consumed;
iov->iov[iov->i].iov_base -= iov->consumed;
+
iov->consumed = 0;
iov->i++;
@@ -227,7 +233,8 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
u64 addr,
struct vringh_range *r),
struct vringh_range *range,
- int (*copy)(void *dst, const void *src, size_t len))
+ int (*copy)(const struct vringh *vrh,
+ void *dst, const void *src, size_t len))
{
size_t part, len = sizeof(struct vring_desc);
@@ -241,7 +248,7 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
if (!rcheck(vrh, addr, &part, range, getrange))
return -EINVAL;
- err = copy(dst, src, part);
+ err = copy(vrh, dst, src, part);
if (err)
return err;
@@ -262,7 +269,8 @@ __vringh_iov(struct vringh *vrh, u16 i,
struct vringh_range *)),
bool (*getrange)(struct vringh *, u64, struct vringh_range *),
gfp_t gfp,
- int (*copy)(void *dst, const void *src, size_t len))
+ int (*copy)(const struct vringh *vrh,
+ void *dst, const void *src, size_t len))
{
int err, count = 0, up_next, desc_max;
struct vring_desc desc, *descs;
@@ -291,7 +299,7 @@ __vringh_iov(struct vringh *vrh, u16 i,
err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
&slowrange, copy);
else
- err = copy(&desc, &descs[i], sizeof(desc));
+ err = copy(vrh, &desc, &descs[i], sizeof(desc));
if (unlikely(err))
goto fail;
@@ -404,7 +412,8 @@ static inline int __vringh_complete(struct vringh *vrh,
unsigned int num_used,
int (*putu16)(const struct vringh *vrh,
__virtio16 *p, u16 val),
- int (*putused)(struct vring_used_elem *dst,
+ int (*putused)(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem
*src, unsigned num))
{
@@ -420,12 +429,12 @@ static inline int __vringh_complete(struct vringh *vrh,
/* Compiler knows num_used == 1 sometimes, hence extra check */
if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
u16 part = vrh->vring.num - off;
- err = putused(&used_ring->ring[off], used, part);
+ err = putused(vrh, &used_ring->ring[off], used, part);
if (!err)
- err = putused(&used_ring->ring[0], used + part,
+ err = putused(vrh, &used_ring->ring[0], used + part,
num_used - part);
} else
- err = putused(&used_ring->ring[off], used, num_used);
+ err = putused(vrh, &used_ring->ring[off], used, num_used);
if (err) {
vringh_bad("Failed to write %u used entries %u at %p",
@@ -564,13 +573,15 @@ static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val)
return put_user(v, (__force __virtio16 __user *)p);
}
-static inline int copydesc_user(void *dst, const void *src, size_t len)
+static inline int copydesc_user(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
-static inline int putused_user(struct vring_used_elem *dst,
+static inline int putused_user(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
@@ -578,13 +589,15 @@ static inline int putused_user(struct vring_used_elem *dst,
sizeof(*dst) * num) ? -EFAULT : 0;
}
-static inline int xfer_from_user(void *src, void *dst, size_t len)
+static inline int xfer_from_user(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
-static inline int xfer_to_user(void *dst, void *src, size_t len)
+static inline int xfer_to_user(const struct vringh *vrh,
+ void *dst, void *src, size_t len)
{
return copy_to_user((__force void __user *)dst, src, len) ?
-EFAULT : 0;
@@ -706,7 +719,7 @@ EXPORT_SYMBOL(vringh_getdesc_user);
*/
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
{
- return vringh_iov_xfer((struct vringh_kiov *)riov,
+ return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov,
dst, len, xfer_from_user);
}
EXPORT_SYMBOL(vringh_iov_pull_user);
@@ -722,7 +735,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user);
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
const void *src, size_t len)
{
- return vringh_iov_xfer((struct vringh_kiov *)wiov,
+ return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov,
(void *)src, len, xfer_to_user);
}
EXPORT_SYMBOL(vringh_iov_push_user);
@@ -832,13 +845,15 @@ static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val)
return 0;
}
-static inline int copydesc_kern(void *dst, const void *src, size_t len)
+static inline int copydesc_kern(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
}
-static inline int putused_kern(struct vring_used_elem *dst,
+static inline int putused_kern(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
@@ -846,13 +861,15 @@ static inline int putused_kern(struct vring_used_elem *dst,
return 0;
}
-static inline int xfer_kern(void *src, void *dst, size_t len)
+static inline int xfer_kern(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
{
memcpy(dst, src, len);
return 0;
}
-static inline int kern_xfer(void *dst, void *src, size_t len)
+static inline int kern_xfer(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
@@ -949,7 +966,7 @@ EXPORT_SYMBOL(vringh_getdesc_kern);
*/
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
- return vringh_iov_xfer(riov, dst, len, xfer_kern);
+ return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);
@@ -964,7 +981,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len)
{
- return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer);
+ return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
@@ -1042,4 +1059,362 @@ int vringh_need_notify_kern(struct vringh *vrh)
}
EXPORT_SYMBOL(vringh_need_notify_kern);
+static int iotlb_translate(const struct vringh *vrh,
+ u64 addr, u64 len, struct bio_vec iov[],
+ int iov_size, u32 perm)
+{
+ struct vhost_iotlb_map *map;
+ struct vhost_iotlb *iotlb = vrh->iotlb;
+ int ret = 0;
+ u64 s = 0;
+
+ while (len > s) {
+ u64 size, pa, pfn;
+
+ if (unlikely(ret >= iov_size)) {
+ ret = -ENOBUFS;
+ break;
+ }
+
+ map = vhost_iotlb_itree_first(iotlb, addr,
+ addr + len - 1);
+ if (!map || map->start > addr) {
+ ret = -EINVAL;
+ break;
+ } else if (!(map->perm & perm)) {
+ ret = -EPERM;
+ break;
+ }
+
+ size = map->size - addr + map->start;
+ pa = map->addr + addr - map->start;
+ pfn = pa >> PAGE_SHIFT;
+ iov[ret].bv_page = pfn_to_page(pfn);
+ iov[ret].bv_len = min(len - s, size);
+ iov[ret].bv_offset = pa & (PAGE_SIZE - 1);
+ s += size;
+ addr += size;
+ ++ret;
+ }
+
+ return ret;
+}
+
+static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
+{
+ struct iov_iter iter;
+ struct bio_vec iov[16];
+ int ret;
+
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
+ len, iov, 16, VHOST_MAP_RO);
+ if (ret < 0)
+ return ret;
+
+ iov_iter_bvec(&iter, READ, iov, ret, len);
+
+ ret = copy_from_iter(dst, len, &iter);
+
+ return ret;
+}
+
+static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
+{
+ struct iov_iter iter;
+ struct bio_vec iov[16];
+ int ret;
+
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
+ len, iov, 16, VHOST_MAP_WO);
+ if (ret < 0)
+ return ret;
+
+ iov_iter_bvec(&iter, WRITE, iov, ret, len);
+
+ return copy_to_iter(src, len, &iter);
+}
+
+static inline int getu16_iotlb(const struct vringh *vrh,
+ u16 *val, const __virtio16 *p)
+{
+ struct bio_vec iov;
+ void *kaddr, *from;
+ int ret;
+
+ /* Atomic read is needed for getu16 */
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+ &iov, 1, VHOST_MAP_RO);
+ if (ret < 0)
+ return ret;
+
+ kaddr = kmap_atomic(iov.bv_page);
+ from = kaddr + iov.bv_offset;
+ *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from));
+ kunmap_atomic(kaddr);
+
+ return 0;
+}
+
+static inline int putu16_iotlb(const struct vringh *vrh,
+ __virtio16 *p, u16 val)
+{
+ struct bio_vec iov;
+ void *kaddr, *to;
+ int ret;
+
+ /* Atomic write is needed for putu16 */
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+ &iov, 1, VHOST_MAP_WO);
+ if (ret < 0)
+ return ret;
+
+ kaddr = kmap_atomic(iov.bv_page);
+ to = kaddr + iov.bv_offset;
+ WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val));
+ kunmap_atomic(kaddr);
+
+ return 0;
+}
+
+static inline int copydesc_iotlb(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
+{
+ int ret;
+
+ ret = copy_from_iotlb(vrh, dst, (void *)src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int xfer_from_iotlb(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
+{
+ int ret;
+
+ ret = copy_from_iotlb(vrh, dst, src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int xfer_to_iotlb(const struct vringh *vrh,
+ void *dst, void *src, size_t len)
+{
+ int ret;
+
+ ret = copy_to_iotlb(vrh, dst, src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int putused_iotlb(const struct vringh *vrh,
+ struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ int size = num * sizeof(*dst);
+ int ret;
+
+ ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst));
+ if (ret != size)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * vringh_init_iotlb - initialize a vringh for a ring with IOTLB.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userpace descriptor pointer.
+ * @avail: the userpace avail pointer.
+ * @used: the userpace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_iotlb(struct vringh *vrh, u64 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used)
+{
+ return vringh_init_kern(vrh, features, num, weak_barriers,
+ desc, avail, used);
+}
+EXPORT_SYMBOL(vringh_init_iotlb);
+
+/**
+ * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
+ * @vrh: the vring
+ * @iotlb: iotlb associated with this vring
+ */
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb)
+{
+ vrh->iotlb = iotlb;
+}
+EXPORT_SYMBOL(vringh_set_iotlb);
+
+/**
+ * vringh_getdesc_iotlb - get next available descriptor from ring with
+ * IOTLB.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_iotlb().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp)
+{
+ int err;
+
+ err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+ gfp, copydesc_iotlb);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_iotlb);
+
+/**
+ * vringh_iov_pull_iotlb - copy bytes from vring_iov.
+ * @vrh: the vring.
+ * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ void *dst, size_t len)
+{
+ return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_pull_iotlb);
+
+/**
+ * vringh_iov_push_iotlb - copy bytes into vring_iov.
+ * @vrh: the vring.
+ * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
+ struct vringh_kiov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_push_iotlb);
+
+/**
+ * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ * vringh_get_iotlb() to undo).
+ *
+ * The next vringh_get_iotlb() will return the old descriptor(s) again.
+ */
+void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet.
+ */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_iotlb);
+
+/**
+ * vringh_complete_iotlb - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_iotlb.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_iotlb() after one or more calls
+ * to this function.
+ */
+int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = cpu_to_vringh32(vrh, head);
+ used.len = cpu_to_vringh32(vrh, len);
+
+ return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb);
+}
+EXPORT_SYMBOL(vringh_complete_iotlb);
+
+/**
+ * vringh_notify_enable_iotlb - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_iotlb(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_enable_iotlb);
+
+/**
+ * vringh_notify_disable_iotlb - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_iotlb(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_disable_iotlb);
+
+/**
+ * vringh_need_notify_iotlb - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_iotlb() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_iotlb(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_need_notify_iotlb);
+
+
MODULE_LICENSE("GPL");
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c2d7d57e98cf..97669484a3f6 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -621,7 +621,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
- VHOST_VSOCK_WEIGHT);
+ VHOST_VSOCK_WEIGHT, NULL);
file->private_data = vsock;
spin_lock_init(&vsock->send_pkt_list_lock);