Diffstat (limited to 'drivers/vhost')
-rw-r--r--  drivers/vhost/net.c    | 107
-rw-r--r--  drivers/vhost/scsi.c   |  99
-rw-r--r--  drivers/vhost/vdpa.c   |  58
-rw-r--r--  drivers/vhost/vhost.c  | 213
-rw-r--r--  drivers/vhost/vhost.h  |   3
-rw-r--r--  drivers/vhost/vringh.c |   1
-rw-r--r--  drivers/vhost/vsock.c  |   8
7 files changed, 279 insertions, 210 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..b9b9e9d40951 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
 	unsigned tx_zcopy_err;
 	/* Flush in progress. Protected by tx vq lock. */
 	bool tx_flush;
-	/* Private page frag */
-	struct page_frag page_frag;
-	/* Refcount bias of page frag */
-	int refcnt_bias;
+	/* Private page frag cache */
+	struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -382,7 +380,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	}
 }
 
-static void vhost_zerocopy_callback(struct sk_buff *skb,
+static void vhost_zerocopy_complete(struct sk_buff *skb,
 				    struct ubuf_info *ubuf_base, bool success)
 {
 	struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
@@ -410,6 +408,10 @@ static void vhost_zerocopy_callback(struct sk_buff *skb,
 	rcu_read_unlock_bh();
 }
 
+static const struct ubuf_info_ops vhost_ubuf_ops = {
+	.complete = vhost_zerocopy_complete,
+};
+
 static inline unsigned long busy_clock(void)
 {
 	return local_clock() >> 10;
@@ -655,41 +657,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-				       struct page_frag *pfrag, gfp_t gfp)
-{
-	if (pfrag->page) {
-		if (pfrag->offset + sz <= pfrag->size)
-			return true;
-		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-	}
-
-	pfrag->offset = 0;
-	net->refcnt_bias = 0;
-	if (SKB_FRAG_PAGE_ORDER) {
-		/* Avoid direct reclaim but allow kswapd to wake */
-		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
-					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
-					  SKB_FRAG_PAGE_ORDER);
-		if (likely(pfrag->page)) {
-			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-			goto done;
-		}
-	}
-	pfrag->page = alloc_page(gfp);
-	if (likely(pfrag->page)) {
-		pfrag->size = PAGE_SIZE;
-		goto done;
-	}
-	return false;
-
-done:
-	net->refcnt_bias = USHRT_MAX;
-	page_ref_add(pfrag->page, USHRT_MAX - 1);
-	return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +666,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 					     dev);
 	struct socket *sock = vhost_vq_get_backend(vq);
-	struct page_frag *alloc_frag = &net->page_frag;
 	struct virtio_net_hdr *gso;
 	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
 	struct tun_xdp_hdr *hdr;
@@ -710,6 +676,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	int sock_hlen = nvq->sock_hlen;
 	void *buf;
 	int copied;
+	int ret;
 
 	if (unlikely(len < nvq->sock_hlen))
 		return -EFAULT;
@@ -719,22 +686,24 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 		return -ENOSPC;
 
 	buflen += SKB_DATA_ALIGN(len + pad);
-	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-						 alloc_frag, GFP_KERNEL)))
+	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+				    SMP_CACHE_BYTES);
+	if (unlikely(!buf))
 		return -ENOMEM;
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset +
-				     offsetof(struct tun_xdp_hdr, gso),
-				     sock_hlen, from);
-	if (copied != sock_hlen)
-		return -EFAULT;
+	copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+				sock_hlen, from);
+	if (copied != sock_hlen) {
+		ret = -EFAULT;
+		goto err;
+	}
 
 	hdr = buf;
 	gso = &hdr->gso;
 
+	if (!sock_hlen)
+		memset(buf, 0, pad);
+
 	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
 	    vhost16_to_cpu(vq, gso->csum_start) +
 	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
@@ -743,27 +712,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 			       vhost16_to_cpu(vq, gso->csum_start) +
 			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
 
-		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
-			return -EINVAL;
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len) {
+			ret = -EINVAL;
+			goto err;
+		}
 	}
 
 	len -= sock_hlen;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + pad,
-				     len, from);
-	if (copied != len)
-		return -EFAULT;
+	copied = copy_from_iter(buf + pad, len, from);
+	if (copied != len) {
+		ret = -EFAULT;
+		goto err;
+	}
 
 	xdp_init_buff(xdp, buflen, NULL);
 	xdp_prepare_buff(xdp, buf, pad, len, true);
 	hdr->buflen = buflen;
 
-	--net->refcnt_bias;
-	alloc_frag->offset += buflen;
-
 	++nvq->batched_xdp;
 
 	return 0;
+
+err:
+	page_frag_free(buf);
+	return ret;
 }
 
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
@@ -911,7 +883,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
-			ubuf->ubuf.callback = vhost_zerocopy_callback;
+			ubuf->ubuf.ops = &vhost_ubuf_ops;
 			ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
 			refcount_set(&ubuf->ubuf.refcnt, 1);
 			msg.msg_control = &ctl;
@@ -1135,6 +1107,7 @@ static void handle_rx(struct vhost_net *net)
 	size_t vhost_hlen, sock_hlen;
 	size_t vhost_len, sock_len;
 	bool busyloop_intr = false;
+	bool set_num_buffers;
 	struct socket *sock;
 	struct iov_iter fixup;
 	__virtio16 num_buffers;
@@ -1157,6 +1130,8 @@ static void handle_rx(struct vhost_net *net)
 	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+	set_num_buffers = mergeable ||
+			  vhost_has_feature(vq, VIRTIO_F_VERSION_1);
 
 	do {
 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
@@ -1233,7 +1208,7 @@ static void handle_rx(struct vhost_net *net)
 		/* TODO: Should check and handle checksum.
 		 */
 		num_buffers = cpu_to_vhost16(vq, headcount);
-		if (likely(mergeable) &&
+		if (likely(set_num_buffers) &&
 		    copy_to_iter(&num_buffers, sizeof num_buffers,
 				 &fixup) != sizeof num_buffers) {
 			vq_err(vq, "Failed num_buffers write");
@@ -1353,8 +1328,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		       vqs[VHOST_NET_VQ_RX]);
 
 	f->private_data = n;
-	n->page_frag.page = NULL;
-	n->refcnt_bias = 0;
+	page_frag_cache_init(&n->pf_cache);
 	return 0;
 }
 
@@ -1422,8 +1396,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
 	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
 	kfree(n->dev.vqs);
-	if (n->page_frag.page)
-		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
+	page_frag_cache_drain(&n->pf_cache);
 	kvfree(n);
 	return 0;
 }
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 282aac45c690..718fa4e0b31e 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -27,7 +27,7 @@
 #include <linux/miscdevice.h>
 #include <linux/blk_types.h>
 #include <linux/bio.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
 #include <target/target_core_base.h>
@@ -210,6 +210,7 @@ struct vhost_scsi {
 
 struct vhost_scsi_tmf {
 	struct vhost_work vwork;
+	struct work_struct flush_work;
 	struct vhost_scsi *vhost;
 	struct vhost_scsi_virtqueue *svq;
 
@@ -358,14 +359,23 @@ static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf)
 	vhost_scsi_put_inflight(inflight);
 }
 
+static void vhost_scsi_drop_cmds(struct vhost_scsi_virtqueue *svq)
+{
+	struct vhost_scsi_cmd *cmd, *t;
+	struct llist_node *llnode;
+
+	llnode = llist_del_all(&svq->completion_list);
+	llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list)
+		vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd);
+}
+
 static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
 {
 	if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) {
 		struct vhost_scsi_tmf *tmf = container_of(se_cmd,
 					struct vhost_scsi_tmf, se_cmd);
-		struct vhost_virtqueue *vq = &tmf->svq->vq;
 
-		vhost_vq_work_queue(vq, &tmf->vwork);
+		schedule_work(&tmf->flush_work);
 	} else {
 		struct vhost_scsi_cmd *cmd = container_of(se_cmd,
 					struct vhost_scsi_cmd, tvc_se_cmd);
@@ -373,7 +383,8 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
 					struct vhost_scsi_virtqueue, vq);
 
 		llist_add(&cmd->tvc_completion_list, &svq->completion_list);
-		vhost_vq_work_queue(&svq->vq, &svq->completion_work);
+		if (!vhost_vq_work_queue(&svq->vq, &svq->completion_work))
+			vhost_scsi_drop_cmds(svq);
 	}
 }
 
@@ -497,10 +508,8 @@ again:
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
 
-static void vhost_scsi_evt_work(struct vhost_work *work)
+static void vhost_scsi_complete_events(struct vhost_scsi *vs, bool drop)
 {
-	struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
-					vs_event_work);
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct vhost_scsi_evt *evt, *t;
 	struct llist_node *llnode;
@@ -508,12 +517,20 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
 	mutex_lock(&vq->mutex);
 	llnode = llist_del_all(&vs->vs_event_list);
 	llist_for_each_entry_safe(evt, t, llnode, list) {
-		vhost_scsi_do_evt_work(vs, evt);
+		if (!drop)
+			vhost_scsi_do_evt_work(vs, evt);
 		vhost_scsi_free_evt(vs, evt);
 	}
 	mutex_unlock(&vq->mutex);
 }
 
+static void vhost_scsi_evt_work(struct vhost_work *work)
+{
+	struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
+					vs_event_work);
+	vhost_scsi_complete_events(vs, false);
+}
+
 static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
 {
 	struct iov_iter *iter = &cmd->saved_iter;
@@ -1012,20 +1029,23 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc,
 		/* virtio-scsi spec requires byte 0 of the lun to be 1 */
 		vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp);
 	} else {
-		struct vhost_scsi_tpg **vs_tpg, *tpg;
-
-		vs_tpg = vhost_vq_get_backend(vq);	/* validated at handler entry */
-
-		tpg = READ_ONCE(vs_tpg[*vc->target]);
-		if (unlikely(!tpg)) {
-			vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
-		} else {
-			if (tpgp)
-				*tpgp = tpg;
-			ret = 0;
+		struct vhost_scsi_tpg **vs_tpg, *tpg = NULL;
+
+		if (vc->target) {
+			/* validated at handler entry */
+			vs_tpg = vhost_vq_get_backend(vq);
+			tpg = READ_ONCE(vs_tpg[*vc->target]);
+			if (unlikely(!tpg)) {
+				vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
+				goto out;
+			}
 		}
-	}
+		if (tpgp)
+			*tpgp = tpg;
+		ret = 0;
+	}
+out:
 	return ret;
 }
 
@@ -1270,33 +1290,32 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work)
 {
 	struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
 						  vwork);
-	struct vhost_virtqueue *ctl_vq, *vq;
-	int resp_code, i;
-
-	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
-		/*
-		 * Flush IO vqs that don't share a worker with the ctl to make
-		 * sure they have sent their responses before us.
-		 */
-		ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq;
-		for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) {
-			vq = &tmf->vhost->vqs[i].vq;
-
-			if (vhost_vq_is_setup(vq) &&
-			    vq->worker != ctl_vq->worker)
-				vhost_vq_flush(vq);
-		}
+	int resp_code;
+	if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE)
 		resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
-	} else {
+	else
 		resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
-	}
 
 	vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 				 tmf->vq_desc, &tmf->resp_iov, resp_code);
 	vhost_scsi_release_tmf_res(tmf);
 }
 
+static void vhost_scsi_tmf_flush_work(struct work_struct *work)
+{
+	struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
+						  flush_work);
+	struct vhost_virtqueue *vq = &tmf->svq->vq;
+	/*
+	 * Make sure we have sent responses for other commands before we
+	 * send our response.
+	 */
+	vhost_dev_flush(vq->dev);
+	if (!vhost_vq_work_queue(vq, &tmf->vwork))
+		vhost_scsi_release_tmf_res(tmf);
+}
+
 static void
 vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 		      struct vhost_virtqueue *vq,
@@ -1320,6 +1339,7 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 	if (!tmf)
 		goto send_reject;
 
+	INIT_WORK(&tmf->flush_work, vhost_scsi_tmf_flush_work);
 	vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work);
 	tmf->vhost = vs;
 	tmf->svq = svq;
@@ -1509,7 +1529,8 @@ vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
 	}
 
 	llist_add(&evt->list, &vs->vs_event_list);
-	vhost_vq_work_queue(vq, &vs->vs_event_work);
+	if (!vhost_vq_work_queue(vq, &vs->vs_event_work))
+		vhost_scsi_complete_events(vs, true);
 }
 
 static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..5a49b5a6d496 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
 	if (irq < 0)
 		return;
 
-	irq_bypass_unregister_producer(&vq->call_ctx.producer);
 	if (!vq->call_ctx.ctx)
 		return;
 
-	vq->call_ctx.producer.token = vq->call_ctx.ctx;
 	vq->call_ctx.producer.irq = irq;
 	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
 	if (unlikely(ret))
@@ -595,6 +593,9 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
 	const struct vdpa_config_ops *ops = vdpa->config;
 	int ret;
 
+	if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+		return 0;
+
 	if (!ops->suspend)
 		return -EOPNOTSUPP;
 
@@ -615,6 +616,9 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v)
 	const struct vdpa_config_ops *ops = vdpa->config;
 	int ret;
 
+	if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+		return 0;
+
 	if (!ops->resume)
 		return -EOPNOTSUPP;
 
@@ -681,6 +685,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		if (!ops->set_group_asid)
 			return -EOPNOTSUPP;
 		return ops->set_group_asid(vdpa, idx, s.num);
+	case VHOST_VDPA_GET_VRING_SIZE:
+		if (!ops->get_vq_size)
+			return -EOPNOTSUPP;
+		s.index = idx;
+		s.num = ops->get_vq_size(vdpa, idx);
+		if (copy_to_user(argp, &s, sizeof(s)))
+			return -EFAULT;
+		return 0;
 	case VHOST_GET_VRING_BASE:
 		r = ops->get_vq_state(v->vdpa, idx, &vq_state);
 		if (r)
@@ -695,6 +707,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			vq->last_avail_idx = vq_state.split.avail_index;
 		}
 		break;
+	case VHOST_SET_VRING_CALL:
+		if (vq->call_ctx.ctx) {
+			if (ops->get_status(vdpa) &
+			    VIRTIO_CONFIG_S_DRIVER_OK)
+				vhost_vdpa_unsetup_vq_irq(v, idx);
+			vq->call_ctx.producer.token = NULL;
+		}
+		break;
 	}
 
 	r = vhost_vring_ioctl(&v->vdev, cmd, argp);
@@ -733,13 +753,16 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			cb.callback = vhost_vdpa_virtqueue_cb;
 			cb.private = vq;
 			cb.trigger = vq->call_ctx.ctx;
+			vq->call_ctx.producer.token = vq->call_ctx.ctx;
+			if (ops->get_status(vdpa) &
+			    VIRTIO_CONFIG_S_DRIVER_OK)
+				vhost_vdpa_setup_vq_irq(v, idx);
 		} else {
 			cb.callback = NULL;
 			cb.private = NULL;
 			cb.trigger = NULL;
 		}
 		ops->set_vq_cb(vdpa, idx, &cb);
-		vhost_vdpa_setup_vq_irq(v, idx);
 		break;
 
 	case VHOST_SET_VRING_NUM:
@@ -1298,26 +1321,24 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
 	struct vdpa_device *vdpa = v->vdpa;
 	const struct vdpa_config_ops *ops = vdpa->config;
 	struct device *dma_dev = vdpa_get_dma_dev(vdpa);
-	const struct bus_type *bus;
 	int ret;
 
 	/* Device want to do DMA by itself */
 	if (ops->set_map || ops->dma_map)
 		return 0;
 
-	bus = dma_dev->bus;
-	if (!bus)
-		return -EFAULT;
-
 	if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) {
 		dev_warn_once(&v->dev,
 			      "Failed to allocate domain, device is not IOMMU cache coherent capable\n");
 		return -ENOTSUPP;
 	}
 
-	v->domain = iommu_domain_alloc(bus);
-	if (!v->domain)
-		return -EIO;
+	v->domain = iommu_paging_domain_alloc(dma_dev);
+	if (IS_ERR(v->domain)) {
+		ret = PTR_ERR(v->domain);
+		v->domain = NULL;
+		return ret;
+	}
 
 	ret = iommu_attach_device(v->domain, dma_dev);
 	if (ret)
@@ -1407,6 +1428,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
 	for (i = 0; i < nvqs; i++) {
 		vqs[i] = &v->vqs[i];
 		vqs[i]->handle_kick = handle_vq_kick;
+		vqs[i]->call_ctx.ctx = NULL;
 	}
 	vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
 		       vhost_vdpa_process_iotlb_msg);
@@ -1469,13 +1491,7 @@ static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
 
 	notify = ops->get_vq_notification(vdpa, index);
 
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
-			    PFN_DOWN(notify.addr), PAGE_SIZE,
-			    vma->vm_page_prot))
-		return VM_FAULT_SIGBUS;
-
-	return VM_FAULT_NOPAGE;
+	return vmf_insert_pfn(vma, vmf->address & PAGE_MASK, PFN_DOWN(notify.addr));
 }
 
 static const struct vm_operations_struct vhost_vdpa_vm_ops = {
@@ -1534,7 +1550,7 @@ static void vhost_vdpa_release_dev(struct device *device)
 	struct vhost_vdpa *v =
 	       container_of(device, struct vhost_vdpa, dev);
 
-	ida_simple_remove(&vhost_vdpa_ida, v->minor);
+	ida_free(&vhost_vdpa_ida, v->minor);
 	kfree(v->vqs);
 	kfree(v);
 }
@@ -1557,8 +1573,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
 	if (!v)
 		return -ENOMEM;
 
-	minor = ida_simple_get(&vhost_vdpa_ida, 0,
-			       VHOST_VDPA_DEV_MAX, GFP_KERNEL);
+	minor = ida_alloc_max(&vhost_vdpa_ida, VHOST_VDPA_DEV_MAX - 1,
+			      GFP_KERNEL);
 	if (minor < 0) {
 		kfree(v);
 		return minor;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..63612faeab72 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -263,34 +263,37 @@ bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
-void vhost_vq_flush(struct vhost_virtqueue *vq)
-{
-	struct vhost_flush_struct flush;
-
-	init_completion(&flush.wait_event);
-	vhost_work_init(&flush.work, vhost_flush_work);
-
-	if (vhost_vq_work_queue(vq, &flush.work))
-		wait_for_completion(&flush.wait_event);
-}
-EXPORT_SYMBOL_GPL(vhost_vq_flush);
-
 /**
- * vhost_worker_flush - flush a worker
+ * __vhost_worker_flush - flush a worker
  * @worker: worker to flush
  *
- * This does not use RCU to protect the worker, so the device or worker
- * mutex must be held.
+ * The worker's flush_mutex must be held.
  */
-static void vhost_worker_flush(struct vhost_worker *worker)
+static void __vhost_worker_flush(struct vhost_worker *worker)
 {
 	struct vhost_flush_struct flush;
 
+	if (!worker->attachment_cnt || worker->killed)
+		return;
+
 	init_completion(&flush.wait_event);
 	vhost_work_init(&flush.work, vhost_flush_work);
 
 	vhost_worker_queue(worker, &flush.work);
+	/*
+	 * Drop mutex in case our worker is killed and it needs to take the
+	 * mutex to force cleanup.
+	 */
+	mutex_unlock(&worker->mutex);
 	wait_for_completion(&flush.wait_event);
+	mutex_lock(&worker->mutex);
+}
+
+static void vhost_worker_flush(struct vhost_worker *worker)
+{
+	mutex_lock(&worker->mutex);
+	__vhost_worker_flush(worker);
+	mutex_unlock(&worker->mutex);
 }
 
 void vhost_dev_flush(struct vhost_dev *dev)
@@ -298,15 +301,8 @@ void vhost_dev_flush(struct vhost_dev *dev)
 	struct vhost_worker *worker;
 	unsigned long i;
 
-	xa_for_each(&dev->worker_xa, i, worker) {
-		mutex_lock(&worker->mutex);
-		if (!worker->attachment_cnt) {
-			mutex_unlock(&worker->mutex);
-			continue;
-		}
+	xa_for_each(&dev->worker_xa, i, worker)
 		vhost_worker_flush(worker);
-		mutex_unlock(&worker->mutex);
-	}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
@@ -392,7 +388,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	__vhost_vq_meta_reset(vq);
 }
 
-static bool vhost_worker(void *data)
+static bool vhost_run_work_list(void *data)
 {
 	struct vhost_worker *worker = data;
 	struct vhost_work *work, *work_next;
@@ -417,6 +413,40 @@ static bool vhost_worker(void *data)
 	return !!node;
 }
 
+static void vhost_worker_killed(void *data)
+{
+	struct vhost_worker *worker = data;
+	struct vhost_dev *dev = worker->dev;
+	struct vhost_virtqueue *vq;
+	int i, attach_cnt = 0;
+
+	mutex_lock(&worker->mutex);
+	worker->killed = true;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		vq = dev->vqs[i];
+
+		mutex_lock(&vq->mutex);
+		if (worker ==
+		    rcu_dereference_check(vq->worker,
+					  lockdep_is_held(&vq->mutex))) {
+			rcu_assign_pointer(vq->worker, NULL);
+			attach_cnt++;
+		}
+		mutex_unlock(&vq->mutex);
+	}
+
+	worker->attachment_cnt -= attach_cnt;
+	if (attach_cnt)
+		synchronize_rcu();
+	/*
+	 * Finish vhost_worker_flush calls and any other works that snuck in
+	 * before the synchronize_rcu.
+	 */
+	vhost_run_work_list(worker);
+	mutex_unlock(&worker->mutex);
+}
+
 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 {
 	kfree(vq->indirect);
@@ -631,10 +661,12 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 	if (!worker)
 		return NULL;
 
+	worker->dev = dev;
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-	vtsk = vhost_task_create(vhost_worker, worker, name);
-	if (!vtsk)
+	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+				 worker, name);
+	if (IS_ERR(vtsk))
 		goto free_worker;
 
 	mutex_init(&worker->mutex);
@@ -664,22 +696,37 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
 {
 	struct vhost_worker *old_worker;
 
-	old_worker = rcu_dereference_check(vq->worker,
-					   lockdep_is_held(&vq->dev->mutex));
-
 	mutex_lock(&worker->mutex);
-	worker->attachment_cnt++;
-	mutex_unlock(&worker->mutex);
+	if (worker->killed) {
+		mutex_unlock(&worker->mutex);
+		return;
+	}
+
+	mutex_lock(&vq->mutex);
+
+	old_worker = rcu_dereference_check(vq->worker,
+					   lockdep_is_held(&vq->mutex));
 	rcu_assign_pointer(vq->worker, worker);
+	worker->attachment_cnt++;
 
-	if (!old_worker)
+	if (!old_worker) {
+		mutex_unlock(&vq->mutex);
+		mutex_unlock(&worker->mutex);
 		return;
+	}
+	mutex_unlock(&vq->mutex);
+	mutex_unlock(&worker->mutex);
+
 	/*
 	 * Take the worker mutex to make sure we see the work queued from
 	 * device wide flushes which doesn't use RCU for execution.
 	 */
 	mutex_lock(&old_worker->mutex);
-	old_worker->attachment_cnt--;
+	if (old_worker->killed) {
+		mutex_unlock(&old_worker->mutex);
+		return;
+	}
+
 	/*
 	 * We don't want to call synchronize_rcu for every vq during setup
 	 * because it will slow down VM startup. If we haven't done
@@ -690,6 +737,8 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
 	mutex_lock(&vq->mutex);
 	if (!vhost_vq_get_backend(vq) && !vq->kick) {
 		mutex_unlock(&vq->mutex);
+
+		old_worker->attachment_cnt--;
 		mutex_unlock(&old_worker->mutex);
 		/*
 		 * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID.
@@ -705,7 +754,8 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
 	/* Make sure new vq queue/flush/poll calls see the new worker */
 	synchronize_rcu();
 	/* Make sure whatever was queued gets run */
-	vhost_worker_flush(old_worker);
+	__vhost_worker_flush(old_worker);
+	old_worker->attachment_cnt--;
 	mutex_unlock(&old_worker->mutex);
 }
 
@@ -754,10 +804,16 @@ static int vhost_free_worker(struct vhost_dev *dev,
 		return -ENODEV;
 
 	mutex_lock(&worker->mutex);
-	if (worker->attachment_cnt) {
+	if (worker->attachment_cnt || worker->killed) {
 		mutex_unlock(&worker->mutex);
 		return -EBUSY;
 	}
+	/*
+	 * A flush might have raced and snuck in before attachment_cnt was set
+	 * to zero. Make sure flushes are flushed from the queue before
+	 * freeing.
+	 */
+	__vhost_worker_flush(worker);
 	mutex_unlock(&worker->mutex);
 
 	vhost_worker_destroy(dev, worker);
@@ -1290,10 +1346,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 		mutex_unlock(&d->vqs[i]->mutex);
 }
 
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
-				      __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
 {
-	return vhost_get_avail(vq, *idx, &vq->avail->idx);
+	__virtio16 idx;
+	int r;
+
+	r = vhost_get_avail(vq, idx, &vq->avail->idx);
+	if (unlikely(r < 0)) {
+		vq_err(vq, "Failed to access available index at %p (%d)\n",
+		       &vq->avail->idx, r);
+		return r;
+	}
+
+	/* Check it isn't doing very strange thing with available indexes */
+	vq->avail_idx = vhost16_to_cpu(vq, idx);
+	if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
+		vq_err(vq, "Invalid available index change from %u to %u",
+		       vq->last_avail_idx, vq->avail_idx);
+		return -EINVAL;
+	}
+
+	/* We're done if there is nothing new */
+	if (vq->avail_idx == vq->last_avail_idx)
+		return 0;
+
+	/*
+	 * We updated vq->avail_idx so we need a memory barrier between
+	 * the index read above and the caller reading avail ring entries.
+	 */
+	smp_rmb();
+	return 1;
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
@@ -2498,38 +2580,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
 	struct vring_desc desc;
 	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
-	__virtio16 avail_idx;
+	u16 last_avail_idx = vq->last_avail_idx;
 	__virtio16 ring_head;
 	int ret, access;
 
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
-
 	if (vq->avail_idx == vq->last_avail_idx) {
-		if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-			vq_err(vq, "Failed to access avail idx at %p\n",
-				&vq->avail->idx);
-			return -EFAULT;
-		}
-		vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
-		if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-			vq_err(vq, "Guest moved used index from %u to %u",
-				last_avail_idx, vq->avail_idx);
-			return -EFAULT;
-		}
+		ret = vhost_get_avail_idx(vq);
+		if (unlikely(ret < 0))
+			return ret;
 
-		/* If there's nothing new since last we looked, return
-		 * invalid.
-		 */
-		if (vq->avail_idx == last_avail_idx)
+		if (!ret)
 			return vq->num;
-
-		/* Only get avail ring entries after they have been
-		 * exposed by guest.
-		 */
-		smp_rmb();
 	}
 
 	/* Grab the next descriptor number they're advertising, and increment
@@ -2790,25 +2851,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	__virtio16 avail_idx;
 	int r;
 
 	if (vq->avail_idx != vq->last_avail_idx)
 		return false;
 
-	r = vhost_get_avail_idx(vq, &avail_idx);
-	if (unlikely(r))
-		return false;
-	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+	r = vhost_get_avail_idx(vq);
 
-	return vq->avail_idx == vq->last_avail_idx;
+	/* Note: we treat error as non-empty here */
+	return r == 0;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	__virtio16 avail_idx;
 	int r;
 
 	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
@@ -2832,15 +2889,13 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
 	smp_mb();
-	r = vhost_get_avail_idx(vq, &avail_idx);
-	if (r) {
-		vq_err(vq, "Failed to check avail idx at %p: %d\n",
-		       &vq->avail->idx, r);
+
+	r = vhost_get_avail_idx(vq);
+	/* Note: we treat error as empty here */
+	if (unlikely(r < 0))
 		return false;
-	}
-	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
-	return vq->avail_idx != vq->last_avail_idx;
+	return r;
 }
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 9e942fcda5c3..bb75a292d50c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -28,12 +28,14 @@ struct vhost_work {
 
 struct vhost_worker {
 	struct vhost_task *vtsk;
+	struct vhost_dev *dev;
 	/* Used to serialize device wide flushing with worker swapping. */
 	struct mutex mutex;
 	struct llist_head work_list;
 	u64 kcov_handle;
 	u32 id;
 	int attachment_cnt;
+	bool killed;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -205,7 +207,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
-void vhost_vq_flush(struct vhost_virtqueue *vq);
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 7b8fd977f71c..73e153f9b449 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -1614,4 +1614,5 @@ EXPORT_SYMBOL(vringh_need_notify_iotlb);
 
 #endif
 
+MODULE_DESCRIPTION("host side of a virtio ring");
 MODULE_LICENSE("GPL");
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index ec20ecff85c7..802153e23073 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 				restart_tx = true;
 			}
 
-			consume_skb(skb);
+			virtio_transport_consume_skb_sent(skb, true);
 		}
 	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
 	if (added)
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 
 		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,
+
+		.unsent_bytes             = virtio_transport_unsent_bytes,
 
 		.read_skb = virtio_transport_read_skb,
 	},
@@ -667,6 +669,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
 	}
 
 	vsock->guest_cid = 0; /* no CID assigned yet */
+	vsock->seqpacket_allow = false;
 
 	atomic_set(&vsock->queued_replies, 0);
 
@@ -810,8 +813,7 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
 			goto err;
 	}
 
-	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
-		vsock->seqpacket_allow = true;
+	vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET);
 
 	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
 		vq = &vsock->vqs[i];
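
The net.c hunks above replace vhost-net's open-coded page_frag refill (and its refcount bias) with the generic page_frag_cache API. The fragment below is a minimal, editor-added sketch of that allocation pattern, not code from the patch; the demo_* names are invented and exact header locations for struct page_frag_cache differ between kernel versions.

/* Sketch only: mirrors the page_frag_cache usage adopted in net.c. */
#include <linux/cache.h>	/* SMP_CACHE_BYTES */
#include <linux/gfp.h>

struct demo_ctx {
	struct page_frag_cache pf_cache;	/* replaces page_frag + refcnt_bias */
};

static void demo_init(struct demo_ctx *ctx)
{
	page_frag_cache_init(&ctx->pf_cache);
}

static void *demo_alloc(struct demo_ctx *ctx, unsigned int len)
{
	/* One call covers the old refill + page_address() + offset math,
	 * returning a buffer aligned to SMP_CACHE_BYTES. */
	return page_frag_alloc_align(&ctx->pf_cache, len, GFP_KERNEL,
				     SMP_CACHE_BYTES);
}

static void demo_undo(void *buf)
{
	/* Error path: hand a single fragment back. */
	page_frag_free(buf);
}

static void demo_release(struct demo_ctx *ctx)
{
	/* Teardown: drop whatever page the cache still holds. */
	page_frag_cache_drain(&ctx->pf_cache);
}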
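The zerocopy hunks switch vhost-net from assigning a per-ubuf completion callback to pointing at a const ubuf_info_ops table. A hedged sketch of that shape follows; demo_* names are illustrative, not driver symbols.

#include <linux/skbuff.h>

static void demo_zerocopy_complete(struct sk_buff *skb,
				   struct ubuf_info *ubuf_base, bool success)
{
	/* completion accounting for the zerocopy buffer goes here */
}

static const struct ubuf_info_ops demo_ubuf_ops = {
	.complete = demo_zerocopy_complete,
};

static void demo_prepare_ubuf(struct ubuf_info *ubuf)
{
	ubuf->ops = &demo_ubuf_ops;	/* was: ubuf->callback = ... */
	ubuf->flags = SKBFL_ZEROCOPY_FRAG;
	refcount_set(&ubuf->refcnt, 1);
}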
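For vhost-scsi, the TMF response path now bounces through the system workqueue, flushes the whole vhost device there, and handles vhost_vq_work_queue() returning false (worker killed) by cleaning up directly. The sketch below is an editor-added illustration of that flow under invented demo_* names; only the vhost_* and workqueue calls are real APIs.

#include <linux/workqueue.h>
#include "vhost.h"	/* drivers/vhost/vhost.h: vhost_work, vhost_virtqueue */

struct demo_tmf {
	struct vhost_work vwork;	/* runs on the vq's vhost worker */
	struct work_struct flush_work;	/* runs on the system workqueue */
	struct vhost_virtqueue *vq;
};

static void demo_send_resp(struct demo_tmf *tmf) { /* placeholder */ }
static void demo_release(struct demo_tmf *tmf) { /* placeholder */ }

static void demo_tmf_resp_work(struct vhost_work *work)
{
	struct demo_tmf *tmf = container_of(work, struct demo_tmf, vwork);

	demo_send_resp(tmf);
	demo_release(tmf);
}

static void demo_tmf_flush_work(struct work_struct *work)
{
	struct demo_tmf *tmf = container_of(work, struct demo_tmf, flush_work);

	/* Responses for earlier commands must go out before the TMF response. */
	vhost_dev_flush(tmf->vq->dev);
	if (!vhost_vq_work_queue(tmf->vq, &tmf->vwork))
		demo_release(tmf);	/* worker was killed: drop it ourselves */
}

static void demo_tmf_init(struct demo_tmf *tmf, struct vhost_virtqueue *vq)
{
	tmf->vq = vq;
	INIT_WORK(&tmf->flush_work, demo_tmf_flush_work);
	vhost_work_init(&tmf->vwork, demo_tmf_resp_work);
}

/* Called from the target core's completion path: cheap and non-blocking. */
static void demo_tmf_done(struct demo_tmf *tmf)
{
	schedule_work(&tmf->flush_work);
}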
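The new VHOST_VDPA_GET_VRING_SIZE case in vhost_vdpa_vring_ioctl() fills a struct vhost_vring_state with the queue size. Below is a hedged userspace sketch; the device node path is only an example, and kernels whose parent driver lacks ops->get_vq_size return -EOPNOTSUPP.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vhost.h>

int main(void)
{
	struct vhost_vring_state s = { .index = 0 };	/* query vring 0 */
	int fd = open("/dev/vhost-vdpa-0", O_RDWR);	/* example node name */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, VHOST_VDPA_GET_VRING_SIZE, &s) < 0)
		perror("VHOST_VDPA_GET_VRING_SIZE");
	else
		printf("vring %u: %u descriptors\n", s.index, s.num);
	close(fd);
	return 0;
}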
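vhost_vdpa_alloc_domain() now uses iommu_paging_domain_alloc(), which takes the device rather than a bus pointer and reports failure via ERR_PTR(). A minimal sketch of the new error-handling shape, with an invented demo_* helper:

#include <linux/iommu.h>

static int demo_setup_domain(struct device *dma_dev,
			     struct iommu_domain **out)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_paging_domain_alloc(dma_dev);
	if (IS_ERR(domain))
		return PTR_ERR(domain);	/* was: NULL check returning -EIO */

	ret = iommu_attach_device(domain, dma_dev);
	if (ret) {
		iommu_domain_free(domain);
		return ret;
	}

	*out = domain;
	return 0;
}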
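The vhost-vdpa minor-number handling moves from the deprecated ida_simple_get()/ida_simple_remove() pair to ida_alloc_max()/ida_free(). Note that ida_alloc_max() takes an inclusive maximum, hence the VHOST_VDPA_DEV_MAX - 1 in the hunk above. Illustrative sketch:

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDA(demo_ida);

/* Allocates an id in [0, max_devs - 1]; returns a negative errno on failure. */
static int demo_get_minor(unsigned int max_devs)
{
	return ida_alloc_max(&demo_ida, max_devs - 1, GFP_KERNEL);
}

static void demo_put_minor(int minor)
{
	ida_free(&demo_ida, minor);
}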
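The vhost-vdpa fault handler drops the pgprot fiddling plus remap_pfn_range() in favour of vmf_insert_pfn(), which maps a single PFN and returns a vm_fault_t directly. A small sketch, assuming a VM_PFNMAP vma like the doorbell mapping uses:

#include <linux/mm.h>
#include <linux/pfn.h>

static vm_fault_t demo_fault_one_pfn(struct vm_fault *vmf,
				     unsigned long phys_addr)
{
	return vmf_insert_pfn(vmf->vma, vmf->address & PAGE_MASK,
			      PFN_DOWN(phys_addr));
}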
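In vhost.c, vhost_task_create() now takes a second callback that runs when the backing task is killed, and it reports failure via ERR_PTR() instead of NULL (see the vhost_worker_create() hunk). A hedged usage sketch; everything except the vhost_task_* calls is invented.

#include <linux/err.h>
#include <linux/sched/vhost_task.h>

static bool demo_work_fn(void *data)
{
	/* Run queued work; return whether anything was found, as
	 * vhost_run_work_list() does. */
	return false;
}

static void demo_killed_fn(void *data)
{
	/* Detach from users and drain anything still queued. */
}

static int demo_start_worker(void *data)
{
	struct vhost_task *vtsk;

	vtsk = vhost_task_create(demo_work_fn, demo_killed_fn, data,
				 "demo-worker");
	if (IS_ERR(vtsk))
		return PTR_ERR(vtsk);

	vhost_task_start(vtsk);
	return 0;
}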
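Finally, vhost_get_avail_idx() is consolidated so callers no longer byte-swap or re-validate the index themselves: it returns a negative errno on access failure, 0 when nothing new is available, and 1 when new entries exist with the smp_rmb() already issued. The sketch below only restates that caller-side contract, modelled on the vhost_get_vq_desc() hunk; it is not a public API and demo_check_avail() is invented.

static int demo_check_avail(struct vhost_virtqueue *vq)
{
	int ret;

	if (vq->avail_idx == vq->last_avail_idx) {
		ret = vhost_get_avail_idx(vq);
		if (unlikely(ret < 0))
			return ret;	/* vq_err() has already been reported */
		if (!ret)
			return 0;	/* ring is still empty */
	}

	/* New entries are visible; avail ring reads are now ordered. */
	return 1;
}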