Diffstat (limited to 'drivers/vhost')
 -rw-r--r--  drivers/vhost/Kconfig  |   1
 -rw-r--r--  drivers/vhost/net.c    | 137
 -rw-r--r--  drivers/vhost/scsi.c   | 892
 -rw-r--r--  drivers/vhost/vdpa.c   |  58
 -rw-r--r--  drivers/vhost/vhost.c  | 241
 -rw-r--r--  drivers/vhost/vhost.h  |   3
 -rw-r--r--  drivers/vhost/vringh.c |  20
 -rw-r--r--  drivers/vhost/vsock.c  |   8

 8 files changed, 894 insertions, 466 deletions
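
Before the raw patch text, a quick orientation on the largest mechanical change in net.c below: the driver's private page_frag plus refcount-bias bookkeeping is replaced with the generic page_frag_cache API. The sketch that follows is illustrative only; the demo_ctx struct and demo_* helpers are invented names, the header location of the API varies by kernel version, and none of this is the driver code itself.

#include <linux/gfp.h>
#include <linux/cache.h>
/* On kernels that split the API out (needed for _init/_drain below): */
#include <linux/page_frag_cache.h>

/* Hypothetical consumer mirroring the pattern the net.c hunk adopts. */
struct demo_ctx {
	struct page_frag_cache pf_cache;
};

static void demo_init(struct demo_ctx *ctx)
{
	/* Replaces the open-coded "page = NULL; refcnt_bias = 0" setup. */
	page_frag_cache_init(&ctx->pf_cache);
}

static void *demo_alloc(struct demo_ctx *ctx, unsigned int len)
{
	/* Refill, cache-line alignment and refcount biasing happen inside. */
	return page_frag_alloc_align(&ctx->pf_cache, len, GFP_KERNEL,
				     SMP_CACHE_BYTES);
}

static void demo_free_frag(void *buf)
{
	/* Error paths drop a single fragment... */
	page_frag_free(buf);
}

static void demo_teardown(struct demo_ctx *ctx)
{
	/* ...and teardown drains whatever the cache still holds. */
	page_frag_cache_drain(&ctx->pf_cache);
}

The visible payoff in the diff is that vhost_net_build_xdp() no longer tracks fragment offsets or a USHRT_MAX refcount bias by hand, and its new error path simply calls page_frag_free() on the buffer it obtained.
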
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index b455d9ab6f3d..020d4fbb947c 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -47,6 +47,7 @@ config VHOST_SCSI tristate "VHOST_SCSI TCM fabric driver" depends on TARGET_CORE && EVENTFD select VHOST + select SG_POOL default n help Say M here to enable the vhost_scsi TCM fabric module diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f2ed7167c848..7cbfc7d718b3 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -141,10 +141,8 @@ struct vhost_net { unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; - /* Private page frag */ - struct page_frag page_frag; - /* Refcount bias of page frag */ - int refcnt_bias; + /* Private page frag cache */ + struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; @@ -382,7 +380,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, } } -static void vhost_zerocopy_callback(struct sk_buff *skb, +static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); @@ -410,6 +408,10 @@ static void vhost_zerocopy_callback(struct sk_buff *skb, rcu_read_unlock_bh(); } +static const struct ubuf_info_ops vhost_ubuf_ops = { + .complete = vhost_zerocopy_complete, +}; + static inline unsigned long busy_clock(void) { return local_clock() >> 10; @@ -655,41 +657,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) !vhost_vq_avail_empty(vq->dev, vq); } -static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, - struct page_frag *pfrag, gfp_t gfp) -{ - if (pfrag->page) { - if (pfrag->offset + sz <= pfrag->size) - return true; - __page_frag_cache_drain(pfrag->page, net->refcnt_bias); - } - - pfrag->offset = 0; - net->refcnt_bias = 0; - if (SKB_FRAG_PAGE_ORDER) { - /* Avoid direct reclaim but allow kswapd to wake */ - pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | __GFP_NOWARN | - __GFP_NORETRY, - SKB_FRAG_PAGE_ORDER); - if (likely(pfrag->page)) { - pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; - goto done; - } - } - pfrag->page = alloc_page(gfp); - if (likely(pfrag->page)) { - pfrag->size = PAGE_SIZE; - goto done; - } - return false; - -done: - net->refcnt_bias = USHRT_MAX; - page_ref_add(pfrag->page, USHRT_MAX - 1); - return true; -} - #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, @@ -699,7 +666,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); - struct page_frag *alloc_frag = &net->page_frag; struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; @@ -710,6 +676,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, int sock_hlen = nvq->sock_hlen; void *buf; int copied; + int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; @@ -719,22 +686,24 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); - alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); - if (unlikely(!vhost_net_page_frag_refill(net, buflen, - alloc_frag, GFP_KERNEL))) + buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, + SMP_CACHE_BYTES); + if (unlikely(!buf)) return -ENOMEM; - buf = (char 
*)page_address(alloc_frag->page) + alloc_frag->offset; - copied = copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + - offsetof(struct tun_xdp_hdr, gso), - sock_hlen, from); - if (copied != sock_hlen) - return -EFAULT; + copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), + sock_hlen, from); + if (copied != sock_hlen) { + ret = -EFAULT; + goto err; + } hdr = buf; gso = &hdr->gso; + if (!sock_hlen) + memset(buf, 0, pad); + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > @@ -743,27 +712,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); - if (vhost16_to_cpu(vq, gso->hdr_len) > len) - return -EINVAL; + if (vhost16_to_cpu(vq, gso->hdr_len) > len) { + ret = -EINVAL; + goto err; + } } len -= sock_hlen; - copied = copy_page_from_iter(alloc_frag->page, - alloc_frag->offset + pad, - len, from); - if (copied != len) - return -EFAULT; + copied = copy_from_iter(buf + pad, len, from); + if (copied != len) { + ret = -EFAULT; + goto err; + } xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; - --net->refcnt_bias; - alloc_frag->offset += buflen; - ++nvq->batched_xdp; return 0; + +err: + page_frag_free(buf); + return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) @@ -783,10 +755,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); + bool busyloop_intr; do { - bool busyloop_intr = false; - + busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); @@ -797,13 +769,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { - if (unlikely(busyloop_intr)) { - vhost_poll_queue(&vq->poll); - } else if (unlikely(vhost_enable_notify(&net->dev, - vq))) { - vhost_disable_notify(&net->dev, vq); - continue; - } + /* Kicks are disabled at this point, break loop and + * process any remaining batched packets. Queue will + * be re-enabled afterwards. + */ break; } @@ -853,7 +822,22 @@ done: ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + /* Kicks are still disabled, dispatch any remaining batched msgs. */ vhost_tx_batch(net, nvq, sock, &msg); + + if (unlikely(busyloop_intr)) + /* If interrupted while doing busy polling, requeue the + * handler to be fair handle_rx as well as other tasks + * waiting on cpu. + */ + vhost_poll_queue(&vq->poll); + else + /* All of our work has been completed; however, before + * leaving the TX handler, do one last check for work, + * and requeue handler if necessary. If there is no work, + * queue will be reenabled. 
+ */ + vhost_net_busy_poll_try_queue(net, vq); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) @@ -911,7 +895,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; - ubuf->ubuf.callback = vhost_zerocopy_callback; + ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; @@ -1135,6 +1119,7 @@ static void handle_rx(struct vhost_net *net) size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; + bool set_num_buffers; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; @@ -1157,6 +1142,8 @@ static void handle_rx(struct vhost_net *net) vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + set_num_buffers = mergeable || + vhost_has_feature(vq, VIRTIO_F_VERSION_1); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, @@ -1233,7 +1220,7 @@ static void handle_rx(struct vhost_net *net) /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); - if (likely(mergeable) && + if (likely(set_num_buffers) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); @@ -1353,8 +1340,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) vqs[VHOST_NET_VQ_RX]); f->private_data = n; - n->page_frag.page = NULL; - n->refcnt_bias = 0; + page_frag_cache_init(&n->pf_cache); return 0; } @@ -1422,8 +1408,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); - if (n->page_frag.page) - __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); + page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 282aac45c690..c12a0d4e6386 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -27,7 +27,7 @@ #include <linux/miscdevice.h> #include <linux/blk_types.h> #include <linux/bio.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <scsi/scsi_common.h> #include <scsi/scsi_proto.h> #include <target/target_core_base.h> @@ -45,6 +45,55 @@ #define VHOST_SCSI_PREALLOC_SGLS 2048 #define VHOST_SCSI_PREALLOC_UPAGES 2048 #define VHOST_SCSI_PREALLOC_PROT_SGLS 2048 +/* + * For the legacy descriptor case we allocate an iov per byte in the + * virtio_scsi_cmd_resp struct. 
+ */ +#define VHOST_SCSI_MAX_RESP_IOVS sizeof(struct virtio_scsi_cmd_resp) + +static unsigned int vhost_scsi_inline_sg_cnt = VHOST_SCSI_PREALLOC_SGLS; + +#ifdef CONFIG_ARCH_NO_SG_CHAIN +static int vhost_scsi_set_inline_sg_cnt(const char *buf, + const struct kernel_param *kp) +{ + pr_err("Setting inline_sg_cnt is not supported.\n"); + return -EOPNOTSUPP; +} +#else +static int vhost_scsi_set_inline_sg_cnt(const char *buf, + const struct kernel_param *kp) +{ + unsigned int cnt; + int ret; + + ret = kstrtouint(buf, 10, &cnt); + if (ret) + return ret; + + if (ret > VHOST_SCSI_PREALLOC_SGLS) { + pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS); + return -EINVAL; + } + + vhost_scsi_inline_sg_cnt = cnt; + return 0; +} +#endif + +static int vhost_scsi_get_inline_sg_cnt(char *buf, + const struct kernel_param *kp) +{ + return sprintf(buf, "%u\n", vhost_scsi_inline_sg_cnt); +} + +static const struct kernel_param_ops vhost_scsi_inline_sg_cnt_op = { + .get = vhost_scsi_get_inline_sg_cnt, + .set = vhost_scsi_set_inline_sg_cnt, +}; + +module_param_cb(inline_sg_cnt, &vhost_scsi_inline_sg_cnt_op, NULL, 0644); +MODULE_PARM_DESC(inline_sg_cnt, "Set the number of scatterlist entries to pre-allocate. The default is 2048."); /* Max number of requests before requeueing the job. * Using this limit prevents one virtqueue from starving others with @@ -62,42 +111,33 @@ struct vhost_scsi_inflight { struct vhost_scsi_cmd { /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ int tvc_vq_desc; - /* virtio-scsi initiator task attribute */ - int tvc_task_attr; - /* virtio-scsi response incoming iovecs */ - int tvc_in_iovs; - /* virtio-scsi initiator data direction */ - enum dma_data_direction tvc_data_direction; - /* Expected data transfer length from virtio-scsi header */ - u32 tvc_exp_data_len; - /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ - u64 tvc_tag; /* The number of scatterlists associated with this cmd */ u32 tvc_sgl_count; u32 tvc_prot_sgl_count; - /* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */ - u32 tvc_lun; u32 copied_iov:1; - const void *saved_iter_addr; - struct iov_iter saved_iter; - /* Pointer to the SGL formatted memory from virtio-scsi */ - struct scatterlist *tvc_sgl; - struct scatterlist *tvc_prot_sgl; - struct page **tvc_upages; - /* Pointer to response header iovec */ - struct iovec *tvc_resp_iov; - /* Pointer to vhost_scsi for our device */ - struct vhost_scsi *tvc_vhost; + const void *read_iov; + struct iov_iter *read_iter; + struct scatterlist *sgl; + struct sg_table table; + struct scatterlist *prot_sgl; + struct sg_table prot_table; + /* Fast path response header iovec used when only one vec is needed */ + struct iovec tvc_resp_iov; + /* Number of iovs for response */ + unsigned int tvc_resp_iovs_cnt; + /* Pointer to response header iovecs if more than one is needed */ + struct iovec *tvc_resp_iovs; /* Pointer to vhost_virtqueue for the cmd */ struct vhost_virtqueue *tvc_vq; - /* Pointer to vhost nexus memory */ - struct vhost_scsi_nexus *tvc_nexus; /* The TCM I/O descriptor that is accessed via container_of() */ struct se_cmd tvc_se_cmd; - /* Copy of the incoming SCSI command descriptor block (CDB) */ - unsigned char tvc_cdb[VHOST_SCSI_MAX_CDB_SIZE]; /* Sense buffer that will be mapped into outgoing status */ unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; + /* + * Dirty write descriptors of this command. 
+ */ + struct vhost_log *tvc_log; + unsigned int tvc_log_num; /* Completed commands list, serviced from vhost worker thread */ struct llist_node tvc_completion_list; /* Used to track inflight cmd */ @@ -187,6 +227,7 @@ struct vhost_scsi_virtqueue { struct vhost_scsi_cmd *scsi_cmds; struct sbitmap scsi_tags; int max_cmds; + struct page **upages; struct vhost_work completion_work; struct llist_head completion_list; @@ -206,10 +247,13 @@ struct vhost_scsi { bool vs_events_missed; /* any missed events, protected by vq->mutex */ int vs_events_nr; /* num of pending events, protected by vq->mutex */ + + unsigned int inline_sg_cnt; }; struct vhost_scsi_tmf { struct vhost_work vwork; + struct work_struct flush_work; struct vhost_scsi *vhost; struct vhost_scsi_virtqueue *svq; @@ -219,6 +263,12 @@ struct vhost_scsi_tmf { struct iovec resp_iov; int in_iovs; int vq_desc; + + /* + * Dirty write descriptors of this command. + */ + struct vhost_log *tmf_log; + unsigned int tmf_log_num; }; /* @@ -323,29 +373,83 @@ static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg) return tpg->tv_fabric_prot_type; } +static int vhost_scsi_copy_cmd_log(struct vhost_virtqueue *vq, + struct vhost_scsi_cmd *cmd, + struct vhost_log *log, + unsigned int log_num) +{ + if (!cmd->tvc_log) + cmd->tvc_log = kmalloc_array(vq->dev->iov_limit, + sizeof(*cmd->tvc_log), + GFP_KERNEL); + + if (unlikely(!cmd->tvc_log)) { + vq_err(vq, "Failed to alloc tvc_log\n"); + return -ENOMEM; + } + + memcpy(cmd->tvc_log, log, sizeof(*cmd->tvc_log) * log_num); + cmd->tvc_log_num = log_num; + + return 0; +} + +static void vhost_scsi_log_write(struct vhost_virtqueue *vq, + struct vhost_log *log, + unsigned int log_num) +{ + if (likely(!vhost_has_feature(vq, VHOST_F_LOG_ALL))) + return; + + if (likely(!log_num || !log)) + return; + + /* + * vhost-scsi doesn't support VIRTIO_F_ACCESS_PLATFORM. + * No requirement for vq->iotlb case. 
+ */ + WARN_ON_ONCE(unlikely(vq->iotlb)); + vhost_log_write(vq, log, log_num, U64_MAX, NULL, 0); +} + static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd) { struct vhost_scsi_cmd *tv_cmd = container_of(se_cmd, struct vhost_scsi_cmd, tvc_se_cmd); struct vhost_scsi_virtqueue *svq = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); + struct vhost_scsi *vs = svq->vs; struct vhost_scsi_inflight *inflight = tv_cmd->inflight; + struct scatterlist *sg; + struct page *page; int i; if (tv_cmd->tvc_sgl_count) { - for (i = 0; i < tv_cmd->tvc_sgl_count; i++) { + for_each_sgtable_sg(&tv_cmd->table, sg, i) { + page = sg_page(sg); + if (!page) + continue; + if (tv_cmd->copied_iov) - __free_page(sg_page(&tv_cmd->tvc_sgl[i])); + __free_page(page); else - put_page(sg_page(&tv_cmd->tvc_sgl[i])); + put_page(page); } - kfree(tv_cmd->saved_iter_addr); + kfree(tv_cmd->read_iter); + kfree(tv_cmd->read_iov); + sg_free_table_chained(&tv_cmd->table, vs->inline_sg_cnt); } if (tv_cmd->tvc_prot_sgl_count) { - for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++) - put_page(sg_page(&tv_cmd->tvc_prot_sgl[i])); + for_each_sgtable_sg(&tv_cmd->prot_table, sg, i) { + page = sg_page(sg); + if (page) + put_page(page); + } + sg_free_table_chained(&tv_cmd->prot_table, vs->inline_sg_cnt); } + if (tv_cmd->tvc_resp_iovs != &tv_cmd->tvc_resp_iov) + kfree(tv_cmd->tvc_resp_iovs); sbitmap_clear_bit(&svq->scsi_tags, se_cmd->map_tag); vhost_scsi_put_inflight(inflight); } @@ -354,18 +458,31 @@ static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf) { struct vhost_scsi_inflight *inflight = tmf->inflight; + /* + * tmf->tmf_log is default NULL unless VHOST_F_LOG_ALL is set. + */ + kfree(tmf->tmf_log); kfree(tmf); vhost_scsi_put_inflight(inflight); } +static void vhost_scsi_drop_cmds(struct vhost_scsi_virtqueue *svq) +{ + struct vhost_scsi_cmd *cmd, *t; + struct llist_node *llnode; + + llnode = llist_del_all(&svq->completion_list); + llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) + vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd); +} + static void vhost_scsi_release_cmd(struct se_cmd *se_cmd) { if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) { struct vhost_scsi_tmf *tmf = container_of(se_cmd, struct vhost_scsi_tmf, se_cmd); - struct vhost_virtqueue *vq = &tmf->svq->vq; - vhost_vq_work_queue(vq, &tmf->vwork); + schedule_work(&tmf->flush_work); } else { struct vhost_scsi_cmd *cmd = container_of(se_cmd, struct vhost_scsi_cmd, tvc_se_cmd); @@ -373,7 +490,8 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd) struct vhost_scsi_virtqueue, vq); llist_add(&cmd->tvc_completion_list, &svq->completion_list); - vhost_vq_work_queue(&svq->vq, &svq->completion_work); + if (!vhost_vq_work_queue(&svq->vq, &svq->completion_work)) + vhost_scsi_drop_cmds(svq); } } @@ -453,6 +571,8 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt) struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; struct virtio_scsi_event *event = &evt->event; struct virtio_scsi_event __user *eventp; + struct vhost_log *vq_log; + unsigned int log_num; unsigned out, in; int head, ret; @@ -463,9 +583,19 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt) again: vhost_disable_notify(&vs->dev, vq); + + vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + + /* + * Reset 'log_num' since vhost_get_vq_desc() may reset it only + * after certain condition checks. 
+ */ + log_num = 0; + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, - NULL, NULL); + vq_log, &log_num); if (head < 0) { vs->vs_events_missed = true; return; @@ -495,12 +625,12 @@ again: vhost_add_used_and_signal(&vs->dev, vq, head, 0); else vq_err(vq, "Faulted on vhost_scsi_send_event\n"); + + vhost_scsi_log_write(vq, vq_log, log_num); } -static void vhost_scsi_evt_work(struct vhost_work *work) +static void vhost_scsi_complete_events(struct vhost_scsi *vs, bool drop) { - struct vhost_scsi *vs = container_of(work, struct vhost_scsi, - vs_event_work); struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; struct vhost_scsi_evt *evt, *t; struct llist_node *llnode; @@ -508,23 +638,34 @@ static void vhost_scsi_evt_work(struct vhost_work *work) mutex_lock(&vq->mutex); llnode = llist_del_all(&vs->vs_event_list); llist_for_each_entry_safe(evt, t, llnode, list) { - vhost_scsi_do_evt_work(vs, evt); + if (!drop) + vhost_scsi_do_evt_work(vs, evt); vhost_scsi_free_evt(vs, evt); } mutex_unlock(&vq->mutex); } +static void vhost_scsi_evt_work(struct vhost_work *work) +{ + struct vhost_scsi *vs = container_of(work, struct vhost_scsi, + vs_event_work); + vhost_scsi_complete_events(vs, false); +} + static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd) { - struct iov_iter *iter = &cmd->saved_iter; - struct scatterlist *sg = cmd->tvc_sgl; + struct iov_iter *iter = cmd->read_iter; + struct scatterlist *sg; struct page *page; size_t len; int i; - for (i = 0; i < cmd->tvc_sgl_count; i++) { - page = sg_page(&sg[i]); - len = sg[i].length; + for_each_sgtable_sg(&cmd->table, sg, i) { + page = sg_page(sg); + if (!page) + continue; + + len = sg->length; if (copy_page_to_iter(page, 0, len, iter) != len) { pr_err("Could not copy data while handling misaligned cmd. 
Error %zu\n", @@ -554,6 +695,9 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) int ret; llnode = llist_del_all(&svq->completion_list); + + mutex_lock(&svq->vq.mutex); + llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) { se_cmd = &cmd->tvc_se_cmd; @@ -561,7 +705,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) cmd, se_cmd->residual_count, se_cmd->scsi_status); memset(&v_rsp, 0, sizeof(v_rsp)); - if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) { + if (cmd->read_iter && vhost_scsi_copy_sgl_to_iov(cmd)) { v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET; } else { v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, @@ -574,8 +718,8 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) se_cmd->scsi_sense_length); } - iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov, - cmd->tvc_in_iovs, sizeof(v_rsp)); + iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iovs, + cmd->tvc_resp_iovs_cnt, sizeof(v_rsp)); ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter); if (likely(ret == sizeof(v_rsp))) { signal = true; @@ -584,63 +728,69 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) } else pr_err("Faulted on virtio_scsi_cmd_resp\n"); + vhost_scsi_log_write(cmd->tvc_vq, cmd->tvc_log, + cmd->tvc_log_num); + vhost_scsi_release_cmd_res(se_cmd); } + mutex_unlock(&svq->vq.mutex); + if (signal) vhost_signal(&svq->vs->dev, &svq->vq); } static struct vhost_scsi_cmd * -vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, - unsigned char *cdb, u64 scsi_tag, u16 lun, u8 task_attr, - u32 exp_data_len, int data_direction) +vhost_scsi_get_cmd(struct vhost_virtqueue *vq, u64 scsi_tag) { struct vhost_scsi_virtqueue *svq = container_of(vq, struct vhost_scsi_virtqueue, vq); struct vhost_scsi_cmd *cmd; - struct vhost_scsi_nexus *tv_nexus; - struct scatterlist *sg, *prot_sg; - struct iovec *tvc_resp_iov; - struct page **pages; + struct scatterlist *sgl, *prot_sgl; + struct vhost_log *log; int tag; - tv_nexus = tpg->tpg_nexus; - if (!tv_nexus) { - pr_err("Unable to locate active struct vhost_scsi_nexus\n"); - return ERR_PTR(-EIO); - } - tag = sbitmap_get(&svq->scsi_tags); if (tag < 0) { - pr_err("Unable to obtain tag for vhost_scsi_cmd\n"); + pr_warn_once("Guest sent too many cmds. 
Returning TASK_SET_FULL.\n"); return ERR_PTR(-ENOMEM); } cmd = &svq->scsi_cmds[tag]; - sg = cmd->tvc_sgl; - prot_sg = cmd->tvc_prot_sgl; - pages = cmd->tvc_upages; - tvc_resp_iov = cmd->tvc_resp_iov; + sgl = cmd->sgl; + prot_sgl = cmd->prot_sgl; + log = cmd->tvc_log; memset(cmd, 0, sizeof(*cmd)); - cmd->tvc_sgl = sg; - cmd->tvc_prot_sgl = prot_sg; - cmd->tvc_upages = pages; + cmd->sgl = sgl; + cmd->prot_sgl = prot_sgl; + cmd->tvc_log = log; cmd->tvc_se_cmd.map_tag = tag; - cmd->tvc_tag = scsi_tag; - cmd->tvc_lun = lun; - cmd->tvc_task_attr = task_attr; - cmd->tvc_exp_data_len = exp_data_len; - cmd->tvc_data_direction = data_direction; - cmd->tvc_nexus = tv_nexus; cmd->inflight = vhost_scsi_get_inflight(vq); - cmd->tvc_resp_iov = tvc_resp_iov; - - memcpy(cmd->tvc_cdb, cdb, VHOST_SCSI_MAX_CDB_SIZE); return cmd; } +static void vhost_scsi_revert_map_iov_to_sgl(struct iov_iter *iter, + struct scatterlist *curr, + struct scatterlist *end) +{ + size_t revert_bytes = 0; + struct page *page; + + while (curr != end) { + page = sg_page(curr); + + if (page) { + put_page(page); + revert_bytes += curr->length; + } + /* Clear so we can re-use it for the copy path */ + sg_set_page(curr, NULL, 0, 0); + curr = sg_next(curr); + } + iov_iter_revert(iter, revert_bytes); +} + /* * Map a user memory range into a scatterlist * @@ -649,14 +799,17 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, static int vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, - struct scatterlist *sgl, + struct sg_table *sg_table, + struct scatterlist **sgl, bool is_prot) { - struct page **pages = cmd->tvc_upages; - struct scatterlist *sg = sgl; - ssize_t bytes, mapped_bytes; - size_t offset, mapped_offset; - unsigned int npages = 0; + struct vhost_scsi_virtqueue *svq = container_of(cmd->tvc_vq, + struct vhost_scsi_virtqueue, vq); + struct page **pages = svq->upages; + struct scatterlist *sg = *sgl; + ssize_t bytes; + size_t offset; + unsigned int n, npages = 0; bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, VHOST_SCSI_PREALLOC_UPAGES, &offset); @@ -664,11 +817,8 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, if (bytes <= 0) return bytes < 0 ? bytes : -EFAULT; - mapped_bytes = bytes; - mapped_offset = offset; - while (bytes) { - unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes); + n = min_t(unsigned int, PAGE_SIZE - offset, bytes); /* * The block layer requires bios/requests to be a multiple of * 512 bytes, but Windows can send us vecs that are misaligned. 
@@ -689,25 +839,24 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, goto revert_iter_get_pages; } - sg_set_page(sg++, pages[npages++], n, offset); + sg_set_page(sg, pages[npages++], n, offset); + sg = sg_next(sg); bytes -= n; offset = 0; } + *sgl = sg; return npages; revert_iter_get_pages: - iov_iter_revert(iter, mapped_bytes); + vhost_scsi_revert_map_iov_to_sgl(iter, *sgl, sg); - npages = 0; - while (mapped_bytes) { - unsigned int n = min_t(unsigned int, PAGE_SIZE - mapped_offset, - mapped_bytes); + iov_iter_revert(iter, bytes); + while (bytes) { + n = min_t(unsigned int, PAGE_SIZE, bytes); put_page(pages[npages++]); - - mapped_bytes -= n; - mapped_offset = 0; + bytes -= n; } return -EINVAL; @@ -735,33 +884,42 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) static int vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, - struct scatterlist *sg, int sg_count) + struct sg_table *sg_table, int sg_count, + int data_dir) { size_t len = iov_iter_count(iter); unsigned int nbytes = 0; + struct scatterlist *sg; struct page *page; - int i; + int i, ret; - if (cmd->tvc_data_direction == DMA_FROM_DEVICE) { - cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter, - GFP_KERNEL); - if (!cmd->saved_iter_addr) + if (data_dir == DMA_FROM_DEVICE) { + cmd->read_iter = kzalloc(sizeof(*cmd->read_iter), GFP_KERNEL); + if (!cmd->read_iter) return -ENOMEM; + + cmd->read_iov = dup_iter(cmd->read_iter, iter, GFP_KERNEL); + if (!cmd->read_iov) { + ret = -ENOMEM; + goto free_iter; + } } - for (i = 0; i < sg_count; i++) { + for_each_sgtable_sg(sg_table, sg, i) { page = alloc_page(GFP_KERNEL); if (!page) { - i--; + ret = -ENOMEM; goto err; } nbytes = min_t(unsigned int, PAGE_SIZE, len); - sg_set_page(&sg[i], page, nbytes, 0); + sg_set_page(sg, page, nbytes, 0); - if (cmd->tvc_data_direction == DMA_TO_DEVICE && - copy_page_from_iter(page, 0, nbytes, iter) != nbytes) + if (data_dir == DMA_TO_DEVICE && + copy_page_from_iter(page, 0, nbytes, iter) != nbytes) { + ret = -EFAULT; goto err; + } len -= nbytes; } @@ -773,66 +931,63 @@ err: pr_err("Could not read %u bytes while handling misaligned cmd\n", nbytes); - for (; i >= 0; i--) - __free_page(sg_page(&sg[i])); - kfree(cmd->saved_iter_addr); - return -ENOMEM; + for_each_sgtable_sg(sg_table, sg, i) { + page = sg_page(sg); + if (page) + __free_page(page); + } + kfree(cmd->read_iov); +free_iter: + kfree(cmd->read_iter); + return ret; } static int vhost_scsi_map_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, - struct scatterlist *sg, int sg_count, bool is_prot) + struct sg_table *sg_table, int sg_count, bool is_prot) { - struct scatterlist *p = sg; - size_t revert_bytes; + struct scatterlist *sg = sg_table->sgl; int ret; while (iov_iter_count(iter)) { - ret = vhost_scsi_map_to_sgl(cmd, iter, sg, is_prot); + ret = vhost_scsi_map_to_sgl(cmd, iter, sg_table, &sg, is_prot); if (ret < 0) { - revert_bytes = 0; - - while (p < sg) { - struct page *page = sg_page(p); - - if (page) { - put_page(page); - revert_bytes += p->length; - } - p++; - } - - iov_iter_revert(iter, revert_bytes); + vhost_scsi_revert_map_iov_to_sgl(iter, sg_table->sgl, + sg); return ret; } - sg += ret; } return 0; } static int -vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, +vhost_scsi_mapal(struct vhost_scsi *vs, struct vhost_scsi_cmd *cmd, size_t prot_bytes, struct iov_iter *prot_iter, - size_t data_bytes, struct iov_iter *data_iter) + size_t data_bytes, struct iov_iter *data_iter, int data_dir) { int sgl_count, ret; if (prot_bytes) { sgl_count 
= vhost_scsi_calc_sgls(prot_iter, prot_bytes, VHOST_SCSI_PREALLOC_PROT_SGLS); - if (sgl_count < 0) - return sgl_count; + cmd->prot_table.sgl = cmd->prot_sgl; + ret = sg_alloc_table_chained(&cmd->prot_table, sgl_count, + cmd->prot_table.sgl, + vs->inline_sg_cnt); + if (ret) + return ret; - sg_init_table(cmd->tvc_prot_sgl, sgl_count); cmd->tvc_prot_sgl_count = sgl_count; pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__, - cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); + cmd->prot_table.sgl, cmd->tvc_prot_sgl_count); ret = vhost_scsi_map_iov_to_sgl(cmd, prot_iter, - cmd->tvc_prot_sgl, + &cmd->prot_table, cmd->tvc_prot_sgl_count, true); if (ret < 0) { + sg_free_table_chained(&cmd->prot_table, + vs->inline_sg_cnt); cmd->tvc_prot_sgl_count = 0; return ret; } @@ -842,20 +997,23 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, if (sgl_count < 0) return sgl_count; - sg_init_table(cmd->tvc_sgl, sgl_count); + cmd->table.sgl = cmd->sgl; + ret = sg_alloc_table_chained(&cmd->table, sgl_count, cmd->table.sgl, + vs->inline_sg_cnt); + if (ret) + return ret; + cmd->tvc_sgl_count = sgl_count; pr_debug("%s data_sg %p data_sgl_count %u\n", __func__, - cmd->tvc_sgl, cmd->tvc_sgl_count); + cmd->table.sgl, cmd->tvc_sgl_count); - ret = vhost_scsi_map_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, + ret = vhost_scsi_map_iov_to_sgl(cmd, data_iter, &cmd->table, cmd->tvc_sgl_count, false); - if (ret == -EINVAL) { - sg_init_table(cmd->tvc_sgl, cmd->tvc_sgl_count); - ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, - cmd->tvc_sgl_count); - } - + if (ret == -EINVAL) + ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, &cmd->table, + cmd->tvc_sgl_count, data_dir); if (ret < 0) { + sg_free_table_chained(&cmd->table, vs->inline_sg_cnt); cmd->tvc_sgl_count = 0; return ret; } @@ -879,32 +1037,33 @@ static int vhost_scsi_to_tcm_attr(int attr) return TCM_SIMPLE_TAG; } -static void vhost_scsi_target_queue_cmd(struct vhost_scsi_cmd *cmd) +static void vhost_scsi_target_queue_cmd(struct vhost_scsi_nexus *nexus, + struct vhost_scsi_cmd *cmd, + unsigned char *cdb, u16 lun, + int task_attr, int data_dir, + u32 exp_data_len) { struct se_cmd *se_cmd = &cmd->tvc_se_cmd; - struct vhost_scsi_nexus *tv_nexus; struct scatterlist *sg_ptr, *sg_prot_ptr = NULL; /* FIXME: BIDI operation */ if (cmd->tvc_sgl_count) { - sg_ptr = cmd->tvc_sgl; + sg_ptr = cmd->table.sgl; if (cmd->tvc_prot_sgl_count) - sg_prot_ptr = cmd->tvc_prot_sgl; + sg_prot_ptr = cmd->prot_table.sgl; else se_cmd->prot_pto = true; } else { sg_ptr = NULL; } - tv_nexus = cmd->tvc_nexus; se_cmd->tag = 0; - target_init_cmd(se_cmd, tv_nexus->tvn_se_sess, &cmd->tvc_sense_buf[0], - cmd->tvc_lun, cmd->tvc_exp_data_len, - vhost_scsi_to_tcm_attr(cmd->tvc_task_attr), - cmd->tvc_data_direction, TARGET_SCF_ACK_KREF); + target_init_cmd(se_cmd, nexus->tvn_se_sess, &cmd->tvc_sense_buf[0], + lun, exp_data_len, vhost_scsi_to_tcm_attr(task_attr), + data_dir, TARGET_SCF_ACK_KREF); - if (target_submit_prep(se_cmd, cmd->tvc_cdb, sg_ptr, + if (target_submit_prep(se_cmd, cdb, sg_ptr, cmd->tvc_sgl_count, NULL, 0, sg_prot_ptr, cmd->tvc_prot_sgl_count, GFP_KERNEL)) return; @@ -913,33 +1072,82 @@ static void vhost_scsi_target_queue_cmd(struct vhost_scsi_cmd *cmd) } static void -vhost_scsi_send_bad_target(struct vhost_scsi *vs, - struct vhost_virtqueue *vq, - int head, unsigned out) +vhost_scsi_send_status(struct vhost_scsi *vs, struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc, u8 status) { - struct virtio_scsi_cmd_resp __user *resp; struct virtio_scsi_cmd_resp rsp; + struct iov_iter iov_iter; 
int ret; memset(&rsp, 0, sizeof(rsp)); - rsp.response = VIRTIO_SCSI_S_BAD_TARGET; - resp = vq->iov[out].iov_base; - ret = __copy_to_user(resp, &rsp, sizeof(rsp)); - if (!ret) - vhost_add_used_and_signal(&vs->dev, vq, head, 0); + rsp.status = status; + + iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[vc->out], vc->in, + sizeof(rsp)); + + ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter); + + if (likely(ret == sizeof(rsp))) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); else pr_err("Faulted on virtio_scsi_cmd_resp\n"); } +#define TYPE_IO_CMD 0 +#define TYPE_CTRL_TMF 1 +#define TYPE_CTRL_AN 2 + +static void +vhost_scsi_send_bad_target(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + struct vhost_scsi_ctx *vc, int type) +{ + union { + struct virtio_scsi_cmd_resp cmd; + struct virtio_scsi_ctrl_tmf_resp tmf; + struct virtio_scsi_ctrl_an_resp an; + } rsp; + struct iov_iter iov_iter; + size_t rsp_size; + int ret; + + memset(&rsp, 0, sizeof(rsp)); + + if (type == TYPE_IO_CMD) { + rsp_size = sizeof(struct virtio_scsi_cmd_resp); + rsp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; + } else if (type == TYPE_CTRL_TMF) { + rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp); + rsp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET; + } else { + rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp); + rsp.an.response = VIRTIO_SCSI_S_BAD_TARGET; + } + + iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[vc->out], vc->in, + rsp_size); + + ret = copy_to_iter(&rsp, rsp_size, &iov_iter); + + if (likely(ret == rsp_size)) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); + else + pr_err("Faulted on virtio scsi type=%d\n", type); +} + static int vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq, - struct vhost_scsi_ctx *vc) + struct vhost_scsi_ctx *vc, + struct vhost_log *log, unsigned int *log_num) { int ret = -ENXIO; + if (likely(log_num)) + *log_num = 0; + vc->head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &vc->out, &vc->in, - NULL, NULL); + log, log_num); pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", vc->head, vc->out, vc->in); @@ -1012,21 +1220,61 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc, /* virtio-scsi spec requires byte 0 of the lun to be 1 */ vq_err(vq, "Illegal virtio-scsi lun: %u\n", *vc->lunp); } else { - struct vhost_scsi_tpg **vs_tpg, *tpg; + struct vhost_scsi_tpg **vs_tpg, *tpg = NULL; + + if (vc->target) { + /* validated at handler entry */ + vs_tpg = vhost_vq_get_backend(vq); + tpg = READ_ONCE(vs_tpg[*vc->target]); + if (unlikely(!tpg)) { + vq_err(vq, "Target 0x%x does not exist\n", *vc->target); + goto out; + } + } - vs_tpg = vhost_vq_get_backend(vq); /* validated at handler entry */ + if (tpgp) + *tpgp = tpg; + ret = 0; + } +out: + return ret; +} - tpg = READ_ONCE(vs_tpg[*vc->target]); - if (unlikely(!tpg)) { - vq_err(vq, "Target 0x%x does not exist\n", *vc->target); - } else { - if (tpgp) - *tpgp = tpg; - ret = 0; - } +static int +vhost_scsi_setup_resp_iovs(struct vhost_scsi_cmd *cmd, struct iovec *in_iovs, + unsigned int in_iovs_cnt) +{ + int i, cnt; + + if (!in_iovs_cnt) + return 0; + /* + * Initiator's normally just put the virtio_scsi_cmd_resp in the first + * iov, but just in case they wedged in some data with it we check for + * greater than or equal to the response struct. 
+ */ + if (in_iovs[0].iov_len >= sizeof(struct virtio_scsi_cmd_resp)) { + cmd->tvc_resp_iovs = &cmd->tvc_resp_iov; + cmd->tvc_resp_iovs_cnt = 1; + } else { + /* + * Legacy descriptor layouts didn't specify that we must put + * the entire response in one iov. Worst case we have a + * iov per byte. + */ + cnt = min(VHOST_SCSI_MAX_RESP_IOVS, in_iovs_cnt); + cmd->tvc_resp_iovs = kcalloc(cnt, sizeof(struct iovec), + GFP_KERNEL); + if (!cmd->tvc_resp_iovs) + return -ENOMEM; + + cmd->tvc_resp_iovs_cnt = cnt; } - return ret; + for (i = 0; i < cmd->tvc_resp_iovs_cnt; i++) + cmd->tvc_resp_iovs[i] = in_iovs[i]; + + return 0; } static u16 vhost_buf_to_lun(u8 *lun_buf) @@ -1040,16 +1288,19 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) struct vhost_scsi_tpg **vs_tpg, *tpg; struct virtio_scsi_cmd_req v_req; struct virtio_scsi_cmd_req_pi v_req_pi; + struct vhost_scsi_nexus *nexus; struct vhost_scsi_ctx vc; struct vhost_scsi_cmd *cmd; struct iov_iter in_iter, prot_iter, data_iter; u64 tag; u32 exp_data_len, data_direction; - int ret, prot_bytes, i, c = 0; + int ret, prot_bytes, c = 0; u16 lun; u8 task_attr; bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); - void *cdb; + u8 *cdb; + struct vhost_log *vq_log; + unsigned int log_num; mutex_lock(&vq->mutex); /* @@ -1065,8 +1316,11 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) vhost_disable_notify(&vs->dev, vq); + vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + do { - ret = vhost_scsi_get_desc(vs, vq, &vc); + ret = vhost_scsi_get_desc(vs, vq, &vc, vq_log, &log_num); if (ret) goto err; @@ -1192,29 +1446,47 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE); goto err; } - cmd = vhost_scsi_get_cmd(vq, tpg, cdb, tag, lun, task_attr, - exp_data_len + prot_bytes, - data_direction); + + nexus = tpg->tpg_nexus; + if (!nexus) { + vq_err(vq, "Unable to locate active struct vhost_scsi_nexus\n"); + ret = -EIO; + goto err; + } + + cmd = vhost_scsi_get_cmd(vq, tag); if (IS_ERR(cmd)) { - vq_err(vq, "vhost_scsi_get_cmd failed %ld\n", - PTR_ERR(cmd)); + ret = PTR_ERR(cmd); + vq_err(vq, "vhost_scsi_get_tag failed %dd\n", ret); goto err; } - cmd->tvc_vhost = vs; cmd->tvc_vq = vq; - for (i = 0; i < vc.in ; i++) - cmd->tvc_resp_iov[i] = vq->iov[vc.out + i]; - cmd->tvc_in_iovs = vc.in; + + ret = vhost_scsi_setup_resp_iovs(cmd, &vq->iov[vc.out], vc.in); + if (ret) { + vq_err(vq, "Failed to alloc recv iovs\n"); + vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd); + goto err; + } + + if (unlikely(vq_log && log_num)) { + ret = vhost_scsi_copy_cmd_log(vq, cmd, vq_log, log_num); + if (unlikely(ret)) { + vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd); + goto err; + } + } pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", - cmd->tvc_cdb[0], cmd->tvc_lun); + cdb[0], lun); pr_debug("cmd: %p exp_data_len: %d, prot_bytes: %d data_direction:" " %d\n", cmd, exp_data_len, prot_bytes, data_direction); if (data_direction != DMA_NONE) { - if (unlikely(vhost_scsi_mapal(cmd, prot_bytes, - &prot_iter, exp_data_len, - &data_iter))) { + ret = vhost_scsi_mapal(vs, cmd, prot_bytes, &prot_iter, + exp_data_len, &data_iter, + data_direction); + if (unlikely(ret)) { vq_err(vq, "Failed to map iov to sgl\n"); vhost_scsi_release_cmd_res(&cmd->tvc_se_cmd); goto err; @@ -1226,7 +1498,9 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) * vhost_scsi_queue_data_in() and vhost_scsi_queue_status() */ cmd->tvc_vq_desc = 
vc.head; - vhost_scsi_target_queue_cmd(cmd); + vhost_scsi_target_queue_cmd(nexus, cmd, cdb, lun, task_attr, + data_direction, + exp_data_len + prot_bytes); ret = 0; err: /* @@ -1234,11 +1508,18 @@ err: * EINVAL: Invalid response buffer, drop the request * EIO: Respond with bad target * EAGAIN: Pending request + * ENOMEM: Could not allocate resources for request */ if (ret == -ENXIO) break; - else if (ret == -EIO) - vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + else if (ret == -EIO) { + vhost_scsi_send_bad_target(vs, vq, &vc, TYPE_IO_CMD); + vhost_scsi_log_write(vq, vq_log, log_num); + } else if (ret == -ENOMEM) { + vhost_scsi_send_status(vs, vq, &vc, + SAM_STAT_TASK_SET_FULL); + vhost_scsi_log_write(vq, vq_log, log_num); + } } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); out: mutex_unlock(&vq->mutex); @@ -1270,38 +1551,43 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work) { struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf, vwork); - struct vhost_virtqueue *ctl_vq, *vq; - int resp_code, i; - - if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) { - /* - * Flush IO vqs that don't share a worker with the ctl to make - * sure they have sent their responses before us. - */ - ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq; - for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) { - vq = &tmf->vhost->vqs[i].vq; - - if (vhost_vq_is_setup(vq) && - vq->worker != ctl_vq->worker) - vhost_vq_flush(vq); - } + int resp_code; + if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; - } else { + else resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED; - } + mutex_lock(&tmf->svq->vq.mutex); vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs, tmf->vq_desc, &tmf->resp_iov, resp_code); + vhost_scsi_log_write(&tmf->svq->vq, tmf->tmf_log, + tmf->tmf_log_num); + mutex_unlock(&tmf->svq->vq.mutex); + vhost_scsi_release_tmf_res(tmf); } +static void vhost_scsi_tmf_flush_work(struct work_struct *work) +{ + struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf, + flush_work); + struct vhost_virtqueue *vq = &tmf->svq->vq; + /* + * Make sure we have sent responses for other commands before we + * send our response. 
+ */ + vhost_dev_flush(vq->dev); + if (!vhost_vq_work_queue(vq, &tmf->vwork)) + vhost_scsi_release_tmf_res(tmf); +} + static void vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg, struct vhost_virtqueue *vq, struct virtio_scsi_ctrl_tmf_req *vtmf, - struct vhost_scsi_ctx *vc) + struct vhost_scsi_ctx *vc, + struct vhost_log *log, unsigned int log_num) { struct vhost_scsi_virtqueue *svq = container_of(vq, struct vhost_scsi_virtqueue, vq); @@ -1320,6 +1606,7 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg, if (!tmf) goto send_reject; + INIT_WORK(&tmf->flush_work, vhost_scsi_tmf_flush_work); vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work); tmf->vhost = vs; tmf->svq = svq; @@ -1328,6 +1615,19 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg, tmf->in_iovs = vc->in; tmf->inflight = vhost_scsi_get_inflight(vq); + if (unlikely(log && log_num)) { + tmf->tmf_log = kmalloc_array(log_num, sizeof(*tmf->tmf_log), + GFP_KERNEL); + if (tmf->tmf_log) { + memcpy(tmf->tmf_log, log, sizeof(*tmf->tmf_log) * log_num); + tmf->tmf_log_num = log_num; + } else { + pr_err("vhost_scsi tmf log allocation error\n"); + vhost_scsi_release_tmf_res(tmf); + goto send_reject; + } + } + if (target_submit_tmr(&tmf->se_cmd, tpg->tpg_nexus->tvn_se_sess, NULL, vhost_buf_to_lun(vtmf->lun), NULL, TMR_LUN_RESET, GFP_KERNEL, 0, @@ -1341,6 +1641,7 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg, send_reject: vhost_scsi_send_tmf_resp(vs, vq, vc->in, vc->head, &vq->iov[vc->out], VIRTIO_SCSI_S_FUNCTION_REJECTED); + vhost_scsi_log_write(vq, log, log_num); } static void @@ -1377,6 +1678,8 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) struct vhost_scsi_ctx vc; size_t typ_size; int ret, c = 0; + struct vhost_log *vq_log; + unsigned int log_num; mutex_lock(&vq->mutex); /* @@ -1390,8 +1693,11 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) vhost_disable_notify(&vs->dev, vq); + vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + do { - ret = vhost_scsi_get_desc(vs, vq, &vc); + ret = vhost_scsi_get_desc(vs, vq, &vc, vq_log, &log_num); if (ret) goto err; @@ -1455,9 +1761,12 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) goto err; if (v_req.type == VIRTIO_SCSI_T_TMF) - vhost_scsi_handle_tmf(vs, tpg, vq, &v_req.tmf, &vc); - else + vhost_scsi_handle_tmf(vs, tpg, vq, &v_req.tmf, &vc, + vq_log, log_num); + else { vhost_scsi_send_an_resp(vs, vq, &vc); + vhost_scsi_log_write(vq, vq_log, log_num); + } err: /* * ENXIO: No more requests, or read error, wait for next kick @@ -1467,8 +1776,13 @@ err: */ if (ret == -ENXIO) break; - else if (ret == -EIO) - vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + else if (ret == -EIO) { + vhost_scsi_send_bad_target(vs, vq, &vc, + v_req.type == VIRTIO_SCSI_T_TMF ? 
+ TYPE_CTRL_TMF : + TYPE_CTRL_AN); + vhost_scsi_log_write(vq, vq_log, log_num); + } } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); out: mutex_unlock(&vq->mutex); @@ -1509,7 +1823,8 @@ vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq, } llist_add(&evt->list, &vs->vs_event_list); - vhost_vq_work_queue(vq, &vs->vs_event_work); + if (!vhost_vq_work_queue(vq, &vs->vs_event_work)) + vhost_scsi_complete_events(vs, true); } static void vhost_scsi_evt_handle_kick(struct vhost_work *work) @@ -1562,6 +1877,24 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) wait_for_completion(&vs->old_inflight[i]->comp); } +static void vhost_scsi_destroy_vq_log(struct vhost_virtqueue *vq) +{ + struct vhost_scsi_virtqueue *svq = container_of(vq, + struct vhost_scsi_virtqueue, vq); + struct vhost_scsi_cmd *tv_cmd; + unsigned int i; + + if (!svq->scsi_cmds) + return; + + for (i = 0; i < svq->max_cmds; i++) { + tv_cmd = &svq->scsi_cmds[i]; + kfree(tv_cmd->tvc_log); + tv_cmd->tvc_log = NULL; + tv_cmd->tvc_log_num = 0; + } +} + static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) { struct vhost_scsi_virtqueue *svq = container_of(vq, @@ -1575,13 +1908,13 @@ static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) for (i = 0; i < svq->max_cmds; i++) { tv_cmd = &svq->scsi_cmds[i]; - kfree(tv_cmd->tvc_sgl); - kfree(tv_cmd->tvc_prot_sgl); - kfree(tv_cmd->tvc_upages); - kfree(tv_cmd->tvc_resp_iov); + kfree(tv_cmd->sgl); + kfree(tv_cmd->prot_sgl); } sbitmap_free(&svq->scsi_tags); + kfree(svq->upages); + vhost_scsi_destroy_vq_log(vq); kfree(svq->scsi_cmds); svq->scsi_cmds = NULL; } @@ -1590,6 +1923,7 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds) { struct vhost_scsi_virtqueue *svq = container_of(vq, struct vhost_scsi_virtqueue, vq); + struct vhost_scsi *vs = svq->vs; struct vhost_scsi_cmd *tv_cmd; unsigned int i; @@ -1607,39 +1941,33 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds) return -ENOMEM; } + svq->upages = kcalloc(VHOST_SCSI_PREALLOC_UPAGES, sizeof(struct page *), + GFP_KERNEL); + if (!svq->upages) + goto out; + for (i = 0; i < max_cmds; i++) { tv_cmd = &svq->scsi_cmds[i]; - tv_cmd->tvc_sgl = kcalloc(VHOST_SCSI_PREALLOC_SGLS, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!tv_cmd->tvc_sgl) { - pr_err("Unable to allocate tv_cmd->tvc_sgl\n"); - goto out; - } - - tv_cmd->tvc_upages = kcalloc(VHOST_SCSI_PREALLOC_UPAGES, - sizeof(struct page *), - GFP_KERNEL); - if (!tv_cmd->tvc_upages) { - pr_err("Unable to allocate tv_cmd->tvc_upages\n"); - goto out; - } - - tv_cmd->tvc_resp_iov = kcalloc(UIO_MAXIOV, - sizeof(struct iovec), - GFP_KERNEL); - if (!tv_cmd->tvc_resp_iov) { - pr_err("Unable to allocate tv_cmd->tvc_resp_iov\n"); - goto out; + if (vs->inline_sg_cnt) { + tv_cmd->sgl = kcalloc(vs->inline_sg_cnt, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!tv_cmd->sgl) { + pr_err("Unable to allocate tv_cmd->sgl\n"); + goto out; + } } - tv_cmd->tvc_prot_sgl = kcalloc(VHOST_SCSI_PREALLOC_PROT_SGLS, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!tv_cmd->tvc_prot_sgl) { - pr_err("Unable to allocate tv_cmd->tvc_prot_sgl\n"); - goto out; + if (vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI) && + vs->inline_sg_cnt) { + tv_cmd->prot_sgl = kcalloc(vs->inline_sg_cnt, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!tv_cmd->prot_sgl) { + pr_err("Unable to allocate tv_cmd->prot_sgl\n"); + goto out; + } } } return 0; @@ -1678,14 +2006,19 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, } } + if (vs->vs_tpg) { 
+ pr_err("vhost-scsi endpoint already set for %s.\n", + vs->vs_vhost_wwpn); + ret = -EEXIST; + goto out; + } + len = sizeof(vs_tpg[0]) * VHOST_SCSI_MAX_TARGET; vs_tpg = kzalloc(len, GFP_KERNEL); if (!vs_tpg) { ret = -ENOMEM; goto out; } - if (vs->vs_tpg) - memcpy(vs_tpg, vs->vs_tpg, len); mutex_lock(&vhost_scsi_mutex); list_for_each_entry(tpg, &vhost_scsi_list, tv_tpg_list) { @@ -1701,12 +2034,6 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, tv_tport = tpg->tport; if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) { - if (vs->vs_tpg && vs->vs_tpg[tpg->tport_tpgt]) { - mutex_unlock(&tpg->tv_tpg_mutex); - mutex_unlock(&vhost_scsi_mutex); - ret = -EEXIST; - goto undepend; - } /* * In order to ensure individual vhost-scsi configfs * groups cannot be removed while in use by vhost ioctl, @@ -1753,15 +2080,15 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, } ret = 0; } else { - ret = -EEXIST; + ret = -ENODEV; + goto free_tpg; } /* - * Act as synchronize_rcu to make sure access to - * old vs->vs_tpg is finished. + * Act as synchronize_rcu to make sure requests after this point + * see a fully setup device. */ vhost_scsi_flush(vs); - kfree(vs->vs_tpg); vs->vs_tpg = vs_tpg; goto out; @@ -1781,6 +2108,7 @@ undepend: target_undepend_item(&tpg->se_tpg.tpg_group.cg_item); } } +free_tpg: kfree(vs_tpg); out: mutex_unlock(&vs->dev.mutex); @@ -1883,6 +2211,7 @@ free_vs_tpg: vhost_scsi_flush(vs); kfree(vs->vs_tpg); vs->vs_tpg = NULL; + memset(vs->vs_vhost_wwpn, 0, sizeof(vs->vs_vhost_wwpn)); WARN_ON(vs->vs_events_nr); mutex_unlock(&vs->dev.mutex); return 0; @@ -1895,6 +2224,7 @@ err_dev: static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) { struct vhost_virtqueue *vq; + bool is_log, was_log; int i; if (features & ~VHOST_SCSI_FEATURES) @@ -1907,12 +2237,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) return -EFAULT; } + if (!vs->dev.nvqs) + goto out; + + is_log = features & (1 << VHOST_F_LOG_ALL); + /* + * All VQs should have same feature. + */ + was_log = vhost_has_feature(&vs->vqs[0].vq, VHOST_F_LOG_ALL); + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; mutex_lock(&vq->mutex); vq->acked_features = features; mutex_unlock(&vq->mutex); } + + /* + * If VHOST_F_LOG_ALL is removed, free tvc_log after + * vq->acked_features is committed. + */ + if (!is_log && was_log) { + for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) { + if (!vs->vqs[i].scsi_cmds) + continue; + + vq = &vs->vqs[i].vq; + mutex_lock(&vq->mutex); + vhost_scsi_destroy_vq_log(vq); + mutex_unlock(&vq->mutex); + } + } + +out: mutex_unlock(&vs->dev.mutex); return 0; } @@ -1927,6 +2284,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) vs = kvzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) goto err_vs; + vs->inline_sg_cnt = vhost_scsi_inline_sg_cnt; if (nvqs > VHOST_SCSI_MAX_IO_VQ) { pr_err("Invalid max_io_vqs of %d. 
Using %d.\n", nvqs, diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index bc4a51e4638b..5a49b5a6d496 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) if (irq < 0) return; - irq_bypass_unregister_producer(&vq->call_ctx.producer); if (!vq->call_ctx.ctx) return; - vq->call_ctx.producer.token = vq->call_ctx.ctx; vq->call_ctx.producer.irq = irq; ret = irq_bypass_register_producer(&vq->call_ctx.producer); if (unlikely(ret)) @@ -595,6 +593,9 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v) const struct vdpa_config_ops *ops = vdpa->config; int ret; + if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK)) + return 0; + if (!ops->suspend) return -EOPNOTSUPP; @@ -615,6 +616,9 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v) const struct vdpa_config_ops *ops = vdpa->config; int ret; + if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK)) + return 0; + if (!ops->resume) return -EOPNOTSUPP; @@ -681,6 +685,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (!ops->set_group_asid) return -EOPNOTSUPP; return ops->set_group_asid(vdpa, idx, s.num); + case VHOST_VDPA_GET_VRING_SIZE: + if (!ops->get_vq_size) + return -EOPNOTSUPP; + s.index = idx; + s.num = ops->get_vq_size(vdpa, idx); + if (copy_to_user(argp, &s, sizeof(s))) + return -EFAULT; + return 0; case VHOST_GET_VRING_BASE: r = ops->get_vq_state(v->vdpa, idx, &vq_state); if (r) @@ -695,6 +707,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, vq->last_avail_idx = vq_state.split.avail_index; } break; + case VHOST_SET_VRING_CALL: + if (vq->call_ctx.ctx) { + if (ops->get_status(vdpa) & + VIRTIO_CONFIG_S_DRIVER_OK) + vhost_vdpa_unsetup_vq_irq(v, idx); + vq->call_ctx.producer.token = NULL; + } + break; } r = vhost_vring_ioctl(&v->vdev, cmd, argp); @@ -733,13 +753,16 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; cb.trigger = vq->call_ctx.ctx; + vq->call_ctx.producer.token = vq->call_ctx.ctx; + if (ops->get_status(vdpa) & + VIRTIO_CONFIG_S_DRIVER_OK) + vhost_vdpa_setup_vq_irq(v, idx); } else { cb.callback = NULL; cb.private = NULL; cb.trigger = NULL; } ops->set_vq_cb(vdpa, idx, &cb); - vhost_vdpa_setup_vq_irq(v, idx); break; case VHOST_SET_VRING_NUM: @@ -1298,26 +1321,24 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct device *dma_dev = vdpa_get_dma_dev(vdpa); - const struct bus_type *bus; int ret; /* Device want to do DMA by itself */ if (ops->set_map || ops->dma_map) return 0; - bus = dma_dev->bus; - if (!bus) - return -EFAULT; - if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) { dev_warn_once(&v->dev, "Failed to allocate domain, device is not IOMMU cache coherent capable\n"); return -ENOTSUPP; } - v->domain = iommu_domain_alloc(bus); - if (!v->domain) - return -EIO; + v->domain = iommu_paging_domain_alloc(dma_dev); + if (IS_ERR(v->domain)) { + ret = PTR_ERR(v->domain); + v->domain = NULL; + return ret; + } ret = iommu_attach_device(v->domain, dma_dev); if (ret) @@ -1407,6 +1428,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) for (i = 0; i < nvqs; i++) { vqs[i] = &v->vqs[i]; vqs[i]->handle_kick = handle_vq_kick; + vqs[i]->call_ctx.ctx = NULL; } vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false, vhost_vdpa_process_iotlb_msg); @@ -1469,13 +1491,7 @@ 
static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf) notify = ops->get_vq_notification(vdpa, index); - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (remap_pfn_range(vma, vmf->address & PAGE_MASK, - PFN_DOWN(notify.addr), PAGE_SIZE, - vma->vm_page_prot)) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; + return vmf_insert_pfn(vma, vmf->address & PAGE_MASK, PFN_DOWN(notify.addr)); } static const struct vm_operations_struct vhost_vdpa_vm_ops = { @@ -1534,7 +1550,7 @@ static void vhost_vdpa_release_dev(struct device *device) struct vhost_vdpa *v = container_of(device, struct vhost_vdpa, dev); - ida_simple_remove(&vhost_vdpa_ida, v->minor); + ida_free(&vhost_vdpa_ida, v->minor); kfree(v->vqs); kfree(v); } @@ -1557,8 +1573,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) if (!v) return -ENOMEM; - minor = ida_simple_get(&vhost_vdpa_ida, 0, - VHOST_VDPA_DEV_MAX, GFP_KERNEL); + minor = ida_alloc_max(&vhost_vdpa_ida, VHOST_VDPA_DEV_MAX - 1, + GFP_KERNEL); if (minor < 0) { kfree(v); return minor; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..3a5ebb973dba 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -263,34 +263,37 @@ bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work) } EXPORT_SYMBOL_GPL(vhost_vq_work_queue); -void vhost_vq_flush(struct vhost_virtqueue *vq) -{ - struct vhost_flush_struct flush; - - init_completion(&flush.wait_event); - vhost_work_init(&flush.work, vhost_flush_work); - - if (vhost_vq_work_queue(vq, &flush.work)) - wait_for_completion(&flush.wait_event); -} -EXPORT_SYMBOL_GPL(vhost_vq_flush); - /** - * vhost_worker_flush - flush a worker + * __vhost_worker_flush - flush a worker * @worker: worker to flush * - * This does not use RCU to protect the worker, so the device or worker - * mutex must be held. + * The worker's flush_mutex must be held. */ -static void vhost_worker_flush(struct vhost_worker *worker) +static void __vhost_worker_flush(struct vhost_worker *worker) { struct vhost_flush_struct flush; + if (!worker->attachment_cnt || worker->killed) + return; + init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); vhost_worker_queue(worker, &flush.work); + /* + * Drop mutex in case our worker is killed and it needs to take the + * mutex to force cleanup. 
+ */ + mutex_unlock(&worker->mutex); wait_for_completion(&flush.wait_event); + mutex_lock(&worker->mutex); +} + +static void vhost_worker_flush(struct vhost_worker *worker) +{ + mutex_lock(&worker->mutex); + __vhost_worker_flush(worker); + mutex_unlock(&worker->mutex); } void vhost_dev_flush(struct vhost_dev *dev) @@ -298,15 +301,8 @@ void vhost_dev_flush(struct vhost_dev *dev) struct vhost_worker *worker; unsigned long i; - xa_for_each(&dev->worker_xa, i, worker) { - mutex_lock(&worker->mutex); - if (!worker->attachment_cnt) { - mutex_unlock(&worker->mutex); - continue; - } + xa_for_each(&dev->worker_xa, i, worker) vhost_worker_flush(worker); - mutex_unlock(&worker->mutex); - } } EXPORT_SYMBOL_GPL(vhost_dev_flush); @@ -392,7 +388,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } -static bool vhost_worker(void *data) +static bool vhost_run_work_list(void *data) { struct vhost_worker *worker = data; struct vhost_work *work, *work_next; @@ -417,6 +413,40 @@ static bool vhost_worker(void *data) return !!node; } +static void vhost_worker_killed(void *data) +{ + struct vhost_worker *worker = data; + struct vhost_dev *dev = worker->dev; + struct vhost_virtqueue *vq; + int i, attach_cnt = 0; + + mutex_lock(&worker->mutex); + worker->killed = true; + + for (i = 0; i < dev->nvqs; i++) { + vq = dev->vqs[i]; + + mutex_lock(&vq->mutex); + if (worker == + rcu_dereference_check(vq->worker, + lockdep_is_held(&vq->mutex))) { + rcu_assign_pointer(vq->worker, NULL); + attach_cnt++; + } + mutex_unlock(&vq->mutex); + } + + worker->attachment_cnt -= attach_cnt; + if (attach_cnt) + synchronize_rcu(); + /* + * Finish vhost_worker_flush calls and any other works that snuck in + * before the synchronize_rcu. + */ + vhost_run_work_list(worker); + mutex_unlock(&worker->mutex); +} + static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) { kfree(vq->indirect); @@ -631,10 +661,12 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) if (!worker) return NULL; + worker->dev = dev; snprintf(name, sizeof(name), "vhost-%d", current->pid); - vtsk = vhost_task_create(vhost_worker, worker, name); - if (!vtsk) + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, + worker, name); + if (IS_ERR(vtsk)) goto free_worker; mutex_init(&worker->mutex); @@ -664,22 +696,37 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, { struct vhost_worker *old_worker; - old_worker = rcu_dereference_check(vq->worker, - lockdep_is_held(&vq->dev->mutex)); - mutex_lock(&worker->mutex); - worker->attachment_cnt++; - mutex_unlock(&worker->mutex); + if (worker->killed) { + mutex_unlock(&worker->mutex); + return; + } + + mutex_lock(&vq->mutex); + + old_worker = rcu_dereference_check(vq->worker, + lockdep_is_held(&vq->mutex)); rcu_assign_pointer(vq->worker, worker); + worker->attachment_cnt++; - if (!old_worker) + if (!old_worker) { + mutex_unlock(&vq->mutex); + mutex_unlock(&worker->mutex); return; + } + mutex_unlock(&vq->mutex); + mutex_unlock(&worker->mutex); + /* * Take the worker mutex to make sure we see the work queued from * device wide flushes which doesn't use RCU for execution. */ mutex_lock(&old_worker->mutex); - old_worker->attachment_cnt--; + if (old_worker->killed) { + mutex_unlock(&old_worker->mutex); + return; + } + /* * We don't want to call synchronize_rcu for every vq during setup * because it will slow down VM startup. 
If we haven't done @@ -690,6 +737,8 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq) && !vq->kick) { mutex_unlock(&vq->mutex); + + old_worker->attachment_cnt--; mutex_unlock(&old_worker->mutex); /* * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID. @@ -705,7 +754,8 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, /* Make sure new vq queue/flush/poll calls see the new worker */ synchronize_rcu(); /* Make sure whatever was queued gets run */ - vhost_worker_flush(old_worker); + __vhost_worker_flush(old_worker); + old_worker->attachment_cnt--; mutex_unlock(&old_worker->mutex); } @@ -754,10 +804,16 @@ static int vhost_free_worker(struct vhost_dev *dev, return -ENODEV; mutex_lock(&worker->mutex); - if (worker->attachment_cnt) { + if (worker->attachment_cnt || worker->killed) { mutex_unlock(&worker->mutex); return -EBUSY; } + /* + * A flush might have raced and snuck in before attachment_cnt was set + * to zero. Make sure flushes are flushed from the queue before + * freeing. + */ + __vhost_worker_flush(worker); mutex_unlock(&worker->mutex); vhost_worker_destroy(dev, worker); @@ -1290,10 +1346,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(&d->vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, &vq->avail->idx); + __virtio16 idx; + int r; + + r = vhost_get_avail(vq, idx, &vq->avail->idx); + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access available index at %p (%d)\n", + &vq->avail->idx, r); + return r; + } + + /* Check it isn't doing very strange things with available indexes */ + vq->avail_idx = vhost16_to_cpu(vq, idx); + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid available index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + /* We're done if there is nothing new */ + if (vq->avail_idx == vq->last_avail_idx) + return 0; + + /* + * We updated vq->avail_idx so we need a memory barrier between + * the index read above and the caller reading avail ring entries. + */ + smp_rmb(); + return 1; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2222,6 +2304,19 @@ static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) return 0; } +/* + * vhost_log_write() - Log in dirty page bitmap + * @vq: vhost virtqueue. + * @log: Array of dirty memory in GPA. + * @log_num: Size of vhost_log array. + * @len: The total length of memory buffer to log in the dirty bitmap. + * Some drivers may only partially use pages shared via the last + * vring descriptor (i.e. vhost-net RX buffer). + * Use (len == U64_MAX) to indicate the driver would log all + * pages of vring descriptors. + * @iov: Array of dirty memory in HVA. + * @count: Size of iovec array. + */ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len, struct iovec *iov, int count) { @@ -2245,15 +2340,14 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, r = log_write(vq->log_base, log[i].addr, l); if (r < 0) return r; - len -= l; - if (!len) { - if (vq->log_ctx) - eventfd_signal(vq->log_ctx); - return 0; - } + + if (len != U64_MAX) + len -= l; } - /* Length written exceeds what we have stored. This is a bug.
*/ - BUG(); + + if (vq->log_ctx) + eventfd_signal(vq->log_ctx); + return 0; } EXPORT_SYMBOL_GPL(vhost_log_write); @@ -2498,38 +2592,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - &vq->avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret; - /* If there's nothing new since last we looked, return - * invalid. - */ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been - * exposed by guest. - */ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2790,25 +2863,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, &avail_idx); - if (unlikely(r)) - return false; - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + r = vhost_get_avail_idx(vq); - return vq->avail_idx == vq->last_avail_idx; + /* Note: we treat error as non-empty here */ + return r == 0; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2832,15 +2901,13 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); - r = vhost_get_avail_idx(vq, &avail_idx); - if (r) { - vq_err(vq, "Failed to check avail idx at %p: %d\n", - &vq->avail->idx, r); + + r = vhost_get_avail_idx(vq); + /* Note: we treat error as empty here */ + if (unlikely(r < 0)) return false; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - return vq->avail_idx != vq->last_avail_idx; + return r; } EXPORT_SYMBOL_GPL(vhost_enable_notify); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 9e942fcda5c3..bb75a292d50c 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -28,12 +28,14 @@ struct vhost_work { struct vhost_worker { struct vhost_task *vtsk; + struct vhost_dev *dev; /* Used to serialize device wide flushing with worker swapping. 
*/ struct mutex mutex; struct llist_head work_list; u64 kcov_handle; u32 id; int attachment_cnt; + bool killed; }; /* Poll a file (eventfd or socket) */ @@ -205,7 +207,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *, struct vhost_log *log, unsigned int *log_num); void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); -void vhost_vq_flush(struct vhost_virtqueue *vq); bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work); bool vhost_vq_has_work(struct vhost_virtqueue *vq); bool vhost_vq_is_setup(struct vhost_virtqueue *vq); diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 7b8fd977f71c..bbce65452701 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -225,10 +225,9 @@ static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) flag = (iov->max_num & VRINGH_IOV_ALLOCATED); if (flag) - new = krealloc_array(iov->iov, new_num, - sizeof(struct iovec), gfp); + new = krealloc_array(iov->iov, new_num, sizeof(*new), gfp); else { - new = kmalloc_array(new_num, sizeof(struct iovec), gfp); + new = kmalloc_array(new_num, sizeof(*new), gfp); if (new) { memcpy(new, iov->iov, iov->max_num * sizeof(struct iovec)); @@ -1291,11 +1290,10 @@ static inline int getu16_iotlb(const struct vringh *vrh, if (ret) return ret; } else { - void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page); - void *from = kaddr + ivec.iov.bvec[0].bv_offset; + __virtio16 *from = bvec_kmap_local(&ivec.iov.bvec[0]); - tmp = READ_ONCE(*(__virtio16 *)from); - kunmap_local(kaddr); + tmp = READ_ONCE(*from); + kunmap_local(from); } *val = vringh16_to_cpu(vrh, tmp); @@ -1330,11 +1328,10 @@ static inline int putu16_iotlb(const struct vringh *vrh, if (ret) return ret; } else { - void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page); - void *to = kaddr + ivec.iov.bvec[0].bv_offset; + __virtio16 *to = bvec_kmap_local(&ivec.iov.bvec[0]); - WRITE_ONCE(*(__virtio16 *)to, tmp); - kunmap_local(kaddr); + WRITE_ONCE(*to, tmp); + kunmap_local(to); } return 0; @@ -1614,4 +1611,5 @@ EXPORT_SYMBOL(vringh_need_notify_iotlb); #endif +MODULE_DESCRIPTION("host side of a virtio ring"); MODULE_LICENSE("GPL"); diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ec20ecff85c7..802153e23073 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, restart_tx = true; } - consume_skb(skb); + virtio_transport_consume_skb_sent(skb, true); } } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); if (added) @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, + .unsent_bytes = virtio_transport_unsent_bytes, + .read_skb = virtio_transport_read_skb, }, @@ -667,6 +669,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) } vsock->guest_cid = 0; /* no CID assigned yet */ + vsock->seqpacket_allow = false; atomic_set(&vsock->queued_replies, 0); @@ -810,8 +813,7 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) goto err; } - if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET)) - vsock->seqpacket_allow = true; + vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; |
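/*
 * Hedged sketch, not part of the patch above: a minimal user-space
 * illustration of the vhost-vsock feature change. With the old code,
 * seqpacket_allow was only ever set to true, so renegotiating features
 * without VIRTIO_VSOCK_F_SEQPACKET left seqpacket enabled; with the
 * unconditional assignment it now tracks the bit. The helper name
 * set_vsock_features(), the vhost_fd parameter and the base feature set
 * are illustrative assumptions; the ioctls and feature bits are real UAPI.
 * Opening /dev/vhost-vsock and the rest of the setup are assumed and omitted.
 */
#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_config.h>
#include <linux/virtio_vsock.h>

static int set_vsock_features(int vhost_fd, bool enable_seqpacket)
{
	uint64_t features = 1ULL << VIRTIO_F_VERSION_1;

	if (enable_seqpacket)
		features |= 1ULL << VIRTIO_VSOCK_F_SEQPACKET;

	/* Calling this twice, first with and then without the bit, now
	 * really disables seqpacket for the device. */
	return ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
}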