diff options
Diffstat (limited to 'drivers/nvme/target/rdma.c')
| -rw-r--r-- | drivers/nvme/target/rdma.c | 1260 |
1 files changed, 901 insertions, 359 deletions
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 56a4cba690b5..9c12b2361a6d 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1,18 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> +#include <linux/blk-integrity.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/err.h> @@ -23,33 +16,45 @@ #include <linux/string.h> #include <linux/wait.h> #include <linux/inet.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> +#include <rdma/ib_cm.h> #include <linux/nvme-rdma.h> #include "nvmet.h" /* - * We allow up to a page of inline data to go with the SQE + * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ -#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_MAX_INLINE_SGE 4 +#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) + +/* Assume mpsmin == device_page_size == 4KB */ +#define NVMET_RDMA_MAX_MDTS 8 +#define NVMET_RDMA_MAX_METADATA_MDTS 5 + +#define NVMET_RDMA_BACKLOG 128 + +#define NVMET_RDMA_DISCRETE_RSP_TAG -1 + +struct nvmet_rdma_srq; struct nvmet_rdma_cmd { - struct ib_sge sge[2]; + struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; struct ib_recv_wr wr; - struct scatterlist inline_sg; - struct page *inline_page; + struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; + struct nvmet_rdma_srq *nsrq; }; enum { NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), - NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), }; struct nvmet_rdma_rsp { @@ -61,39 +66,41 @@ struct nvmet_rdma_rsp { struct nvmet_rdma_queue *queue; struct ib_cqe read_cqe; + struct ib_cqe write_cqe; struct rdma_rw_ctx rw; struct nvmet_req req; + bool allocated; u8 n_rdma; u32 flags; u32 invalidate_rkey; struct list_head wait_list; - struct list_head free_list; + int tag; }; enum nvmet_rdma_queue_state { NVMET_RDMA_Q_CONNECTING, NVMET_RDMA_Q_LIVE, NVMET_RDMA_Q_DISCONNECTING, - NVMET_RDMA_IN_DEVICE_REMOVAL, }; struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; + struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; struct nvmet_rdma_device *dev; + struct nvmet_rdma_srq *nsrq; spinlock_t state_lock; enum nvmet_rdma_queue_state state; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct nvmet_rdma_rsp *rsps; - struct list_head free_rsps; - spinlock_t rsps_lock; + struct sbitmap rsp_tags; struct nvmet_rdma_cmd *cmds; struct work_struct release_work; @@ -103,26 +110,52 @@ struct nvmet_rdma_queue { int idx; int host_qid; + int comp_vector; int recv_queue_size; int send_queue_size; struct list_head queue_list; }; +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + struct delayed_work repair_work; +}; + +struct nvmet_rdma_srq { + struct ib_srq *srq; + struct nvmet_rdma_cmd *cmds; + struct nvmet_rdma_device *ndev; +}; + struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; - struct ib_srq *srq; - struct nvmet_rdma_cmd *srq_cmds; + struct nvmet_rdma_srq **srqs; + int srq_count; size_t srq_size; struct kref ref; struct list_head entry; + int inline_data_size; + int inline_page_count; }; static bool nvmet_rdma_use_srq; module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); MODULE_PARM_DESC(use_srq, "Use shared receive queue."); +static int srq_size_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops srq_size_ops = { + .set = srq_size_set, + .get = param_get_int, +}; + +static int nvmet_rdma_srq_size = 1024; +module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644); +MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)"); + static DEFINE_IDA(nvmet_rdma_queue_ida); static LIST_HEAD(nvmet_rdma_queue_list); static DEFINE_MUTEX(nvmet_rdma_queue_mutex); @@ -134,43 +167,71 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r); +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r, + int tag); + +static const struct nvmet_fabrics_ops nvmet_rdma_ops; + +static int srq_size_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < 256) + return -EINVAL; -static struct nvmet_fabrics_ops nvmet_rdma_ops; + return param_set_int(val, kp); +} -/* XXX: really should move to a generic header sooner or later.. */ -static inline u32 get_unaligned_le24(const u8 *p) +static int num_pages(int len) { - return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); } static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && - !rsp->req.rsp->status && + rsp->req.transfer_len && + !rsp->req.cqe->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline struct nvmet_rdma_rsp * nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) { - struct nvmet_rdma_rsp *rsp; - unsigned long flags; - - spin_lock_irqsave(&queue->rsps_lock, flags); - rsp = list_first_entry(&queue->free_rsps, - struct nvmet_rdma_rsp, free_list); - list_del(&rsp->free_list); - spin_unlock_irqrestore(&queue->rsps_lock, flags); + struct nvmet_rdma_rsp *rsp = NULL; + int tag; + + tag = sbitmap_get(&queue->rsp_tags); + if (tag >= 0) + rsp = &queue->rsps[tag]; + + if (unlikely(!rsp)) { + int ret; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (unlikely(!rsp)) + return NULL; + ret = nvmet_rdma_alloc_rsp(queue->dev, rsp, + NVMET_RDMA_DISCRETE_RSP_TAG); + if (unlikely(ret)) { + kfree(rsp); + return NULL; + } + } return rsp; } @@ -178,64 +239,78 @@ nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) static inline void nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { - unsigned long flags; + if (unlikely(rsp->tag == NVMET_RDMA_DISCRETE_RSP_TAG)) { + nvmet_rdma_free_rsp(rsp->queue->dev, rsp); + kfree(rsp); + return; + } - spin_lock_irqsave(&rsp->queue->rsps_lock, flags); - list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); - spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); + sbitmap_clear_bit(&rsp->queue->rsp_tags, rsp->tag); } -static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) +static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) { struct scatterlist *sg; - int count; + struct ib_sge *sge; + int i; - if (!sgl || !nents) + if (!ndev->inline_data_size) return; - for_each_sg(sgl, sg, nents, count) - __free_page(sg_page(sg)); - kfree(sgl); + sg = c->inline_sg; + sge = &c->sge[1]; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } } -static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, - u32 length) +static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) { struct scatterlist *sg; - struct page *page; - unsigned int nent; - int i = 0; - - nent = DIV_ROUND_UP(length, PAGE_SIZE); - sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) - goto out; - - sg_init_table(sg, nent); - - while (length) { - u32 page_len = min_t(u32, length, PAGE_SIZE); + struct ib_sge *sge; + struct page *pg; + int len; + int i; - page = alloc_page(GFP_KERNEL); - if (!page) - goto out_free_pages; + if (!ndev->inline_data_size) + return 0; - sg_set_page(&sg[i], page, page_len, 0); - length -= page_len; - i++; + sg = c->inline_sg; + sg_init_table(sg, ndev->inline_page_count); + sge = &c->sge[1]; + len = ndev->inline_data_size; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) + goto out_err; + sg_assign_page(sg, pg); + sge->addr = ib_dma_map_page(ndev->device, + pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, sge->addr)) + goto out_err; + sge->length = min_t(int, len, PAGE_SIZE); + sge->lkey = ndev->pd->local_dma_lkey; + len -= sge->length; } - *sgl = sg; - *nents = nent; - return 0; -out_free_pages: - while (i > 0) { - i--; - __free_page(sg_page(&sg[i])); + return 0; +out_err: + for (; i >= 0; i--, sg--, sge--) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); } - kfree(sg); -out: - return NVME_SC_INTERNAL; + return -ENOMEM; } static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, @@ -254,33 +329,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, c->sge[0].length = sizeof(*c->nvme_cmd); c->sge[0].lkey = ndev->pd->local_dma_lkey; - if (!admin) { - c->inline_page = alloc_pages(GFP_KERNEL, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - if (!c->inline_page) - goto out_unmap_cmd; - c->sge[1].addr = ib_dma_map_page(ndev->device, - c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) - goto out_free_inline_page; - c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; - c->sge[1].lkey = ndev->pd->local_dma_lkey; - } + if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) + goto out_unmap_cmd; c->cqe.done = nvmet_rdma_recv_done; c->wr.wr_cqe = &c->cqe; c->wr.sg_list = c->sge; - c->wr.num_sge = admin ? 1 : 2; + c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1; return 0; -out_free_inline_page: - if (!admin) { - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } out_unmap_cmd: ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); @@ -294,12 +353,8 @@ out: static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { - if (!admin) { - ib_dma_unmap_page(ndev->device, c->sge[1].addr, - NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } + if (!admin) + nvmet_rdma_free_inline_pages(ndev, c); ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); kfree(c->nvme_cmd); @@ -312,7 +367,7 @@ nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmds; int ret = -EINVAL, i; - cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + cmds = kvcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); if (!cmds) goto out; @@ -327,7 +382,7 @@ nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, out_free: while (--i >= 0) nvmet_rdma_free_cmd(ndev, cmds + i, admin); - kfree(cmds); + kvfree(cmds); out: return ERR_PTR(ret); } @@ -339,23 +394,25 @@ static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, for (i = 0; i < nr_cmds; i++) nvmet_rdma_free_cmd(ndev, cmds + i, admin); - kfree(cmds); + kvfree(cmds); } static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, - struct nvmet_rdma_rsp *r) + struct nvmet_rdma_rsp *r, int tag) { /* NVMe CQE / RDMA SEND */ - r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); - if (!r->req.rsp) + r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL); + if (!r->req.cqe) goto out; - r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, - sizeof(*r->req.rsp), DMA_TO_DEVICE); + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe, + sizeof(*r->req.cqe), DMA_TO_DEVICE); if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) goto out_free_rsp; - r->send_sge.length = sizeof(*r->req.rsp); + if (ib_dma_pci_p2p_dma_supported(ndev->device)) + r->req.p2p_client = &ndev->device->dev; + r->send_sge.length = sizeof(*r->req.cqe); r->send_sge.lkey = ndev->pd->local_dma_lkey; r->send_cqe.done = nvmet_rdma_send_done; @@ -367,10 +424,14 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, /* Data In / RDMA READ */ r->read_cqe.done = nvmet_rdma_read_data_done; + /* Data Out / RDMA WRITE */ + r->write_cqe.done = nvmet_rdma_write_data_done; + r->tag = tag; + return 0; out_free_rsp: - kfree(r->req.rsp); + kfree(r->req.cqe); out: return -ENOMEM; } @@ -379,8 +440,8 @@ static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r) { ib_dma_unmap_single(ndev->device, r->send_sge.addr, - sizeof(*r->req.rsp), DMA_TO_DEVICE); - kfree(r->req.rsp); + sizeof(*r->req.cqe), DMA_TO_DEVICE); + kfree(r->req.cqe); } static int @@ -388,33 +449,33 @@ nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) { struct nvmet_rdma_device *ndev = queue->dev; int nr_rsps = queue->recv_queue_size * 2; - int ret = -EINVAL, i; + int ret = -ENOMEM, i; - queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + if (sbitmap_init_node(&queue->rsp_tags, nr_rsps, -1, GFP_KERNEL, + NUMA_NO_NODE, false, true)) + goto out; + + queue->rsps = kvcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), GFP_KERNEL); if (!queue->rsps) - goto out; + goto out_free_sbitmap; for (i = 0; i < nr_rsps; i++) { struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - ret = nvmet_rdma_alloc_rsp(ndev, rsp); + ret = nvmet_rdma_alloc_rsp(ndev, rsp, i); if (ret) goto out_free; - - list_add_tail(&rsp->free_list, &queue->free_rsps); } return 0; out_free: - while (--i >= 0) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } - kfree(queue->rsps); + while (--i >= 0) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); + kvfree(queue->rsps); +out_free_sbitmap: + sbitmap_free(&queue->rsp_tags); out: return ret; } @@ -424,27 +485,30 @@ static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) struct nvmet_rdma_device *ndev = queue->dev; int i, nr_rsps = queue->recv_queue_size * 2; - for (i = 0; i < nr_rsps; i++) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } - kfree(queue->rsps); + for (i = 0; i < nr_rsps; i++) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); + kvfree(queue->rsps); + sbitmap_free(&queue->rsp_tags); } static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmd) { - struct ib_recv_wr *bad_wr; + int ret; ib_dma_sync_single_for_device(ndev->device, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); - if (ndev->srq) - return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); - return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + if (cmd->nsrq) + ret = ib_post_srq_recv(cmd->nsrq->srq, &cmd->wr, NULL); + else + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); + + if (unlikely(ret)) + pr_err("post_recv cmd failed\n"); + + return ret; } static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) @@ -470,6 +534,129 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) spin_unlock(&queue->rsp_wr_wait_lock); } +static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + u16 status = 0; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + return NVME_SC_INVALID_PI; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + status = NVME_SC_GUARD_CHECK; + break; + case IB_SIG_BAD_REFTAG: + status = NVME_SC_REFTAG_CHECK; + break; + case IB_SIG_BAD_APPTAG: + status = NVME_SC_APPTAG_CHECK; + break; + } + pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + } + + return status; +} + +static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_domain *domain, + u16 control, u8 pi_type) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = 1 << bi->interval_exp; + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; + + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); + domain->sig.dif.app_escape = true; + if (pi_type == NVME_NS_DPS_PI_TYPE3) + domain->sig.dif.ref_escape = true; +} + +static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req, + struct ib_sig_attrs *sig_attrs) +{ + struct nvme_command *cmd = req->cmd; + u16 control = le16_to_cpu(cmd->rw.control); + u8 pi_type = req->ns->pi_type; + struct blk_integrity *bi; + + bi = bdev_get_integrity(req->ns->bdev); + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + + if (control & NVME_RW_PRINFO_PRACT) { + /* for WRITE_INSERT/READ_STRIP no wire domain */ + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + /* Clear the PRACT bit since HCA will generate/verify the PI */ + control &= ~NVME_RW_PRINFO_PRACT; + cmd->rw.control = cpu_to_le16(control); + /* PI is added by the HW */ + req->transfer_len += req->metadata_len; + } else { + /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + } + + if (control & NVME_RW_PRINFO_PRCHK_REF) + sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; + if (control & NVME_RW_PRINFO_PRCHK_GUARD) + sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; + if (control & NVME_RW_PRINFO_PRCHK_APP) + sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; +} + +static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key, + struct ib_sig_attrs *sig_attrs) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + int ret; + + if (req->metadata_len) + ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, sig_attrs, + addr, key, nvmet_data_dir(req)); + else + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, 0, addr, key, + nvmet_data_dir(req)); + + return ret; +} + +static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + + if (req->metadata_len) + rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, + nvmet_data_dir(req)); + else + rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, nvmet_data_dir(req)); +} static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) { @@ -477,14 +664,11 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); - if (rsp->n_rdma) { - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, - queue->cm_id->port_num, rsp->req.sg, - rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); - } + if (rsp->n_rdma) + nvmet_rdma_rw_ctx_destroy(rsp); - if (rsp->req.sg != &rsp->cmd->inline_sg) - nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + if (rsp->req.sg != rsp->cmd->inline_sg) + nvmet_req_free_sgls(&rsp->req); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) nvmet_rdma_process_wr_wait_list(queue); @@ -510,6 +694,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; nvmet_rdma_release_rsp(rsp); @@ -517,7 +702,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR)) { pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); - nvmet_rdma_error_comp(rsp->queue); + nvmet_rdma_error_comp(queue); } } @@ -526,20 +711,25 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *cm_id = rsp->queue->cm_id; - struct ib_send_wr *first_wr, *bad_wr; + struct ib_send_wr *first_wr; - if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + if (rsp->invalidate_rkey) { rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; } else { rsp->send_wr.opcode = IB_WR_SEND; } - if (nvmet_rdma_need_data_out(rsp)) - first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, - cm_id->port_num, NULL, &rsp->send_wr); - else + if (nvmet_rdma_need_data_out(rsp)) { + if (rsp->req.metadata_len) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, &rsp->write_cqe, NULL); + else + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + } else { first_wr = &rsp->send_wr; + } nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); @@ -547,7 +737,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) rsp->send_sge.addr, rsp->send_sge.length, DMA_TO_DEVICE); - if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } @@ -557,16 +747,15 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + u16 status = 0; WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, - queue->cm_id->port_num, rsp->req.sg, - rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); rsp->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); nvmet_req_uninit(&rsp->req); nvmet_rdma_release_rsp(rsp); if (wc->status != IB_WC_WR_FLUSH_ERR) { @@ -577,16 +766,81 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - rsp->req.execute(&rsp->req); + if (rsp->req.metadata_len) + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(status)) + nvmet_req_complete(&rsp->req, status); + else + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u16 status; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); + nvmet_req_uninit(&rsp->req); + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA WRITE for CQE failed with status %s (%d).\n", + ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + /* + * Upon RDMA completion check the signature status + * - if succeeded send good NVMe response + * - if failed send bad NVMe response with appropriate error + */ + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + if (unlikely(status)) + rsp->req.cqe->status = cpu_to_le16(status << 1); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, u64 off) { - sg_init_table(&rsp->cmd->inline_sg, 1); - sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); - rsp->req.sg = &rsp->cmd->inline_sg; - rsp->req.sg_cnt = 1; + int sg_count = num_pages(len); + struct scatterlist *sg; + int i; + + sg = rsp->cmd->inline_sg; + for (i = 0; i < sg_count; i++, sg++) { + if (i < sg_count - 1) + sg_unmark_end(sg); + else + sg_mark_end(sg); + sg->offset = off; + sg->length = min_t(int, len, PAGE_SIZE - off); + len -= sg->length; + if (!i) + off = 0; + } + + rsp->req.sg = rsp->cmd->inline_sg; + rsp->req.sg_cnt = sg_count; } static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) @@ -595,12 +849,15 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) u64 off = le64_to_cpu(sgl->addr); u32 len = le32_to_cpu(sgl->length); - if (!nvme_is_write(rsp->req.cmd)) - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + if (!nvme_is_write(rsp->req.cmd)) { + rsp->req.error_loc = + offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } - if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); - return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; } /* no data command? */ @@ -609,41 +866,44 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) nvmet_rdma_use_inline_sg(rsp, len, off); rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; return 0; } static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, struct nvme_keyed_sgl_desc *sgl, bool invalidate) { - struct rdma_cm_id *cm_id = rsp->queue->cm_id; u64 addr = le64_to_cpu(sgl->addr); - u32 len = get_unaligned_le24(sgl->length); u32 key = get_unaligned_le32(sgl->key); + struct ib_sig_attrs sig_attrs; int ret; - u16 status; + + rsp->req.transfer_len = get_unaligned_le24(sgl->length); /* no data command? */ - if (!len) + if (!rsp->req.transfer_len) return 0; - status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, - len); - if (status) - return status; + if (rsp->req.metadata_len) + nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs); + + ret = nvmet_req_alloc_sgls(&rsp->req); + if (unlikely(ret < 0)) + goto error_out; - ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, - rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, - nvmet_data_dir(&rsp->req)); - if (ret < 0) - return NVME_SC_INTERNAL; + ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs); + if (unlikely(ret < 0)) + goto error_out; rsp->n_rdma += ret; - if (invalidate) { + if (invalidate) rsp->invalidate_rkey = key; - rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; - } return 0; + +error_out: + rsp->req.transfer_len = 0; + return NVME_SC_INTERNAL; } static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) @@ -657,7 +917,9 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_inline(rsp); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: switch (sgl->type & 0xf) { @@ -667,11 +929,14 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); - return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); + return NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR; } } @@ -689,7 +954,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) } if (nvmet_rdma_need_data_in(rsp)) { - if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { @@ -711,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, cmd->send_sge.addr, cmd->send_sge.length, DMA_TO_DEVICE); - if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_rdma_ops)) + if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops)) return; status = nvmet_rdma_map_sgl(cmd); @@ -731,11 +995,32 @@ out_err: nvmet_req_complete(&cmd->req, status); } +static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&queue->state_lock, flags); + /* + * recheck queue state is not live to prevent a race condition + * with RDMA_CM_EVENT_ESTABLISHED handler. + */ + if (queue->state == NVMET_RDMA_Q_LIVE) + ret = false; + else if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return ret; +} + static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct nvmet_rdma_rsp *rsp; if (unlikely(wc->status != IB_WC_SUCCESS)) { @@ -756,52 +1041,107 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) cmd->queue = queue; rsp = nvmet_rdma_get_rsp(queue); + if (unlikely(!rsp)) { + /* + * we get here only under memory pressure, + * silently drop and have the host retry + * as we can't even fail it. + */ + nvmet_rdma_post_recv(queue->dev, cmd); + return; + } rsp->queue = queue; rsp->cmd = cmd; rsp->flags = 0; rsp->req.cmd = cmd->nvme_cmd; rsp->req.port = queue->port; rsp->n_rdma = 0; + rsp->invalidate_rkey = 0; - if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { - unsigned long flags; - - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state == NVMET_RDMA_Q_CONNECTING) - list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); - else - nvmet_rdma_put_rsp(rsp); - spin_unlock_irqrestore(&queue->state_lock, flags); + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) && + nvmet_rdma_recv_not_live(queue, rsp)) return; - } nvmet_rdma_handle_command(queue, rsp); } -static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_srq *nsrq) +{ + nvmet_rdma_free_cmds(nsrq->ndev, nsrq->cmds, nsrq->ndev->srq_size, + false); + ib_destroy_srq(nsrq->srq); + + kfree(nsrq); +} + +static void nvmet_rdma_destroy_srqs(struct nvmet_rdma_device *ndev) { - if (!ndev->srq) + int i; + + if (!ndev->srqs) return; - nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); - ib_destroy_srq(ndev->srq); + for (i = 0; i < ndev->srq_count; i++) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + + kfree(ndev->srqs); } -static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +static struct nvmet_rdma_srq * +nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) { struct ib_srq_init_attr srq_attr = { NULL, }; + size_t srq_size = ndev->srq_size; + struct nvmet_rdma_srq *nsrq; struct ib_srq *srq; - size_t srq_size; int ret, i; - srq_size = 4095; /* XXX: tune */ + nsrq = kzalloc(sizeof(*nsrq), GFP_KERNEL); + if (!nsrq) + return ERR_PTR(-ENOMEM); srq_attr.attr.max_wr = srq_size; - srq_attr.attr.max_sge = 2; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; srq_attr.attr.srq_limit = 0; srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto out_free; + } + + nsrq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(nsrq->cmds)) { + ret = PTR_ERR(nsrq->cmds); + goto out_destroy_srq; + } + + nsrq->srq = srq; + nsrq->ndev = ndev; + + for (i = 0; i < srq_size; i++) { + nsrq->cmds[i].nsrq = nsrq; + ret = nvmet_rdma_post_recv(ndev, &nsrq->cmds[i]); + if (ret) + goto out_free_cmds; + } + + return nsrq; + +out_free_cmds: + nvmet_rdma_free_cmds(ndev, nsrq->cmds, srq_size, false); +out_destroy_srq: + ib_destroy_srq(srq); +out_free: + kfree(nsrq); + return ERR_PTR(ret); +} + +static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev) +{ + int i, ret; + + if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) { /* * If SRQs aren't supported we just go ahead and use normal * non-shared receive queues. @@ -810,22 +1150,29 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) return 0; } - ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); - if (IS_ERR(ndev->srq_cmds)) { - ret = PTR_ERR(ndev->srq_cmds); - goto out_destroy_srq; - } + ndev->srq_size = min(ndev->device->attrs.max_srq_wr, + nvmet_rdma_srq_size); + ndev->srq_count = min(ndev->device->num_comp_vectors, + ndev->device->attrs.max_srq); - ndev->srq = srq; - ndev->srq_size = srq_size; + ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL); + if (!ndev->srqs) + return -ENOMEM; - for (i = 0; i < srq_size; i++) - nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + for (i = 0; i < ndev->srq_count; i++) { + ndev->srqs[i] = nvmet_rdma_init_srq(ndev); + if (IS_ERR(ndev->srqs[i])) { + ret = PTR_ERR(ndev->srqs[i]); + goto err_srq; + } + } return 0; -out_destroy_srq: - ib_destroy_srq(srq); +err_srq: + while (--i >= 0) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + kfree(ndev->srqs); return ret; } @@ -838,7 +1185,7 @@ static void nvmet_rdma_free_dev(struct kref *ref) list_del(&ndev->entry); mutex_unlock(&device_list_mutex); - nvmet_rdma_destroy_srq(ndev); + nvmet_rdma_destroy_srqs(ndev); ib_dealloc_pd(ndev->pd); kfree(ndev); @@ -847,7 +1194,11 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; + int inline_page_count; + int inline_sge_count; int ret; mutex_lock(&device_list_mutex); @@ -861,6 +1212,26 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; + inline_page_count = num_pages(nport->inline_data_size); + inline_sge_count = max(cm_id->device->attrs.max_sge_rd, + cm_id->device->attrs.max_recv_sge) - 1; + if (inline_page_count > inline_sge_count) { + pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", + nport->inline_data_size, cm_id->device->name, + inline_sge_count * PAGE_SIZE); + nport->inline_data_size = inline_sge_count * PAGE_SIZE; + inline_page_count = inline_sge_count; + } + ndev->inline_data_size = nport->inline_data_size; + ndev->inline_page_count = inline_page_count; + + if (nport->pi_enable && !(cm_id->device->attrs.kernel_cap_flags & + IBK_INTEGRITY_HANDOVER)) { + pr_warn("T10-PI is not supported by device %s. Disabling it\n", + cm_id->device->name); + nport->pi_enable = false; + } + ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -869,7 +1240,7 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) goto out_free_dev; if (nvmet_rdma_use_srq) { - ret = nvmet_rdma_init_srq(ndev); + ret = nvmet_rdma_init_srqs(ndev); if (ret) goto out_free_pd; } @@ -891,25 +1262,17 @@ out_err: static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp_init_attr qp_attr; + struct ib_qp_init_attr qp_attr = { }; struct nvmet_rdma_device *ndev = queue->dev; - int comp_vector, nr_cqe, ret, i; - - /* - * Spread the io queues across completion vectors, - * but still keep all admin queues on vector 0. - */ - comp_vector = !queue->host_qid ? 0 : - queue->idx % ndev->device->num_comp_vectors; + int nr_cqe, ret, i, factor; /* * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. */ nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; - queue->cq = ib_alloc_cq(ndev->device, queue, - nr_cqe + 1, comp_vector, - IB_POLL_WORKQUEUE); + queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); if (IS_ERR(queue->cq)) { ret = PTR_ERR(queue->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -917,7 +1280,6 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) goto out; } - memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_context = queue; qp_attr.event_handler = nvmet_rdma_qp_event; qp_attr.send_cq = queue->cq; @@ -926,23 +1288,29 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; - qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, - ndev->device->attrs.max_sge); + ndev->device->attrs.max_send_sge); - if (ndev->srq) { - qp_attr.srq = ndev->srq; + if (queue->nsrq) { + qp_attr.srq = queue->nsrq->srq; } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; - qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } + if (queue->port->pi_enable && queue->host_qid) + qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); if (ret) { pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } + queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); @@ -950,42 +1318,50 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, qp_attr.cap.max_send_wr, queue->cm_id); - if (!ndev->srq) { + if (!queue->nsrq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; - nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + if (ret) + goto err_destroy_qp; } } out: return ret; +err_destroy_qp: + rdma_destroy_qp(queue->cm_id); err_destroy_cq: - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, nr_cqe + 1); goto out; } static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { - ib_drain_qp(queue->cm_id->qp); - rdma_destroy_qp(queue->cm_id); - ib_free_cq(queue->cq); + ib_drain_qp(queue->qp); + if (queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); } static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) { - pr_info("freeing queue %d\n", queue->idx); + pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_rdma_destroy_queue_ib(queue); - if (!queue->dev->srq) { + if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); } nvmet_rdma_free_rsps(queue); - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); kfree(queue); } @@ -993,15 +1369,10 @@ static void nvmet_rdma_release_queue_work(struct work_struct *w) { struct nvmet_rdma_queue *queue = container_of(w, struct nvmet_rdma_queue, release_work); - struct rdma_cm_id *cm_id = queue->cm_id; struct nvmet_rdma_device *dev = queue->dev; - enum nvmet_rdma_queue_state state = queue->state; nvmet_rdma_free_queue(queue); - if (state != NVMET_RDMA_IN_DEVICE_REMOVAL) - rdma_destroy_id(cm_id); - kref_put(&dev->ref, nvmet_rdma_free_dev); } @@ -1046,7 +1417,8 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); rej.sts = cpu_to_le16(status); - return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); + return rdma_reject(cm_id, (void *)&rej, sizeof(rej), + IB_CM_REJ_CONSUMER_DEFINED); } static struct nvmet_rdma_queue * @@ -1054,6 +1426,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { + struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_queue *queue; int ret; @@ -1063,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, goto out_reject; } - ret = nvmet_sq_init(&queue->nvme_sq); + nvmet_cq_init(&queue->nvme_cq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_queue; @@ -1080,29 +1454,38 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); queue->dev = ndev; queue->cm_id = cm_id; + queue->port = port->nport; spin_lock_init(&queue->state_lock); queue->state = NVMET_RDMA_Q_CONNECTING; INIT_LIST_HEAD(&queue->rsp_wait_list); INIT_LIST_HEAD(&queue->rsp_wr_wait_list); spin_lock_init(&queue->rsp_wr_wait_lock); - INIT_LIST_HEAD(&queue->free_rsps); - spin_lock_init(&queue->rsps_lock); INIT_LIST_HEAD(&queue->queue_list); - queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = NVME_RDMA_CM_NO_RSC; goto out_destroy_sq; } + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + queue->comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + ret = nvmet_rdma_alloc_rsps(queue); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_ida_remove; } - if (!ndev->srq) { + if (ndev->srqs) { + queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count]; + } else { queue->cmds = nvmet_rdma_alloc_cmds(ndev, queue->recv_queue_size, !queue->host_qid); @@ -1123,7 +1506,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, return queue; out_free_cmds: - if (!ndev->srq) { + if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); @@ -1131,10 +1514,11 @@ out_free_cmds: out_free_responses: nvmet_rdma_free_rsps(queue); out_ida_remove: - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: + nvmet_cq_put(&queue->nvme_cq); kfree(queue); out_reject: nvmet_rdma_cm_reject(cm_id, ret); @@ -1149,6 +1533,10 @@ static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) case IB_EVENT_COMM_EST: rdma_notify(queue->cm_id, event->event); break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_debug("received last WQE reached event for queue=0x%p\n", + queue); + break; default: pr_err("received IB QP event: %s (%d)\n", ib_event_msg(event->event), event->event); @@ -1198,16 +1586,32 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = -ENOMEM; goto put_device; } - queue->port = cm_id->context; if (queue->host_qid == 0) { - /* Let inflight controller teardown complete */ - flush_scheduled_work(); + struct nvmet_rdma_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) { + if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl && + q->state == NVMET_RDMA_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + if (pending > NVMET_RDMA_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); - if (ret) - goto release_queue; + if (ret) { + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. + */ + queue->cm_id = NULL; + goto free_queue; + } mutex_lock(&nvmet_rdma_queue_mutex); list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); @@ -1215,7 +1619,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, return 0; -release_queue: +free_queue: nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); @@ -1260,9 +1664,18 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) spin_lock_irqsave(&queue->state_lock, flags); switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *rsp; + + rsp = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, + wait_list); + list_del(&rsp->wait_list); + nvmet_rdma_put_rsp(rsp); + } + fallthrough; case NVMET_RDMA_Q_LIVE: queue->state = NVMET_RDMA_Q_DISCONNECTING; - case NVMET_RDMA_IN_DEVICE_REMOVAL: disconnect = true; break; case NVMET_RDMA_Q_DISCONNECTING: @@ -1272,7 +1685,7 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) if (disconnect) { rdma_disconnect(queue->cm_id); - schedule_work(&queue->release_work); + queue_work(nvmet_wq, &queue->release_work); } } @@ -1302,11 +1715,11 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, mutex_unlock(&nvmet_rdma_queue_mutex); pr_err("failed to connect queue %d\n", queue->idx); - schedule_work(&queue->release_work); + queue_work(nvmet_wq, &queue->release_work); } /** - * nvme_rdma_device_removal() - Handle RDMA device removal + * nvmet_rdma_device_removal() - Handle RDMA device removal * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) * @@ -1318,18 +1731,18 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, * We registered an ib_client to handle device removal for queues, * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying - * the cm_id implicitely by returning a non-zero rc to the callout. + * the cm_id implicitly by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - struct nvmet_port *port; + struct nvmet_rdma_port *port; if (queue) { /* * This is a queue cm_id. we have registered * an ib_client to handle queues removal - * so don't interfear and just return. + * so don't interfere and just return. */ return 0; } @@ -1342,12 +1755,12 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, * cm_id destroy. use atomic xchg to make sure * we don't compete with remove_port. */ - if (xchg(&port->priv, NULL) != cm_id) + if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* * We need to return 1 so that the core will destroy - * it's own ID. What a great API design.. + * its own ID. What a great API design.. */ return 1; } @@ -1373,15 +1786,16 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + queue_delayed_work(nvmet_wq, &port->repair_work, 0); + break; + } + fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - /* - * We might end up here when we already freed the qp - * which means queue release sequence is in progress, - * so don't get in the way... - */ - if (queue) - nvmet_rdma_queue_disconnect(queue); + nvmet_rdma_queue_disconnect(queue); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: ret = nvmet_rdma_device_removal(cm_id, queue); @@ -1389,7 +1803,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: pr_debug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: nvmet_rdma_queue_connect_fail(cm_id, queue); @@ -1405,49 +1819,55 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *n; -restart: mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { - if (queue->nvme_sq.ctrl == ctrl) { - list_del_init(&queue->queue_list); - mutex_unlock(&nvmet_rdma_queue_mutex); - - __nvmet_rdma_queue_disconnect(queue); - goto restart; - } + list_for_each_entry_safe(queue, n, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl != ctrl) + continue; + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); } -static int nvmet_rdma_add_port(struct nvmet_port *port) +static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port) { - struct rdma_cm_id *cm_id; - struct sockaddr_storage addr = { }; - __kernel_sa_family_t af; - int ret; + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_port *nport = port->nport; - switch (port->disc_addr.adrfam) { - case NVMF_ADDR_FAMILY_IP4: - af = AF_INET; - break; - case NVMF_ADDR_FAMILY_IP6: - af = AF_INET6; - break; - default: - pr_err("address family %d not supported\n", - port->disc_addr.adrfam); - return -EINVAL; - } + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->port != nport) + continue; - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, - port->disc_addr.trsvcid, &addr); - if (ret) { - pr_err("malformed ip/port passed: %s:%s\n", - port->disc_addr.traddr, port->disc_addr.trsvcid); - return ret; + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) +{ + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); + + if (cm_id) + rdma_destroy_id(cm_id); + + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. Do it here after the RDMA-CM was destroyed + * guarantees that no new queue will be created. + */ + nvmet_rdma_destroy_port_queues(port); +} + +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1466,23 +1886,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) goto out_destroy_id; } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("binding CM ID to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - ret = rdma_listen(cm_id, 128); + ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG); if (ret) { - pr_err("listening to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpcs)\n", - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); - port->priv = cm_id; + port->cm_id = cm_id; return 0; out_destroy_id: @@ -1490,51 +1906,192 @@ out_destroy_id: return ret; } -static void nvmet_rdma_remove_port(struct nvmet_port *port) +static void nvmet_rdma_repair_port_work(struct work_struct *w) { - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; - if (cm_id) - rdma_destroy_id(cm_id); + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + queue_delayed_work(nvmet_wq, &port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + nport->priv = port; + port->nport = nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + if (nport->max_queue_size < 0) { + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { + pr_warn("max_queue_size %u is too large, reducing to %u\n", + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(nport->disc_addr.portid), + (struct sockaddr *)&port->addr); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); } -static struct nvmet_fabrics_ops nvmet_rdma_ops = { +static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *nport, char *traddr) +{ + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; + + if (inet_addr_is_any(&cm_id->route.addr.src_addr)) { + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; + struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr; + + sprintf(traddr, "%pISc", addr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static ssize_t nvmet_rdma_host_port_addr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len) +{ + struct nvmet_sq *nvme_sq = ctrl->sqs[0]; + struct nvmet_rdma_queue *queue = + container_of(nvme_sq, struct nvmet_rdma_queue, nvme_sq); + + return snprintf(traddr, traddr_len, "%pISc", + (struct sockaddr *)&queue->cm_id->route.addr.dst_addr); +} + +static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +{ + if (ctrl->pi_support) + return NVMET_RDMA_MAX_METADATA_MDTS; + return NVMET_RDMA_MAX_MDTS; +} + +static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) +{ + if (ctrl->pi_support) + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; + return NVME_RDMA_MAX_QUEUE_SIZE; +} + +static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, - .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, .msdbd = 1, - .has_keyed_sgls = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, + .disc_traddr = nvmet_rdma_disc_port_addr, + .host_traddr = nvmet_rdma_host_port_addr, + .get_mdts = nvmet_rdma_get_mdts, + .get_max_queue_size = nvmet_rdma_get_max_queue_size, }; -static void nvmet_rdma_add_one(struct ib_device *ib_device) -{ -} - static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_rdma_device *ndev; + bool found = false; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device == ib_device) { + found = true; + break; + } + } + mutex_unlock(&device_list_mutex); - /* Device is being removed, delete all queues using this device */ + if (!found) + return; + + /* + * IB Device that is used by nvmet controllers is being removed, + * delete all queues using this device. + */ mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { if (queue->dev->device != ib_device) continue; pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); - flush_scheduled_work(); + flush_workqueue(nvmet_wq); } static struct ib_client nvmet_rdma_ib_client = { .name = "nvmet_rdma", - .add = nvmet_rdma_add_one, .remove = nvmet_rdma_remove_one }; @@ -1559,30 +2116,15 @@ err_ib_client: static void __exit nvmet_rdma_exit(void) { - struct nvmet_rdma_queue *queue; - nvmet_unregister_transport(&nvmet_rdma_ops); - - flush_scheduled_work(); - - mutex_lock(&nvmet_rdma_queue_mutex); - while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, - struct nvmet_rdma_queue, queue_list))) { - list_del_init(&queue->queue_list); - - mutex_unlock(&nvmet_rdma_queue_mutex); - __nvmet_rdma_queue_disconnect(queue); - mutex_lock(&nvmet_rdma_queue_mutex); - } - mutex_unlock(&nvmet_rdma_queue_mutex); - - flush_scheduled_work(); ib_unregister_client(&nvmet_rdma_ib_client); + WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); ida_destroy(&nvmet_rdma_queue_ida); } module_init(nvmet_rdma_init); module_exit(nvmet_rdma_exit); +MODULE_DESCRIPTION("NVMe target RDMA transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ |
