diff options
Diffstat (limited to 'drivers/nvme/target/rdma.c')
| -rw-r--r-- | drivers/nvme/target/rdma.c | 962 |
1 files changed, 703 insertions, 259 deletions
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index a8d23eb80192..9c12b2361a6d 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1,18 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA target. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> +#include <linux/blk-integrity.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/err.h> @@ -23,11 +16,12 @@ #include <linux/string.h> #include <linux/wait.h> #include <linux/inet.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> +#include <rdma/ib_cm.h> #include <linux/nvme-rdma.h> #include "nvmet.h" @@ -39,6 +33,16 @@ #define NVMET_RDMA_MAX_INLINE_SGE 4 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) +/* Assume mpsmin == device_page_size == 4KB */ +#define NVMET_RDMA_MAX_MDTS 8 +#define NVMET_RDMA_MAX_METADATA_MDTS 5 + +#define NVMET_RDMA_BACKLOG 128 + +#define NVMET_RDMA_DISCRETE_RSP_TAG -1 + +struct nvmet_rdma_srq; + struct nvmet_rdma_cmd { struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; @@ -46,11 +50,11 @@ struct nvmet_rdma_cmd { struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; + struct nvmet_rdma_srq *nsrq; }; enum { NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), - NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), }; struct nvmet_rdma_rsp { @@ -62,6 +66,7 @@ struct nvmet_rdma_rsp { struct nvmet_rdma_queue *queue; struct ib_cqe read_cqe; + struct ib_cqe write_cqe; struct rdma_rw_ctx rw; struct nvmet_req req; @@ -72,7 +77,7 @@ struct nvmet_rdma_rsp { u32 invalidate_rkey; struct list_head wait_list; - struct list_head free_list; + int tag; }; enum nvmet_rdma_queue_state { @@ -83,18 +88,19 @@ enum nvmet_rdma_queue_state { struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; + struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; struct nvmet_rdma_device *dev; + struct nvmet_rdma_srq *nsrq; spinlock_t state_lock; enum nvmet_rdma_queue_state state; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct nvmet_rdma_rsp *rsps; - struct list_head free_rsps; - spinlock_t rsps_lock; + struct sbitmap rsp_tags; struct nvmet_rdma_cmd *cmds; struct work_struct release_work; @@ -104,17 +110,31 @@ struct nvmet_rdma_queue { int idx; int host_qid; + int comp_vector; int recv_queue_size; int send_queue_size; struct list_head queue_list; }; +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + struct delayed_work repair_work; +}; + +struct nvmet_rdma_srq { + struct ib_srq *srq; + struct nvmet_rdma_cmd *cmds; + struct nvmet_rdma_device *ndev; +}; + struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; - struct ib_srq *srq; - struct nvmet_rdma_cmd *srq_cmds; + struct nvmet_rdma_srq **srqs; + int srq_count; size_t srq_size; struct kref ref; struct list_head entry; @@ -126,6 +146,16 @@ static bool nvmet_rdma_use_srq; module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); MODULE_PARM_DESC(use_srq, "Use shared receive queue."); +static int srq_size_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops srq_size_ops = { + .set = srq_size_set, + .get = param_get_int, +}; + +static int nvmet_rdma_srq_size = 1024; +module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644); +MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)"); + static DEFINE_IDA(nvmet_rdma_queue_ida); static LIST_HEAD(nvmet_rdma_queue_list); static DEFINE_MUTEX(nvmet_rdma_queue_mutex); @@ -137,20 +167,31 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc); static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r); +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r, + int tag); static const struct nvmet_fabrics_ops nvmet_rdma_ops; -static int num_pages(int len) +static int srq_size_set(const char *val, const struct kernel_param *kp) { - return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < 256) + return -EINVAL; + + return param_set_int(val, kp); } -/* XXX: really should move to a generic header sooner or later.. */ -static inline u32 get_unaligned_le24(const u8 *p) +static int num_pages(int len) { - return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); } static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) @@ -164,28 +205,32 @@ static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && rsp->req.transfer_len && - !rsp->req.rsp->status && + !rsp->req.cqe->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline struct nvmet_rdma_rsp * nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) { - struct nvmet_rdma_rsp *rsp; - unsigned long flags; + struct nvmet_rdma_rsp *rsp = NULL; + int tag; - spin_lock_irqsave(&queue->rsps_lock, flags); - rsp = list_first_entry_or_null(&queue->free_rsps, - struct nvmet_rdma_rsp, free_list); - if (likely(rsp)) - list_del(&rsp->free_list); - spin_unlock_irqrestore(&queue->rsps_lock, flags); + tag = sbitmap_get(&queue->rsp_tags); + if (tag >= 0) + rsp = &queue->rsps[tag]; if (unlikely(!rsp)) { - rsp = kmalloc(sizeof(*rsp), GFP_KERNEL); + int ret; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); if (unlikely(!rsp)) return NULL; - rsp->allocated = true; + ret = nvmet_rdma_alloc_rsp(queue->dev, rsp, + NVMET_RDMA_DISCRETE_RSP_TAG); + if (unlikely(ret)) { + kfree(rsp); + return NULL; + } } return rsp; @@ -194,16 +239,13 @@ nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) static inline void nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { - unsigned long flags; - - if (unlikely(rsp->allocated)) { + if (unlikely(rsp->tag == NVMET_RDMA_DISCRETE_RSP_TAG)) { + nvmet_rdma_free_rsp(rsp->queue->dev, rsp); kfree(rsp); return; } - spin_lock_irqsave(&rsp->queue->rsps_lock, flags); - list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); - spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); + sbitmap_clear_bit(&rsp->queue->rsp_tags, rsp->tag); } static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, @@ -325,7 +367,7 @@ nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmds; int ret = -EINVAL, i; - cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + cmds = kvcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); if (!cmds) goto out; @@ -340,7 +382,7 @@ nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, out_free: while (--i >= 0) nvmet_rdma_free_cmd(ndev, cmds + i, admin); - kfree(cmds); + kvfree(cmds); out: return ERR_PTR(ret); } @@ -352,23 +394,25 @@ static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, for (i = 0; i < nr_cmds; i++) nvmet_rdma_free_cmd(ndev, cmds + i, admin); - kfree(cmds); + kvfree(cmds); } static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, - struct nvmet_rdma_rsp *r) + struct nvmet_rdma_rsp *r, int tag) { /* NVMe CQE / RDMA SEND */ - r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); - if (!r->req.rsp) + r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL); + if (!r->req.cqe) goto out; - r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, - sizeof(*r->req.rsp), DMA_TO_DEVICE); + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe, + sizeof(*r->req.cqe), DMA_TO_DEVICE); if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) goto out_free_rsp; - r->send_sge.length = sizeof(*r->req.rsp); + if (ib_dma_pci_p2p_dma_supported(ndev->device)) + r->req.p2p_client = &ndev->device->dev; + r->send_sge.length = sizeof(*r->req.cqe); r->send_sge.lkey = ndev->pd->local_dma_lkey; r->send_cqe.done = nvmet_rdma_send_done; @@ -380,10 +424,14 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, /* Data In / RDMA READ */ r->read_cqe.done = nvmet_rdma_read_data_done; + /* Data Out / RDMA WRITE */ + r->write_cqe.done = nvmet_rdma_write_data_done; + r->tag = tag; + return 0; out_free_rsp: - kfree(r->req.rsp); + kfree(r->req.cqe); out: return -ENOMEM; } @@ -392,8 +440,8 @@ static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, struct nvmet_rdma_rsp *r) { ib_dma_unmap_single(ndev->device, r->send_sge.addr, - sizeof(*r->req.rsp), DMA_TO_DEVICE); - kfree(r->req.rsp); + sizeof(*r->req.cqe), DMA_TO_DEVICE); + kfree(r->req.cqe); } static int @@ -401,33 +449,33 @@ nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) { struct nvmet_rdma_device *ndev = queue->dev; int nr_rsps = queue->recv_queue_size * 2; - int ret = -EINVAL, i; + int ret = -ENOMEM, i; - queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + if (sbitmap_init_node(&queue->rsp_tags, nr_rsps, -1, GFP_KERNEL, + NUMA_NO_NODE, false, true)) + goto out; + + queue->rsps = kvcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), GFP_KERNEL); if (!queue->rsps) - goto out; + goto out_free_sbitmap; for (i = 0; i < nr_rsps; i++) { struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - ret = nvmet_rdma_alloc_rsp(ndev, rsp); + ret = nvmet_rdma_alloc_rsp(ndev, rsp, i); if (ret) goto out_free; - - list_add_tail(&rsp->free_list, &queue->free_rsps); } return 0; out_free: - while (--i >= 0) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } - kfree(queue->rsps); + while (--i >= 0) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); + kvfree(queue->rsps); +out_free_sbitmap: + sbitmap_free(&queue->rsp_tags); out: return ret; } @@ -437,13 +485,10 @@ static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) struct nvmet_rdma_device *ndev = queue->dev; int i, nr_rsps = queue->recv_queue_size * 2; - for (i = 0; i < nr_rsps; i++) { - struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; - - list_del(&rsp->free_list); - nvmet_rdma_free_rsp(ndev, rsp); - } - kfree(queue->rsps); + for (i = 0; i < nr_rsps; i++) + nvmet_rdma_free_rsp(ndev, &queue->rsps[i]); + kvfree(queue->rsps); + sbitmap_free(&queue->rsp_tags); } static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, @@ -455,10 +500,10 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); - if (ndev->srq) - ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL); + if (cmd->nsrq) + ret = ib_post_srq_recv(cmd->nsrq->srq, &cmd->wr, NULL); else - ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL); + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); if (unlikely(ret)) pr_err("post_recv cmd failed\n"); @@ -489,6 +534,129 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) spin_unlock(&queue->rsp_wr_wait_lock); } +static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + u16 status = 0; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + return NVME_SC_INVALID_PI; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + status = NVME_SC_GUARD_CHECK; + break; + case IB_SIG_BAD_REFTAG: + status = NVME_SC_REFTAG_CHECK; + break; + case IB_SIG_BAD_APPTAG: + status = NVME_SC_APPTAG_CHECK; + break; + } + pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + } + + return status; +} + +static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_domain *domain, + u16 control, u8 pi_type) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = 1 << bi->interval_exp; + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; + + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); + domain->sig.dif.app_escape = true; + if (pi_type == NVME_NS_DPS_PI_TYPE3) + domain->sig.dif.ref_escape = true; +} + +static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req, + struct ib_sig_attrs *sig_attrs) +{ + struct nvme_command *cmd = req->cmd; + u16 control = le16_to_cpu(cmd->rw.control); + u8 pi_type = req->ns->pi_type; + struct blk_integrity *bi; + + bi = bdev_get_integrity(req->ns->bdev); + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + + if (control & NVME_RW_PRINFO_PRACT) { + /* for WRITE_INSERT/READ_STRIP no wire domain */ + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + /* Clear the PRACT bit since HCA will generate/verify the PI */ + control &= ~NVME_RW_PRINFO_PRACT; + cmd->rw.control = cpu_to_le16(control); + /* PI is added by the HW */ + req->transfer_len += req->metadata_len; + } else { + /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + } + + if (control & NVME_RW_PRINFO_PRCHK_REF) + sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; + if (control & NVME_RW_PRINFO_PRCHK_GUARD) + sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; + if (control & NVME_RW_PRINFO_PRCHK_APP) + sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; +} + +static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key, + struct ib_sig_attrs *sig_attrs) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + int ret; + + if (req->metadata_len) + ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, sig_attrs, + addr, key, nvmet_data_dir(req)); + else + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, 0, addr, key, + nvmet_data_dir(req)); + + return ret; +} + +static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + + if (req->metadata_len) + rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, + nvmet_data_dir(req)); + else + rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, nvmet_data_dir(req)); +} static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) { @@ -496,14 +664,11 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); - if (rsp->n_rdma) { - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, - queue->cm_id->port_num, rsp->req.sg, - rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); - } + if (rsp->n_rdma) + nvmet_rdma_rw_ctx_destroy(rsp); if (rsp->req.sg != rsp->cmd->inline_sg) - nvmet_req_free_sgl(&rsp->req); + nvmet_req_free_sgls(&rsp->req); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) nvmet_rdma_process_wr_wait_list(queue); @@ -529,7 +694,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; nvmet_rdma_release_rsp(rsp); @@ -548,18 +713,23 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct ib_send_wr *first_wr; - if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + if (rsp->invalidate_rkey) { rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; } else { rsp->send_wr.opcode = IB_WR_SEND; } - if (nvmet_rdma_need_data_out(rsp)) - first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, - cm_id->port_num, NULL, &rsp->send_wr); - else + if (nvmet_rdma_need_data_out(rsp)) { + if (rsp->req.metadata_len) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, &rsp->write_cqe, NULL); + else + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + } else { first_wr = &rsp->send_wr; + } nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); @@ -577,16 +747,15 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + u16 status = 0; WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, - queue->cm_id->port_num, rsp->req.sg, - rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); rsp->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); nvmet_req_uninit(&rsp->req); nvmet_rdma_release_rsp(rsp); if (wc->status != IB_WC_WR_FLUSH_ERR) { @@ -597,7 +766,57 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - nvmet_req_execute(&rsp->req); + if (rsp->req.metadata_len) + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(status)) + nvmet_req_complete(&rsp->req, status); + else + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u16 status; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); + nvmet_req_uninit(&rsp->req); + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA WRITE for CQE failed with status %s (%d).\n", + ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + /* + * Upon RDMA completion check the signature status + * - if succeeded send good NVMe response + * - if failed send bad NVMe response with appropriate error + */ + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + if (unlikely(status)) + rsp->req.cqe->status = cpu_to_le16(status << 1); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, @@ -633,12 +852,12 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) if (!nvme_is_write(rsp->req.cmd)) { rsp->req.error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); - return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; } /* no data command? */ @@ -654,9 +873,9 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, struct nvme_keyed_sgl_desc *sgl, bool invalidate) { - struct rdma_cm_id *cm_id = rsp->queue->cm_id; u64 addr = le64_to_cpu(sgl->addr); u32 key = get_unaligned_le32(sgl->key); + struct ib_sig_attrs sig_attrs; int ret; rsp->req.transfer_len = get_unaligned_le24(sgl->length); @@ -665,21 +884,20 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, if (!rsp->req.transfer_len) return 0; - ret = nvmet_req_alloc_sgl(&rsp->req); - if (ret < 0) + if (rsp->req.metadata_len) + nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs); + + ret = nvmet_req_alloc_sgls(&rsp->req); + if (unlikely(ret < 0)) goto error_out; - ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, - rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, - nvmet_data_dir(&rsp->req)); - if (ret < 0) + ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs); + if (unlikely(ret < 0)) goto error_out; rsp->n_rdma += ret; - if (invalidate) { + if (invalidate) rsp->invalidate_rkey = key; - rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; - } return 0; @@ -701,7 +919,7 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: switch (sgl->type & 0xf) { @@ -713,12 +931,12 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR; } } @@ -736,11 +954,11 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) } if (nvmet_rdma_need_data_in(rsp)) { - if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { - nvmet_req_execute(&rsp->req); + rsp->req.execute(&rsp->req); } return true; @@ -758,10 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, cmd->send_sge.addr, cmd->send_sge.length, DMA_TO_DEVICE); - cmd->req.p2p_client = &queue->dev->device->dev; - - if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_rdma_ops)) + if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops)) return; status = nvmet_rdma_map_sgl(cmd); @@ -780,11 +995,32 @@ out_err: nvmet_req_complete(&cmd->req, status); } +static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&queue->state_lock, flags); + /* + * recheck queue state is not live to prevent a race condition + * with RDMA_CM_EVENT_ESTABLISHED handler. + */ + if (queue->state == NVMET_RDMA_Q_LIVE) + ret = false; + else if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return ret; +} + static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct nvmet_rdma_rsp *rsp; if (unlikely(wc->status != IB_WC_SUCCESS)) { @@ -820,39 +1056,49 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) rsp->req.cmd = cmd->nvme_cmd; rsp->req.port = queue->port; rsp->n_rdma = 0; + rsp->invalidate_rkey = 0; - if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { - unsigned long flags; - - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state == NVMET_RDMA_Q_CONNECTING) - list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); - else - nvmet_rdma_put_rsp(rsp); - spin_unlock_irqrestore(&queue->state_lock, flags); + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) && + nvmet_rdma_recv_not_live(queue, rsp)) return; - } nvmet_rdma_handle_command(queue, rsp); } -static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_srq *nsrq) +{ + nvmet_rdma_free_cmds(nsrq->ndev, nsrq->cmds, nsrq->ndev->srq_size, + false); + ib_destroy_srq(nsrq->srq); + + kfree(nsrq); +} + +static void nvmet_rdma_destroy_srqs(struct nvmet_rdma_device *ndev) { - if (!ndev->srq) + int i; + + if (!ndev->srqs) return; - nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); - ib_destroy_srq(ndev->srq); + for (i = 0; i < ndev->srq_count; i++) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + + kfree(ndev->srqs); } -static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +static struct nvmet_rdma_srq * +nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) { struct ib_srq_init_attr srq_attr = { NULL, }; + size_t srq_size = ndev->srq_size; + struct nvmet_rdma_srq *nsrq; struct ib_srq *srq; - size_t srq_size; int ret, i; - srq_size = 4095; /* XXX: tune */ + nsrq = kzalloc(sizeof(*nsrq), GFP_KERNEL); + if (!nsrq) + return ERR_PTR(-ENOMEM); srq_attr.attr.max_wr = srq_size; srq_attr.attr.max_sge = 1 + ndev->inline_page_count; @@ -860,35 +1106,73 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); if (IS_ERR(srq)) { - /* - * If SRQs aren't supported we just go ahead and use normal - * non-shared receive queues. - */ - pr_info("SRQ requested but not supported.\n"); - return 0; + ret = PTR_ERR(srq); + goto out_free; } - ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); - if (IS_ERR(ndev->srq_cmds)) { - ret = PTR_ERR(ndev->srq_cmds); + nsrq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(nsrq->cmds)) { + ret = PTR_ERR(nsrq->cmds); goto out_destroy_srq; } - ndev->srq = srq; - ndev->srq_size = srq_size; + nsrq->srq = srq; + nsrq->ndev = ndev; for (i = 0; i < srq_size; i++) { - ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + nsrq->cmds[i].nsrq = nsrq; + ret = nvmet_rdma_post_recv(ndev, &nsrq->cmds[i]); if (ret) goto out_free_cmds; } - return 0; + return nsrq; out_free_cmds: - nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + nvmet_rdma_free_cmds(ndev, nsrq->cmds, srq_size, false); out_destroy_srq: ib_destroy_srq(srq); +out_free: + kfree(nsrq); + return ERR_PTR(ret); +} + +static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev) +{ + int i, ret; + + if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_size = min(ndev->device->attrs.max_srq_wr, + nvmet_rdma_srq_size); + ndev->srq_count = min(ndev->device->num_comp_vectors, + ndev->device->attrs.max_srq); + + ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL); + if (!ndev->srqs) + return -ENOMEM; + + for (i = 0; i < ndev->srq_count; i++) { + ndev->srqs[i] = nvmet_rdma_init_srq(ndev); + if (IS_ERR(ndev->srqs[i])) { + ret = PTR_ERR(ndev->srqs[i]); + goto err_srq; + } + } + + return 0; + +err_srq: + while (--i >= 0) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + kfree(ndev->srqs); return ret; } @@ -901,7 +1185,7 @@ static void nvmet_rdma_free_dev(struct kref *ref) list_del(&ndev->entry); mutex_unlock(&device_list_mutex); - nvmet_rdma_destroy_srq(ndev); + nvmet_rdma_destroy_srqs(ndev); ib_dealloc_pd(ndev->pd); kfree(ndev); @@ -910,7 +1194,8 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { - struct nvmet_port *port = cm_id->context; + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; int inline_page_count; int inline_sge_count; @@ -927,18 +1212,26 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; - inline_page_count = num_pages(port->inline_data_size); + inline_page_count = num_pages(nport->inline_data_size); inline_sge_count = max(cm_id->device->attrs.max_sge_rd, cm_id->device->attrs.max_recv_sge) - 1; if (inline_page_count > inline_sge_count) { pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", - port->inline_data_size, cm_id->device->name, + nport->inline_data_size, cm_id->device->name, inline_sge_count * PAGE_SIZE); - port->inline_data_size = inline_sge_count * PAGE_SIZE; + nport->inline_data_size = inline_sge_count * PAGE_SIZE; inline_page_count = inline_sge_count; } - ndev->inline_data_size = port->inline_data_size; + ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; + + if (nport->pi_enable && !(cm_id->device->attrs.kernel_cap_flags & + IBK_INTEGRITY_HANDOVER)) { + pr_warn("T10-PI is not supported by device %s. Disabling it\n", + cm_id->device->name); + nport->pi_enable = false; + } + ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -947,7 +1240,7 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) goto out_free_dev; if (nvmet_rdma_use_srq) { - ret = nvmet_rdma_init_srq(ndev); + ret = nvmet_rdma_init_srqs(ndev); if (ret) goto out_free_pd; } @@ -969,25 +1262,17 @@ out_err: static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp_init_attr qp_attr; + struct ib_qp_init_attr qp_attr = { }; struct nvmet_rdma_device *ndev = queue->dev; - int comp_vector, nr_cqe, ret, i; - - /* - * Spread the io queues across completion vectors, - * but still keep all admin queues on vector 0. - */ - comp_vector = !queue->host_qid ? 0 : - queue->idx % ndev->device->num_comp_vectors; + int nr_cqe, ret, i, factor; /* * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. */ nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; - queue->cq = ib_alloc_cq(ndev->device, queue, - nr_cqe + 1, comp_vector, - IB_POLL_WORKQUEUE); + queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); if (IS_ERR(queue->cq)) { ret = PTR_ERR(queue->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -995,7 +1280,6 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) goto out; } - memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_context = queue; qp_attr.event_handler = nvmet_rdma_qp_event; qp_attr.send_cq = queue->cq; @@ -1004,23 +1288,29 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; - qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, ndev->device->attrs.max_send_sge); - if (ndev->srq) { - qp_attr.srq = ndev->srq; + if (queue->nsrq) { + qp_attr.srq = queue->nsrq->srq; } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } + if (queue->port->pi_enable && queue->host_qid) + qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); if (ret) { pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } + queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); @@ -1028,7 +1318,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, qp_attr.cap.max_send_wr, queue->cm_id); - if (!ndev->srq) { + if (!queue->nsrq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); @@ -1043,18 +1333,18 @@ out: err_destroy_qp: rdma_destroy_qp(queue->cm_id); err_destroy_cq: - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, nr_cqe + 1); goto out; } static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp *qp = queue->cm_id->qp; - - ib_drain_qp(qp); - rdma_destroy_id(queue->cm_id); - ib_destroy_qp(qp); - ib_free_cq(queue->cq); + ib_drain_qp(queue->qp); + if (queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); } static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) @@ -1062,15 +1352,16 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_rdma_destroy_queue_ib(queue); - if (!queue->dev->srq) { + if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); } nvmet_rdma_free_rsps(queue); - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); kfree(queue); } @@ -1126,7 +1417,8 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); rej.sts = cpu_to_le16(status); - return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); + return rdma_reject(cm_id, (void *)&rej, sizeof(rej), + IB_CM_REJ_CONSUMER_DEFINED); } static struct nvmet_rdma_queue * @@ -1134,6 +1426,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { + struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_queue *queue; int ret; @@ -1143,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, goto out_reject; } - ret = nvmet_sq_init(&queue->nvme_sq); + nvmet_cq_init(&queue->nvme_cq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_queue; @@ -1160,29 +1454,38 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); queue->dev = ndev; queue->cm_id = cm_id; + queue->port = port->nport; spin_lock_init(&queue->state_lock); queue->state = NVMET_RDMA_Q_CONNECTING; INIT_LIST_HEAD(&queue->rsp_wait_list); INIT_LIST_HEAD(&queue->rsp_wr_wait_list); spin_lock_init(&queue->rsp_wr_wait_lock); - INIT_LIST_HEAD(&queue->free_rsps); - spin_lock_init(&queue->rsps_lock); INIT_LIST_HEAD(&queue->queue_list); - queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = NVME_RDMA_CM_NO_RSC; goto out_destroy_sq; } + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + queue->comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + ret = nvmet_rdma_alloc_rsps(queue); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_ida_remove; } - if (!ndev->srq) { + if (ndev->srqs) { + queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count]; + } else { queue->cmds = nvmet_rdma_alloc_cmds(ndev, queue->recv_queue_size, !queue->host_qid); @@ -1203,7 +1506,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, return queue; out_free_cmds: - if (!ndev->srq) { + if (!queue->nsrq) { nvmet_rdma_free_cmds(queue->dev, queue->cmds, queue->recv_queue_size, !queue->host_qid); @@ -1211,10 +1514,11 @@ out_free_cmds: out_free_responses: nvmet_rdma_free_rsps(queue); out_ida_remove: - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: + nvmet_cq_put(&queue->nvme_cq); kfree(queue); out_reject: nvmet_rdma_cm_reject(cm_id, ret); @@ -1229,6 +1533,10 @@ static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) case IB_EVENT_COMM_EST: rdma_notify(queue->cm_id, event->event); break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_debug("received last WQE reached event for queue=0x%p\n", + queue); + break; default: pr_err("received IB QP event: %s (%d)\n", ib_event_msg(event->event), event->event); @@ -1278,18 +1586,31 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = -ENOMEM; goto put_device; } - queue->port = cm_id->context; if (queue->host_qid == 0) { - /* Let inflight controller teardown complete */ - flush_scheduled_work(); + struct nvmet_rdma_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) { + if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl && + q->state == NVMET_RDMA_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + if (pending > NVMET_RDMA_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) { - schedule_work(&queue->release_work); - /* Destroying rdma_cm id is not needed here */ - return 0; + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. + */ + queue->cm_id = NULL; + goto free_queue; } mutex_lock(&nvmet_rdma_queue_mutex); @@ -1298,6 +1619,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, return 0; +free_queue: + nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); @@ -1341,6 +1664,16 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) spin_lock_irqsave(&queue->state_lock, flags); switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *rsp; + + rsp = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, + wait_list); + list_del(&rsp->wait_list); + nvmet_rdma_put_rsp(rsp); + } + fallthrough; case NVMET_RDMA_Q_LIVE: queue->state = NVMET_RDMA_Q_DISCONNECTING; disconnect = true; @@ -1352,7 +1685,7 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) if (disconnect) { rdma_disconnect(queue->cm_id); - schedule_work(&queue->release_work); + queue_work(nvmet_wq, &queue->release_work); } } @@ -1382,11 +1715,11 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, mutex_unlock(&nvmet_rdma_queue_mutex); pr_err("failed to connect queue %d\n", queue->idx); - schedule_work(&queue->release_work); + queue_work(nvmet_wq, &queue->release_work); } /** - * nvme_rdma_device_removal() - Handle RDMA device removal + * nvmet_rdma_device_removal() - Handle RDMA device removal * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) * @@ -1398,18 +1731,18 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, * We registered an ib_client to handle device removal for queues, * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying - * the cm_id implicitely by returning a non-zero rc to the callout. + * the cm_id implicitly by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - struct nvmet_port *port; + struct nvmet_rdma_port *port; if (queue) { /* * This is a queue cm_id. we have registered * an ib_client to handle queues removal - * so don't interfear and just return. + * so don't interfere and just return. */ return 0; } @@ -1422,12 +1755,12 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, * cm_id destroy. use atomic xchg to make sure * we don't compete with remove_port. */ - if (xchg(&port->priv, NULL) != cm_id) + if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* * We need to return 1 so that the core will destroy - * it's own ID. What a great API design.. + * its own ID. What a great API design.. */ return 1; } @@ -1453,6 +1786,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + queue_delayed_work(nvmet_wq, &port->repair_work, 0); + break; + } + fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1463,7 +1803,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: pr_debug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: nvmet_rdma_queue_connect_fail(cm_id, queue); @@ -1479,58 +1819,55 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *n; -restart: mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { - if (queue->nvme_sq.ctrl == ctrl) { - list_del_init(&queue->queue_list); - mutex_unlock(&nvmet_rdma_queue_mutex); - - __nvmet_rdma_queue_disconnect(queue); - goto restart; - } + list_for_each_entry_safe(queue, n, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl != ctrl) + continue; + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); } -static int nvmet_rdma_add_port(struct nvmet_port *port) +static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port) { - struct rdma_cm_id *cm_id; - struct sockaddr_storage addr = { }; - __kernel_sa_family_t af; - int ret; + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_port *nport = port->nport; - switch (port->disc_addr.adrfam) { - case NVMF_ADDR_FAMILY_IP4: - af = AF_INET; - break; - case NVMF_ADDR_FAMILY_IP6: - af = AF_INET6; - break; - default: - pr_err("address family %d not supported\n", - port->disc_addr.adrfam); - return -EINVAL; - } + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->port != nport) + continue; - if (port->inline_data_size < 0) { - port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; - } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { - pr_warn("inline_data_size %u is too large, reducing to %u\n", - port->inline_data_size, - NVMET_RDMA_MAX_INLINE_DATA_SIZE); - port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); } + mutex_unlock(&nvmet_rdma_queue_mutex); +} - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, - port->disc_addr.trsvcid, &addr); - if (ret) { - pr_err("malformed ip/port passed: %s:%s\n", - port->disc_addr.traddr, port->disc_addr.trsvcid); - return ret; - } +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) +{ + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); + + if (cm_id) + rdma_destroy_id(cm_id); + + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. Do it here after the RDMA-CM was destroyed + * guarantees that no new queue will be created. + */ + nvmet_rdma_destroy_port_queues(port); +} + +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1549,23 +1886,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) goto out_destroy_id; } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("binding CM ID to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - ret = rdma_listen(cm_id, 128); + ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG); if (ret) { - pr_err("listening to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpcs)\n", - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); - port->priv = cm_id; + port->cm_id = cm_id; return 0; out_destroy_id: @@ -1573,20 +1906,102 @@ out_destroy_id: return ret; } -static void nvmet_rdma_remove_port(struct nvmet_port *port) +static void nvmet_rdma_repair_port_work(struct work_struct *w) { - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; - if (cm_id) - rdma_destroy_id(cm_id); + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + queue_delayed_work(nvmet_wq, &port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + nport->priv = port; + port->nport = nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + if (nport->max_queue_size < 0) { + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { + pr_warn("max_queue_size %u is too large, reducing to %u\n", + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(nport->disc_addr.portid), + (struct sockaddr *)&port->addr); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); } static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, - struct nvmet_port *port, char *traddr) + struct nvmet_port *nport, char *traddr) { - struct rdma_cm_id *cm_id = port->priv; + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; - if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { + if (inet_addr_is_any(&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; @@ -1594,20 +2009,48 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, sprintf(traddr, "%pISc", addr); } else { - memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } +static ssize_t nvmet_rdma_host_port_addr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len) +{ + struct nvmet_sq *nvme_sq = ctrl->sqs[0]; + struct nvmet_rdma_queue *queue = + container_of(nvme_sq, struct nvmet_rdma_queue, nvme_sq); + + return snprintf(traddr, traddr_len, "%pISc", + (struct sockaddr *)&queue->cm_id->route.addr.dst_addr); +} + +static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +{ + if (ctrl->pi_support) + return NVMET_RDMA_MAX_METADATA_MDTS; + return NVMET_RDMA_MAX_MDTS; +} + +static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) +{ + if (ctrl->pi_support) + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; + return NVME_RDMA_MAX_QUEUE_SIZE; +} + static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, - .has_keyed_sgls = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, + .host_traddr = nvmet_rdma_host_port_addr, + .get_mdts = nvmet_rdma_get_mdts, + .get_max_queue_size = nvmet_rdma_get_max_queue_size, }; static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) @@ -1644,7 +2087,7 @@ static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data } mutex_unlock(&nvmet_rdma_queue_mutex); - flush_scheduled_work(); + flush_workqueue(nvmet_wq); } static struct ib_client nvmet_rdma_ib_client = { @@ -1682,5 +2125,6 @@ static void __exit nvmet_rdma_exit(void) module_init(nvmet_rdma_init); module_exit(nvmet_rdma_exit); +MODULE_DESCRIPTION("NVMe target RDMA transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ |
