diff options
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/Makefile | 2 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 9 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/ib_client.c | 184 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/module.c | 18 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 3 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma.c | 22 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 10 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_rw.c | 181 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 150 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 112 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 1 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 96 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 5 |
14 files changed, 570 insertions, 225 deletions
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 55b21bae866d..3232aa23cdb4 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ svc_rdma_pcl.o module.o diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ffbf99894970..31434aeb8e29 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, cid->ci_completion_id = mr->mr_ibmr->res.id; } -static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +static void frwr_mr_unmap(struct rpcrdma_mr *mr) { if (mr->mr_device) { trace_xprtrdma_mr_unmap(mr); @@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); rc = ib_dereg_mr(mr->mr_ibmr); if (rc) @@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) static void frwr_mr_put(struct rpcrdma_mr *mr) { - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); /* The MR is returned to the req's MR free list instead * of to the xprt's MR free list. No spinlock is needed. @@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr) rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } -/* frwr_reset - Place MRs back on the free list +/** + * frwr_reset - Place MRs back on @req's free list * @req: request to reset * * Used after a failed marshal. For FRWR, this means the MRs diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c new file mode 100644 index 000000000000..28c68b5f6823 --- /dev/null +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2024 Oracle. All rights reserved. + */ + +/* #include <linux/module.h> +#include <linux/slab.h> */ +#include <linux/xarray.h> +#include <linux/types.h> +#include <linux/kref.h> +#include <linux/completion.h> + +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rdma_rn.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/* Per-ib_device private data for rpcrdma */ +struct rpcrdma_device { + struct kref rd_kref; + unsigned long rd_flags; + struct ib_device *rd_device; + struct xarray rd_xa; + struct completion rd_done; +}; + +#define RPCRDMA_RD_F_REMOVING (0) + +static struct ib_client rpcrdma_ib_client; + +/* + * Listeners have no associated device, so we never register them. + * Note that ib_get_client_data() does not check if @device is + * NULL for us. + */ +static struct rpcrdma_device *rpcrdma_get_client_data(struct ib_device *device) +{ + if (!device) + return NULL; + return ib_get_client_data(device, &rpcrdma_ib_client); +} + +/** + * rpcrdma_rn_register - register to get device removal notifications + * @device: device to monitor + * @rn: notification object that wishes to be notified + * @done: callback to notify caller of device removal + * + * Returns zero on success. The callback in rn_done is guaranteed + * to be invoked when the device is removed, unless this notification + * is unregistered first. + * + * On failure, a negative errno is returned. + */ +int rpcrdma_rn_register(struct ib_device *device, + struct rpcrdma_notification *rn, + void (*done)(struct rpcrdma_notification *rn)) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags)) + return -ENETUNREACH; + + if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0) + return -ENOMEM; + kref_get(&rd->rd_kref); + rn->rn_done = done; + trace_rpcrdma_client_register(device, rn); + return 0; +} + +static void rpcrdma_rn_release(struct kref *kref) +{ + struct rpcrdma_device *rd = container_of(kref, struct rpcrdma_device, + rd_kref); + + trace_rpcrdma_client_completion(rd->rd_device); + complete(&rd->rd_done); +} + +/** + * rpcrdma_rn_unregister - stop device removal notifications + * @device: monitored device + * @rn: notification object that no longer wishes to be notified + */ +void rpcrdma_rn_unregister(struct ib_device *device, + struct rpcrdma_notification *rn) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd) + return; + + trace_rpcrdma_client_unregister(device, rn); + xa_erase(&rd->rd_xa, rn->rn_index); + kref_put(&rd->rd_kref, rpcrdma_rn_release); +} + +/** + * rpcrdma_add_one - ib_client device insertion callback + * @device: device about to be inserted + * + * Returns zero on success. xprtrdma private data has been allocated + * for this device. On failure, a negative errno is returned. + */ +static int rpcrdma_add_one(struct ib_device *device) +{ + struct rpcrdma_device *rd; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + kref_init(&rd->rd_kref); + xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC); + rd->rd_device = device; + init_completion(&rd->rd_done); + ib_set_client_data(device, &rpcrdma_ib_client, rd); + + trace_rpcrdma_client_add_one(device); + return 0; +} + +/** + * rpcrdma_remove_one - ib_client device removal callback + * @device: device about to be removed + * @client_data: this module's private per-device data + * + * Upon return, all transports associated with @device have divested + * themselves from IB hardware resources. + */ +static void rpcrdma_remove_one(struct ib_device *device, + void *client_data) +{ + struct rpcrdma_device *rd = client_data; + struct rpcrdma_notification *rn; + unsigned long index; + + trace_rpcrdma_client_remove_one(device); + + set_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags); + xa_for_each(&rd->rd_xa, index, rn) + rn->rn_done(rn); + + /* + * Wait only if there are still outstanding notification + * registrants for this device. + */ + if (!refcount_dec_and_test(&rd->rd_kref.refcount)) { + trace_rpcrdma_client_wait_on(device); + wait_for_completion(&rd->rd_done); + } + + trace_rpcrdma_client_remove_one_done(device); + xa_destroy(&rd->rd_xa); + kfree(rd); +} + +static struct ib_client rpcrdma_ib_client = { + .name = "rpcrdma", + .add = rpcrdma_add_one, + .remove = rpcrdma_remove_one, +}; + +/** + * rpcrdma_ib_client_unregister - unregister ib_client for xprtrdma + * + * cel: watch for orphaned rpcrdma_device objects on module unload + */ +void rpcrdma_ib_client_unregister(void) +{ + ib_unregister_client(&rpcrdma_ib_client); +} + +/** + * rpcrdma_ib_client_register - register ib_client for rpcrdma + * + * Returns zero on success, or a negative errno. + */ +int rpcrdma_ib_client_register(void) +{ + return ib_register_client(&rpcrdma_ib_client); +} diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 45c5b41ac8dc..697f571d4c01 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rdma_rn.h> #include <asm/swab.h> @@ -30,21 +31,32 @@ static void __exit rpc_rdma_cleanup(void) { xprt_rdma_cleanup(); svc_rdma_cleanup(); + rpcrdma_ib_client_unregister(); } static int __init rpc_rdma_init(void) { int rc; + rc = rpcrdma_ib_client_register(); + if (rc) + goto out_rc; + rc = svc_rdma_init(); if (rc) - goto out; + goto out_ib_client; rc = xprt_rdma_init(); if (rc) - svc_rdma_cleanup(); + goto out_svc_rdma; -out: + return 0; + +out_svc_rdma: + svc_rdma_cleanup(); +out_ib_client: + rpcrdma_ib_client_unregister(); +out_rc: return rc; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 190a4de239c8..1478c41c7e9d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), - false); + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f86970733eb0..415c0310101f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -74,7 +74,7 @@ enum { SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), }; -static int svcrdma_counter_handler(struct ctl_table *table, int write, +static int svcrdma_counter_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct percpu_counter *stat = (struct percpu_counter *)table->data; @@ -209,7 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = { .extra1 = &zero, .extra2 = &zero, }, - { }, }; static void svc_rdma_proc_cleanup(void) @@ -234,25 +233,34 @@ static int svc_rdma_proc_init(void) rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err; rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_read; rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_recv; rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_sq; svcrdma_table_header = register_sysctl("sunrpc/svc_rdma", svcrdma_parm_table); + if (!svcrdma_table_header) + goto err_write; + return 0; -out_err: +err_write: + rc = -ENOMEM; + percpu_counter_destroy(&svcrdma_stat_write); +err_sq: percpu_counter_destroy(&svcrdma_stat_sq_starve); +err_recv: percpu_counter_destroy(&svcrdma_stat_recv); +err_read: percpu_counter_destroy(&svcrdma_stat_read); +err: return rc; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c9be6778643b..e5a78b761012 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index d72953f29258..292022f0976e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -94,7 +94,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -493,7 +493,13 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) return false; - /* A bogus segcount causes this buffer overflow check to fail. */ + /* Before trusting the segcount value enough to use it in + * a computation, perform a simple range check. This is an + * arbitrary but sensible limit (ie, not architectural). + */ + if (unlikely(segcount > RPCSVC_MAXPAGES)) + return false; + p = xdr_inline_decode(&rctxt->rc_stream, segcount * rpcrdma_segment_maxsz * sizeof(*p)); return p != NULL; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c00fcce61d1e..40797114d50a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -197,28 +197,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, llist_add_batch(first, last, &rdma->sc_rw_ctxts); } -/* State for sending a Write or Reply chunk. - * - Tracks progress of writing one chunk over all its segments - * - Stores arguments for the SGL constructor functions - */ -struct svc_rdma_write_info { - struct svcxprt_rdma *wi_rdma; - - const struct svc_rdma_chunk *wi_chunk; - - /* write state of this chunk */ - unsigned int wi_seg_off; - unsigned int wi_seg_no; - - /* SGL constructor arguments */ - const struct xdr_buf *wi_xdr; - unsigned char *wi_base; - unsigned int wi_next_off; - - struct svc_rdma_chunk_ctxt wi_cc; - struct work_struct wi_work; -}; - static struct svc_rdma_write_info * svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk) @@ -253,6 +231,49 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) } /** + * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc; + + if (!cc->cc_sqecount) + return; + svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); +} + +/** + * svc_rdma_reply_done - Reply chunk Write completion handler + * @cq: controlling Completion Queue + * @wc: Work Completion report + * + * Pages under I/O are released by a subsequent Send completion. + */ +static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cq->cq_context; + + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_reply(&cc->cc_cid); + return; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); + } + + svc_xprt_deferred_close(&rdma->sc_xprt); +} + +/** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue * @wc: Work Completion @@ -580,41 +601,33 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/** - * svc_rdma_send_write_chunk - Write all segments in a Write chunk - * @rdma: controlling RDMA transport - * @chunk: Write chunk provided by the client - * @xdr: xdr_buf containing the data payload - * - * Returns a non-negative number of bytes the chunk consumed, or - * %-E2BIG if the payload was larger than the Write chunk, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). - */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct xdr_buf payload; int ret; + if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, + chunk->ch_payload_length)) + return -EMSGSIZE; + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; cc = &info->wi_cc; - ret = svc_rdma_xb_write(xdr, info); - if (ret != xdr->len) + ret = svc_rdma_xb_write(&payload, info); + if (ret != payload.len) goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; - return xdr->len; + return 0; out_err: svc_rdma_write_info_free(info); @@ -622,9 +635,37 @@ out_err: } /** - * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * svc_rdma_send_write_list - Send all chunks on the Write list * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks from client + * @rctxt: Write list provisioned by the client + * @xdr: xdr_buf containing an RPC Reply message + * + * Returns zero on success, or a negative errno if one or more + * Write chunks could not be sent. + */ +int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_chunk *chunk; + int ret; + + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + if (!chunk->ch_payload_length) + break; + ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + if (ret < 0) + return ret; + } + return 0; +} + +/** + * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk + * @rdma: controlling RDMA transport + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -634,39 +675,45 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - struct svc_rdma_chunk *chunk; + struct svc_rdma_write_info *info = &sctxt->sc_reply_info; + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; int ret; - if (pcl_is_empty(&rctxt->rc_reply_pcl)) - return 0; - - chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); - info = svc_rdma_write_info_alloc(rdma, chunk); - if (!info) - return -ENOMEM; - cc = &info->wi_cc; + info->wi_rdma = rdma; + info->wi_chunk = pcl_first_chunk(reply_pcl); + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_cc.cc_cqe.done = svc_rdma_reply_done; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) - goto out_err; + return ret; - trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) - goto out_err; + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; - return xdr->len; + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; -out_err: - svc_rdma_write_info_free(info); - return ret; + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + return xdr->len; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1a49b7f02041..96154a2367a1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -100,7 +100,7 @@ */ #include <linux/spinlock.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -205,9 +205,13 @@ out: xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, NULL); + svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc); ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; + ctxt->sc_wr_chain = &ctxt->sc_send_wr; + ctxt->sc_sqecount = 1; + return ctxt; out_empty: @@ -223,6 +227,8 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_reply_chunk_release(rdma, ctxt); + if (ctxt->sc_page_count) release_pages(ctxt->sc_pages, ctxt->sc_page_count); @@ -293,7 +299,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_wake_send_waiters(rdma, 1); + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; @@ -312,51 +318,76 @@ flushed: } /** - * svc_rdma_send - Post a single Send WR - * @rdma: transport on which to post the WR - * @ctxt: send ctxt with a Send WR ready to post + * svc_rdma_post_send - Post a WR chain to the Send Queue + * @rdma: transport context + * @ctxt: WR chain to post + * + * Copy fields in @ctxt to stack variables in order to guarantee + * that these values remain available after the ib_post_send() call. + * In some error flow cases, svc_rdma_wc_send() releases @ctxt. + * + * Note there is potential for starvation when the Send Queue is + * full because there is no order to when waiting threads are + * awoken. The transport is typically provisioned with a deep + * enough Send Queue that SQ exhaustion should be a rare event. * - * Returns zero if the Send WR was posted successfully. Otherwise, a - * negative errno is returned. + * Return values: + * %0: @ctxt's WR chain was posted successfully + * %-ENOTCONN: The connection was lost */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) +int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - struct ib_send_wr *wr = &ctxt->sc_send_wr; - int ret; + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; + const struct ib_send_wr *bad_wr = first_wr; + struct rpc_rdma_cid cid = ctxt->sc_cid; + int ret, sqecount = ctxt->sc_sqecount; might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, - wr->sg_list[0].addr, - wr->sg_list[0].length, + send_wr->sg_list[0].addr, + send_wr->sg_list[0].length, DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + svc_rdma_wake_send_waiters(rdma, sqecount); + + /* When the transport is torn down, assume + * ib_drain_sq() will trigger enough Send + * completions to wake us. The XPT_CLOSE test + * above should then cause the while loop to + * exit. + */ percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); - atomic_inc(&rdma->sc_sq_avail); + trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 1); - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return -ENOTCONN; - trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); + atomic_read(&rdma->sc_sq_avail) > 0); + trace_svcrdma_sq_retry(rdma, &cid); continue; } trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, wr, NULL); - if (ret) - break; + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) { + trace_svcrdma_sq_post_err(rdma, &cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, there will be a + * Send completion that bumps sc_sq_avail. + */ + if (bad_wr == first_wr) { + svc_rdma_wake_send_waiters(rdma, sqecount); + break; + } + } return 0; } - - trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - return ret; + return -ENOTCONN; } /** @@ -839,16 +870,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, - * the server may send all, a portion of, or none of the xdr_buf. + * the server may Send all, a portion of, or none of the xdr_buf. * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * - * RDMA Send is the last step of transmitting an RPC reply. Pages - * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the sctxt's page array. These pages are - * DMA unmapped by each Write completion, but the subsequent Send - * completion finally releases these pages. - * * Assumptions: * - The Reply's transport header will never be larger than a page. */ @@ -857,6 +882,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp) { + struct ib_send_wr *send_wr = &sctxt->sc_send_wr; int ret; ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, @@ -864,16 +890,19 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + /* Transfer pages involved in RDMA Writes to the sctxt's + * page array. Completion handling releases these pages. + */ svc_rdma_save_io_pages(rqstp, sctxt); if (rctxt->rc_inv_rkey) { - sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey; } else { - sctxt->sc_send_wr.opcode = IB_WR_SEND; + send_wr->opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /** @@ -937,7 +966,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, sctxt)) + if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -984,10 +1013,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); if (ret < 0) - goto reply_chunk; - rc_size = ret; + goto put_ctxt; + + rc_size = 0; + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { + ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, sctxt, + &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; + } *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -1030,45 +1068,33 @@ drop_connection: /** * svc_rdma_result_payload - special processing for a result payload - * @rqstp: svc_rqst to operate on - * @offset: payload's byte offset in @xdr + * @rqstp: RPC transaction context + * @offset: payload's byte offset in @rqstp->rq_res * @length: size of payload, in bytes * + * Assign the passed-in result payload to the current Write chunk, + * and advance to cur_result_payload to the next Write chunk, if + * there is one. + * * Return values: * %0 if successful or nothing needed to be done - * %-EMSGSIZE on XDR buffer overflow * %-E2BIG if the payload was larger than the Write chunk - * %-EINVAL if client provided too many segments - * %-ENOMEM if rdma_rw context pool was exhausted - * %-ENOTCONN if posting failed (connection is lost) - * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; struct svc_rdma_chunk *chunk; - struct svcxprt_rdma *rdma; - struct xdr_buf subbuf; - int ret; chunk = rctxt->rc_cur_result_payload; if (!length || !chunk) return 0; rctxt->rc_cur_result_payload = pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) return -E2BIG; - chunk->ch_position = offset; chunk->ch_payload_length = length; - - if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) - return -EMSGSIZE; - - rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); - if (ret < 0) - return ret; return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4f27325ace4a..c3fbf0779d4a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -65,6 +65,8 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net, int node); +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -122,6 +124,41 @@ static void qp_event_handler(struct ib_event *event, void *context) } } +static struct rdma_cm_id * +svc_rdma_create_listen_id(struct net *net, struct sockaddr *sap, + void *context) +{ + struct rdma_cm_id *listen_id; + int ret; + + listen_id = rdma_create_id(net, svc_rdma_listen_handler, context, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(listen_id)) + return listen_id; + + /* Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. + */ +#if IS_ENABLED(CONFIG_IPV6) + ret = rdma_set_afonly(listen_id, 1); + if (ret) + goto out_destroy; +#endif + ret = rdma_bind_addr(listen_id, sap); + if (ret) + goto out_destroy; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) + goto out_destroy; + + return listen_id; + +out_destroy: + rdma_destroy_id(listen_id); + return ERR_PTR(ret); +} + static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net, int node) { @@ -247,17 +284,31 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, * * Return values: * %0: Do not destroy @cma_id - * %1: Destroy @cma_id (never returned here) + * %1: Destroy @cma_id * * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. */ static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; + struct svcxprt_rdma *cma_xprt = cma_id->context; + struct svc_xprt *cma_rdma = &cma_xprt->sc_xprt; + struct rdma_cm_id *listen_id; + switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: handle_connect_req(cma_id, &event->param.conn); break; + case RDMA_CM_EVENT_ADDR_CHANGE: + listen_id = svc_rdma_create_listen_id(cma_rdma->xpt_net, + sap, cma_xprt); + if (IS_ERR(listen_id)) { + pr_err("Listener dead, address change failed for device %s\n", + cma_id->device->name); + } else + cma_xprt->sc_cm_id = listen_id; + return 1; default: break; } @@ -288,7 +339,6 @@ static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - case RDMA_CM_EVENT_DEVICE_REMOVAL: svc_xprt_deferred_close(xprt); break; default: @@ -307,7 +357,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, { struct rdma_cm_id *listen_id; struct svcxprt_rdma *cma_xprt; - int ret; if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); @@ -317,30 +366,13 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, - RDMA_PS_TCP, IB_QPT_RC); + listen_id = svc_rdma_create_listen_id(net, sa, cma_xprt); if (IS_ERR(listen_id)) { - ret = PTR_ERR(listen_id); - goto err0; + kfree(cma_xprt); + return ERR_CAST(listen_id); } - - /* Allow both IPv4 and IPv6 sockets to bind a single port - * at the same time. - */ -#if IS_ENABLED(CONFIG_IPV6) - ret = rdma_set_afonly(listen_id, 1); - if (ret) - goto err1; -#endif - ret = rdma_bind_addr(listen_id, sa); - if (ret) - goto err1; cma_xprt->sc_cm_id = listen_id; - ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); - if (ret) - goto err1; - /* * We need to use the address from the cm_id in case the * caller specified 0 for the port number. @@ -349,12 +381,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); return &cma_xprt->sc_xprt; +} - err1: - rdma_destroy_id(listen_id); - err0: - kfree(cma_xprt); - return ERR_PTR(ret); +static void svc_rdma_xprt_done(struct rpcrdma_notification *rn) +{ + struct svcxprt_rdma *rdma = container_of(rn, struct svcxprt_rdma, + sc_rn); + struct rdma_cm_id *id = rdma->sc_cm_id; + + trace_svcrdma_device_removal(id); + svc_xprt_close(&rdma->sc_xprt); } /* @@ -398,6 +434,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; + if (rpcrdma_rn_register(dev, &newxprt->sc_rn, svc_rdma_xprt_done)) + goto errout; + newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; @@ -415,15 +454,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + - newxprt->sc_recv_batch; + newxprt->sc_recv_batch + 1 /* drain */; if (rq_depth > dev->attrs.max_qp_wr) { rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); - ctxts *= newxprt->sc_max_requests; + + /* Arbitrarily estimate the number of rw_ctxs needed for + * this transport. This is enough rw_ctxs to make forward + * progress even if the client is using one rkey per page + * in each Read chunk. + */ + ctxts = 3 * RPCSVC_MAXPAGES; newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; @@ -460,12 +504,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); - + dprintk(" send CQ depth = %u, recv CQ depth = %u\n", + newxprt->sc_sq_depth, rq_depth); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { trace_svcrdma_qp_err(newxprt, ret); goto errout; } + newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge; newxprt->sc_qp = newxprt->sc_cm_id->qp; if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) @@ -546,6 +592,7 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); + struct ib_device *device = rdma->sc_cm_id->device; /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -574,6 +621,7 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); + rpcrdma_rn_unregister(device, &rdma->sc_rn); kfree(rdma); } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 29b0562d62e7..9a8ce5df83ca 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -137,7 +137,6 @@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; #endif diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4f8d7efa469f..63262ef0c2e3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -49,14 +49,14 @@ * o buffer memory */ +#include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> #include <linux/log2.h> -#include <asm-generic/barrier.h> -#include <asm/bitops.h> +#include <asm/barrier.h> #include <rdma/ib_cm.h> @@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node); +static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -222,7 +224,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_ep *ep = id->context; might_sleep(); @@ -241,10 +242,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_async_rc = -ENETUNREACH; complete(&ep->re_done); return 0; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - pr_info("rpcrdma: removing device %s for %pISpc\n", - ep->re_id->device->name, sap); - fallthrough; case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; goto disconnected; @@ -280,6 +277,14 @@ disconnected: return 0; } +static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) +{ + struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn); + + trace_xprtrdma_device_removal(ep->re_id); + xprt_force_disconnect(ep->re_xprt); +} + static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep) { @@ -319,6 +324,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (rc) goto out; + rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done); + if (rc) + goto out; + return id; out: @@ -346,6 +355,8 @@ static void rpcrdma_ep_destroy(struct kref *kref) ib_dealloc_pd(ep->re_pd); ep->re_pd = NULL; + rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn); + kfree(ep); module_put(THIS_MODULE); } @@ -501,7 +512,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) * outstanding Receives. */ rpcrdma_ep_get(ep); - rpcrdma_post_recvs(r_xprt, 1, true); + rpcrdma_post_recvs(r_xprt, 1); rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) @@ -893,6 +904,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) static void rpcrdma_req_reset(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + /* Credits are valid for only one connection */ req->rl_slot.rq_cong = 0; @@ -902,7 +915,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); - frwr_reset(req); + /* The verbs consumer can't know the state of an MR on the + * req->rl_registered list unless a successful completion + * has occurred, so they cannot be re-used. + */ + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_mr_release(mr); + } } /* ASSUMPTION: the rb_allreqs list is stable for the duration, @@ -920,18 +945,20 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) } static noinline -struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, - bool temp) +struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; struct rpcrdma_rep *rep; rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, - DMA_FROM_DEVICE); + rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv, + DMA_FROM_DEVICE, + ibdev_to_node(device)); if (!rep->rr_rdmabuf) goto out_free; @@ -946,7 +973,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - rep->rr_temp = temp; spin_lock(&buf->rb_lock); list_add(&rep->rr_all, &buf->rb_all_reps); @@ -965,17 +991,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep) kfree(rep); } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; - - spin_lock(&buf->rb_lock); - list_del(&rep->rr_all); - spin_unlock(&buf->rb_lock); - - rpcrdma_rep_free(rep); -} - static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1007,10 +1022,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { + list_for_each_entry(rep, &buf->rb_all_reps, rr_all) rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; /* Mark this rep for destruction */ - } } static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) @@ -1227,14 +1240,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) * or Replies they may be registered externally via frwr_map. */ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); + rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node); if (!rb) return NULL; - rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); + rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node); if (!rb->rg_data) { kfree(rb); return NULL; @@ -1246,6 +1260,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) return rb; } +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +{ + return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE); +} + /** * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer * @rb: regbuf to reallocate @@ -1323,10 +1343,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance * @needed: current credit grant - * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; @@ -1340,8 +1359,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; - if (!temp) - needed += RPCRDMA_MAX_RECV_BATCH; + needed += RPCRDMA_MAX_RECV_BATCH; if (atomic_inc_return(&ep->re_receiving) > 1) goto out; @@ -1350,12 +1368,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) wr = NULL; while (needed) { rep = rpcrdma_rep_get_locked(buf); - if (rep && rep->rr_temp) { - rpcrdma_rep_destroy(rep); - continue; - } if (!rep) - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt); if (!rep) break; if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index da409450dfc0..8147d2b41494 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -56,6 +56,7 @@ #include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#include <linux/sunrpc/rdma_rn.h> /* removal notifications */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -92,6 +93,7 @@ struct rpcrdma_ep { struct rpcrdma_connect_private re_cm_private; struct rdma_conn_param re_remote_cma; + struct rpcrdma_notification re_rn; int re_receive_count; unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ @@ -198,7 +200,6 @@ struct rpcrdma_rep { __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; - bool rr_temp; struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct rpc_rqst *rr_rqst; @@ -466,7 +467,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed); /* * Buffer calls - xprtrdma/verbs.c |