summaryrefslogtreecommitdiff
path: root/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc/xprtrdma/svc_rdma_recvfrom.c')
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c270
1 files changed, 211 insertions, 59 deletions
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 5242ad121450..e7e4a39ca6c6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -94,7 +94,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -115,24 +115,22 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
rc_list);
}
-static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
- struct rpc_rdma_cid *cid)
-{
- cid->ci_queue_id = rdma->sc_rq_cq->res.id;
- cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
+ int node = ibdev_to_node(rdma->sc_cm_id->device);
struct svc_rdma_recv_ctxt *ctxt;
+ unsigned long pages;
dma_addr_t addr;
void *buffer;
- ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+ pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server);
+ ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages),
+ GFP_KERNEL, node);
if (!ctxt)
goto fail0;
- buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+ ctxt->rc_maxpages = pages;
+ buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node);
if (!buffer)
goto fail1;
addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
@@ -155,7 +153,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
ctxt->rc_recv_buf = buffer;
- ctxt->rc_temp = false;
+ svc_rdma_cc_init(rdma, &ctxt->rc_cc);
return ctxt;
fail2:
@@ -204,18 +202,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
node = llist_del_first(&rdma->sc_recv_ctxts);
if (!node)
- goto out_empty;
- ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+ return NULL;
-out:
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
ctxt->rc_page_count = 0;
return ctxt;
-
-out_empty:
- ctxt = svc_rdma_recv_ctxt_alloc(rdma);
- if (!ctxt)
- return NULL;
- goto out;
}
/**
@@ -227,39 +218,42 @@ out_empty:
void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
+ svc_rdma_cc_release(rdma, &ctxt->rc_cc, DMA_FROM_DEVICE);
+
+ /* @rc_page_count is normally zero here, but error flows
+ * can leave pages in @rc_pages.
+ */
+ release_pages(ctxt->rc_pages, ctxt->rc_page_count);
+
pcl_free(&ctxt->rc_call_pcl);
pcl_free(&ctxt->rc_read_pcl);
pcl_free(&ctxt->rc_write_pcl);
pcl_free(&ctxt->rc_reply_pcl);
- if (!ctxt->rc_temp)
- llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
- else
- svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+ llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
}
/**
- * svc_rdma_release_rqst - Release transport-specific per-rqst resources
- * @rqstp: svc_rqst being released
+ * svc_rdma_release_ctxt - Release transport-specific per-rqst resources
+ * @xprt: the transport which owned the context
+ * @vctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt
*
* Ensure that the recv_ctxt is released whether or not a Reply
* was sent. For example, the client could close the connection,
* or svc_process could drop an RPC, before the Reply is sent.
*/
-void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
{
- struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt;
- struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_rdma_recv_ctxt *ctxt = vctxt;
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- rqstp->rq_xprt_ctxt = NULL;
if (ctxt)
svc_rdma_recv_ctxt_put(rdma, ctxt);
}
static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
- unsigned int wanted, bool temp)
+ unsigned int wanted)
{
const struct ib_recv_wr *bad_wr = NULL;
struct svc_rdma_recv_ctxt *ctxt;
@@ -275,14 +269,13 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
if (!ctxt)
break;
- trace_svcrdma_post_recv(ctxt);
- ctxt->rc_temp = temp;
+ trace_svcrdma_post_recv(&ctxt->rc_cid);
ctxt->rc_recv_wr.next = recv_chain;
recv_chain = &ctxt->rc_recv_wr;
rdma->sc_pending_recvs++;
}
if (!recv_chain)
- return false;
+ return true;
ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
if (ret)
@@ -306,11 +299,28 @@ err_free:
* svc_rdma_post_recvs - Post initial set of Recv WRs
* @rdma: fresh svcxprt_rdma
*
- * Returns true if successful, otherwise false.
+ * Return values:
+ * %true: Receive Queue initialization successful
+ * %false: memory allocation or DMA error
*/
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
- return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+ unsigned int total;
+
+ /* For each credit, allocate enough recv_ctxts for one
+ * posted Receive and one RPC in process.
+ */
+ total = (rdma->sc_max_requests * 2) + rdma->sc_recv_batch;
+ while (total--) {
+ struct svc_rdma_recv_ctxt *ctxt;
+
+ ctxt = svc_rdma_recv_ctxt_alloc(rdma);
+ if (!ctxt)
+ return false;
+ llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+ }
+
+ return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests);
}
/**
@@ -344,7 +354,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
* client reconnects.
*/
if (rdma->sc_pending_recvs < rdma->sc_max_requests)
- if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+ if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch))
goto dropped;
/* All wc fields are now known to be valid */
@@ -378,6 +388,10 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
+ list_del(&ctxt->rc_list);
+ svc_rdma_recv_ctxt_put(rdma, ctxt);
+ }
while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
list_del(&ctxt->rc_list);
svc_rdma_recv_ctxt_put(rdma, ctxt);
@@ -483,7 +497,13 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
return false;
- /* A bogus segcount causes this buffer overflow check to fail. */
+ /* Before trusting the segcount value enough to use it in
+ * a computation, perform a simple range check. This is an
+ * arbitrary but sensible limit (ie, not architectural).
+ */
+ if (unlikely(segcount > rctxt->rc_maxpages))
+ return false;
+
p = xdr_inline_decode(&rctxt->rc_stream,
segcount * rpcrdma_segment_maxsz * sizeof(*p));
return p != NULL;
@@ -759,6 +779,122 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
return true;
}
+/* Finish constructing the RPC Call message in rqstp::rq_arg.
+ *
+ * The incoming RPC/RDMA message is an RDMA_MSG type message
+ * with a single Read chunk (only the upper layer data payload
+ * was conveyed via RDMA Read).
+ */
+static void svc_rdma_read_complete_one(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *ctxt)
+{
+ struct svc_rdma_chunk *chunk = pcl_first_chunk(&ctxt->rc_read_pcl);
+ struct xdr_buf *buf = &rqstp->rq_arg;
+ unsigned int length;
+
+ /* Split the Receive buffer between the head and tail
+ * buffers at Read chunk's position. XDR roundup of the
+ * chunk is not included in either the pagelist or in
+ * the tail.
+ */
+ buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
+ buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
+ buf->head[0].iov_len = chunk->ch_position;
+
+ /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
+ *
+ * If the client already rounded up the chunk length, the
+ * length does not change. Otherwise, the length of the page
+ * list is increased to include XDR round-up.
+ *
+ * Currently these chunks always start at page offset 0,
+ * thus the rounded-up length never crosses a page boundary.
+ */
+ buf->pages = &rqstp->rq_pages[0];
+ length = xdr_align_size(chunk->ch_length);
+ buf->page_len = length;
+ buf->len += length;
+ buf->buflen += length;
+}
+
+/* Finish constructing the RPC Call message in rqstp::rq_arg.
+ *
+ * The incoming RPC/RDMA message is an RDMA_MSG type message
+ * with payload in multiple Read chunks and no PZRC.
+ */
+static void svc_rdma_read_complete_multiple(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *ctxt)
+{
+ struct xdr_buf *buf = &rqstp->rq_arg;
+
+ buf->len += ctxt->rc_readbytes;
+ buf->buflen += ctxt->rc_readbytes;
+
+ buf->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes);
+ buf->pages = &rqstp->rq_pages[1];
+ buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len;
+}
+
+/* Finish constructing the RPC Call message in rqstp::rq_arg.
+ *
+ * The incoming RPC/RDMA message is an RDMA_NOMSG type message
+ * (the RPC message body was conveyed via RDMA Read).
+ */
+static void svc_rdma_read_complete_pzrc(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *ctxt)
+{
+ struct xdr_buf *buf = &rqstp->rq_arg;
+
+ buf->len += ctxt->rc_readbytes;
+ buf->buflen += ctxt->rc_readbytes;
+
+ buf->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes);
+ buf->pages = &rqstp->rq_pages[1];
+ buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len;
+}
+
+static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *ctxt)
+{
+ unsigned int i;
+
+ /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing
+ * the rq_pages that were already allocated for this rqstp.
+ */
+ release_pages(rqstp->rq_respages, ctxt->rc_page_count);
+ for (i = 0; i < ctxt->rc_page_count; i++)
+ rqstp->rq_pages[i] = ctxt->rc_pages[i];
+
+ /* Update @rqstp's result send buffer to start after the
+ * last page in the RDMA Read payload.
+ */
+ rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Prevent svc_rdma_recv_ctxt_put() from releasing the
+ * pages in ctxt::rc_pages a second time.
+ */
+ ctxt->rc_page_count = 0;
+
+ /* Finish constructing the RPC Call message. The exact
+ * procedure for that depends on what kind of RPC/RDMA
+ * chunks were provided by the client.
+ */
+ rqstp->rq_arg = ctxt->rc_saved_arg;
+ if (pcl_is_empty(&ctxt->rc_call_pcl)) {
+ if (ctxt->rc_read_pcl.cl_count == 1)
+ svc_rdma_read_complete_one(rqstp, ctxt);
+ else
+ svc_rdma_read_complete_multiple(rqstp, ctxt);
+ } else {
+ svc_rdma_read_complete_pzrc(rqstp, ctxt);
+ }
+
+ trace_svcrdma_read_finished(&ctxt->rc_cid);
+}
+
/**
* svc_rdma_recvfrom - Receive an RPC call
* @rqstp: request structure into which to receive an RPC Call
@@ -776,9 +912,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
*
* The next ctxt is removed from the "receive" lists.
*
- * - If the ctxt completes a Read, then finish assembling the Call
- * message and return the number of bytes in the message.
- *
* - If the ctxt completes a Receive, then construct the Call
* message from the contents of the Receive buffer.
*
@@ -787,7 +920,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
* in the message.
*
* - If there are Read chunks in this message, post Read WRs to
- * pull that payload and return 0.
+ * pull that payload. When the Read WRs complete, build the
+ * full message and return the number of bytes in it.
*/
int svc_rdma_recvfrom(struct svc_rqst *rqstp)
{
@@ -797,10 +931,23 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svc_rdma_recv_ctxt *ctxt;
int ret;
+ /* Prevent svc_xprt_release() from releasing pages in rq_pages
+ * when returning 0 or an error.
+ */
+ rqstp->rq_respages = rqstp->rq_pages;
+ rqstp->rq_next_page = rqstp->rq_respages;
+
rqstp->rq_xprt_ctxt = NULL;
- ctxt = NULL;
spin_lock(&rdma_xprt->sc_rq_dto_lock);
+ ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
+ if (ctxt) {
+ list_del(&ctxt->rc_list);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+ svc_xprt_received(xprt);
+ svc_rdma_read_complete(rqstp, ctxt);
+ goto complete;
+ }
ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
if (ctxt)
list_del(&ctxt->rc_list);
@@ -820,12 +967,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
DMA_FROM_DEVICE);
svc_rdma_build_arg_xdr(rqstp, ctxt);
- /* Prevent svc_xprt_release from releasing pages in rq_pages
- * if we return 0 or an error.
- */
- rqstp->rq_respages = rqstp->rq_pages;
- rqstp->rq_next_page = rqstp->rq_respages;
-
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
@@ -838,15 +979,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
- !pcl_is_empty(&ctxt->rc_call_pcl)) {
- ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
- if (ret < 0)
- goto out_readfail;
- }
+ !pcl_is_empty(&ctxt->rc_call_pcl))
+ goto out_readlist;
+complete:
rqstp->rq_xprt_ctxt = ctxt;
rqstp->rq_prot = IPPROTO_MAX;
svc_xprt_copy_addrs(rqstp, xprt);
+ set_bit(RQ_SECURE, &rqstp->rq_flags);
return rqstp->rq_arg.len;
out_err:
@@ -854,11 +994,23 @@ out_err:
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
-out_readfail:
- if (ret == -EINVAL)
- svc_rdma_send_error(rdma_xprt, ctxt, ret);
- svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
- return ret;
+out_readlist:
+ /* This @rqstp is about to be recycled. Save the work
+ * already done constructing the Call message in rq_arg
+ * so it can be restored when the RDMA Reads have
+ * completed.
+ */
+ ctxt->rc_saved_arg = rqstp->rq_arg;
+
+ ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
+ if (ret < 0) {
+ if (ret == -EINVAL)
+ svc_rdma_send_error(rdma_xprt, ctxt, ret);
+ svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
+ svc_xprt_deferred_close(xprt);
+ return ret;
+ }
+ return 0;
out_backchannel:
svc_rdma_handle_bc_reply(rqstp, ctxt);