From 61433af56077f5fd8815281b44938d84feb04687 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 16 Oct 2017 15:01:06 -0400
Subject: xprtrdma: Throw away reply when version is unrecognized

A reply with an unrecognized value in the version field means the
transport header is potentially garbled and therefore all the fields
are untrustworthy.

Fixes: 59aa1f9a3cce3 ("xprtrdma: Properly handle RDMA_ERROR ... ")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index f1889f4d4803..20c9e4cbaa73 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1248,6 +1248,9 @@ rpcrdma_reply_handler(struct work_struct *work)
 	p++;	/* credits */
 	proc = *p++;
 
+	if (vers != rpcrdma_version)
+		goto out_badversion;
+
 	if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
 		return;
 
@@ -1280,8 +1283,6 @@ rpcrdma_reply_handler(struct work_struct *work)
 	}
 
 	xprt->reestablish_timeout = 0;
-	if (vers != rpcrdma_version)
-		goto out_badversion;
 
 	switch (proc) {
 	case rdma_msg:
@@ -1321,17 +1322,15 @@ out_badstatus:
 	}
 	return;
 
-/* If the incoming reply terminated a pending RPC, the next
- * RPC call will post a replacement receive buffer as it is
- * being marshaled.
- */
 out_badversion:
 	dprintk("RPC:       %s: invalid version %d\n",
 		__func__, be32_to_cpu(vers));
-	status = -EIO;
-	r_xprt->rx_stats.bad_reply_count++;
-	goto out;
+	goto repost;
 
+/* If the incoming reply terminated a pending RPC, the next
+ * RPC call will post a replacement receive buffer as it is
+ * being marshaled.
+ */
 out_badheader:
 	dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
 		rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc));
-- 
cgit 


From 5381e0ec72eeb9467796ac4181ccb7bbce6d3e81 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 16 Oct 2017 15:01:14 -0400
Subject: xprtrdma: Move decoded header fields into rpcrdma_rep

Clean up: Make it easier to pass the decoded XID, vers, credits, and
proc fields around by moving these variables into struct rpcrdma_rep.

Note: the credits field will be handled in a subsequent patch.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 20c9e4cbaa73..e355cd322a32 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -970,14 +970,13 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws,
  * straightforward to check the RPC header's direction field.
  */
 static bool
-rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
-		 __be32 xid, __be32 proc)
+rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 {
 	struct xdr_stream *xdr = &rep->rr_stream;
 	__be32 *p;
 
-	if (proc != rdma_msg)
+	if (rep->rr_proc != rdma_msg)
 		return false;
 
 	/* Peek at stream contents without advancing. */
@@ -992,7 +991,7 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
 		return false;
 
 	/* RPC header */
-	if (*p++ != xid)
+	if (*p++ != rep->rr_xid)
 		return false;
 	if (*p != cpu_to_be32(RPC_CALL))
 		return false;
@@ -1224,41 +1223,40 @@ rpcrdma_reply_handler(struct work_struct *work)
 			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-	struct xdr_stream *xdr = &rep->rr_stream;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
-	__be32 *p, xid, vers, proc;
 	unsigned long cwnd;
 	int status;
+	__be32 *p;
 
 	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);
 
 	if (rep->rr_hdrbuf.head[0].iov_len == 0)
 		goto out_badstatus;
 
-	xdr_init_decode(xdr, &rep->rr_hdrbuf,
+	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
 			rep->rr_hdrbuf.head[0].iov_base);
 
 	/* Fixed transport header fields */
-	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
 	if (unlikely(!p))
 		goto out_shortreply;
-	xid = *p++;
-	vers = *p++;
+	rep->rr_xid = *p++;
+	rep->rr_vers = *p++;
 	p++;	/* credits */
-	proc = *p++;
+	rep->rr_proc = *p++;
 
-	if (vers != rpcrdma_version)
+	if (rep->rr_vers != rpcrdma_version)
 		goto out_badversion;
 
-	if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
+	if (rpcrdma_is_bcall(r_xprt, rep))
 		return;
 
 	/* Match incoming rpcrdma_rep to an rpcrdma_req to
 	 * get context for handling any incoming chunks.
 	 */
 	spin_lock(&xprt->recv_lock);
-	rqst = xprt_lookup_rqst(xprt, xid);
+	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
 	if (!rqst)
 		goto out_norqst;
 	xprt_pin_rqst(rqst);
@@ -1267,7 +1265,7 @@ rpcrdma_reply_handler(struct work_struct *work)
 	req->rl_reply = rep;
 
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
-		__func__, rep, req, be32_to_cpu(xid));
+		__func__, rep, req, be32_to_cpu(rep->rr_xid));
 
 	/* Invalidate and unmap the data payloads before waking the
 	 * waiting application. This guarantees the memory regions
@@ -1284,7 +1282,7 @@ rpcrdma_reply_handler(struct work_struct *work)
 
 	xprt->reestablish_timeout = 0;
 
-	switch (proc) {
+	switch (rep->rr_proc) {
 	case rdma_msg:
 		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
 		break;
@@ -1324,7 +1322,7 @@ out_badstatus:
 
 out_badversion:
 	dprintk("RPC:       %s: invalid version %d\n",
-		__func__, be32_to_cpu(vers));
+		__func__, be32_to_cpu(rep->rr_vers));
 	goto repost;
 
 /* If the incoming reply terminated a pending RPC, the next
@@ -1333,7 +1331,7 @@ out_badversion:
  */
 out_badheader:
 	dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
-		rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc));
+		rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc));
 	r_xprt->rx_stats.bad_reply_count++;
 	status = -EIO;
 	goto out;
@@ -1345,7 +1343,7 @@ out_badheader:
 out_norqst:
 	spin_unlock(&xprt->recv_lock);
 	dprintk("RPC:       %s: no match for incoming xid 0x%08x\n",
-		__func__, be32_to_cpu(xid));
+		__func__, be32_to_cpu(rep->rr_xid));
 	goto repost;
 
 out_shortreply:
-- 
cgit 


From e1352c9610e3235f5e1b159038762d0c01c6ef36 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 16 Oct 2017 15:01:22 -0400
Subject: xprtrdma: Refactor rpcrdma_reply_handler some more

Clean up: I'd like to be able to invoke the tail of
rpcrdma_reply_handler in two different places. Split the tail out
into its own helper function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 105 +++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 47 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e355cd322a32..418bcc6b3e1d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1211,6 +1211,60 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
 	return -EREMOTEIO;
 }
 
+/* Perform XID lookup, reconstruction of the RPC reply, and
+ * RPC completion while holding the transport lock to ensure
+ * the rep, rqst, and rq_task pointers remain stable.
+ */
+void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpc_rqst *rqst = rep->rr_rqst;
+	unsigned long cwnd;
+	int status;
+
+	xprt->reestablish_timeout = 0;
+
+	switch (rep->rr_proc) {
+	case rdma_msg:
+		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
+		break;
+	case rdma_nomsg:
+		status = rpcrdma_decode_nomsg(r_xprt, rep);
+		break;
+	case rdma_error:
+		status = rpcrdma_decode_error(r_xprt, rep, rqst);
+		break;
+	default:
+		status = -EIO;
+	}
+	if (status < 0)
+		goto out_badheader;
+
+out:
+	spin_lock(&xprt->recv_lock);
+	cwnd = xprt->cwnd;
+	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+	if (xprt->cwnd > cwnd)
+		xprt_release_rqst_cong(rqst->rq_task);
+
+	xprt_complete_rqst(rqst->rq_task, status);
+	xprt_unpin_rqst(rqst);
+	spin_unlock(&xprt->recv_lock);
+	return;
+
+/* If the incoming reply terminated a pending RPC, the next
+ * RPC call will post a replacement receive buffer as it is
+ * being marshaled.
+ */
+out_badheader:
+	dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+		rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc));
+	r_xprt->rx_stats.bad_reply_count++;
+	status = -EIO;
+	goto out;
+}
+
 /* Process received RPC/RDMA messages.
  *
  * Errors must result in the RPC task either being awakened, or
@@ -1225,8 +1279,6 @@ rpcrdma_reply_handler(struct work_struct *work)
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
-	unsigned long cwnd;
-	int status;
 	__be32 *p;
 
 	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);
@@ -1263,6 +1315,7 @@ rpcrdma_reply_handler(struct work_struct *work)
 	spin_unlock(&xprt->recv_lock);
 	req = rpcr_to_rdmar(rqst);
 	req->rl_reply = rep;
+	rep->rr_rqst = rqst;
 
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
@@ -1280,36 +1333,7 @@ rpcrdma_reply_handler(struct work_struct *work)
 						    &req->rl_registered);
 	}
 
-	xprt->reestablish_timeout = 0;
-
-	switch (rep->rr_proc) {
-	case rdma_msg:
-		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
-		break;
-	case rdma_nomsg:
-		status = rpcrdma_decode_nomsg(r_xprt, rep);
-		break;
-	case rdma_error:
-		status = rpcrdma_decode_error(r_xprt, rep, rqst);
-		break;
-	default:
-		status = -EIO;
-	}
-	if (status < 0)
-		goto out_badheader;
-
-out:
-	spin_lock(&xprt->recv_lock);
-	cwnd = xprt->cwnd;
-	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
-	if (xprt->cwnd > cwnd)
-		xprt_release_rqst_cong(rqst->rq_task);
-
-	xprt_complete_rqst(rqst->rq_task, status);
-	xprt_unpin_rqst(rqst);
-	spin_unlock(&xprt->recv_lock);
-	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
-		__func__, xprt, rqst, status);
+	rpcrdma_complete_rqst(rep);
 	return;
 
 out_badstatus:
@@ -1325,20 +1349,8 @@ out_badversion:
 		__func__, be32_to_cpu(rep->rr_vers));
 	goto repost;
 
-/* If the incoming reply terminated a pending RPC, the next
- * RPC call will post a replacement receive buffer as it is
- * being marshaled.
- */
-out_badheader:
-	dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
-		rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc));
-	r_xprt->rx_stats.bad_reply_count++;
-	status = -EIO;
-	goto out;
-
-/* The req was still available, but by the time the recv_lock
- * was acquired, the rqst and task had been released. Thus the RPC
- * has already been terminated.
+/* The RPC transaction has already been terminated, or the header
+ * is corrupt.
  */
 out_norqst:
 	spin_unlock(&xprt->recv_lock);
@@ -1348,7 +1360,6 @@ out_norqst:
 
 out_shortreply:
 	dprintk("RPC:       %s: short/invalid reply\n", __func__);
-	goto repost;
 
 /* If no pending RPC transaction was matched, post a replacement
  * receive buffer before returning.
-- 
cgit 


From d8f532d20ee43a0117284798d486bc4f98e3b196 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 16 Oct 2017 15:01:30 -0400
Subject: xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion

I noticed that the soft IRQ thread looked pretty busy under heavy
I/O workloads. perf suggested one area that was expensive was the
queue_work() call in rpcrdma_wc_receive. That gave me some ideas.

Instead of scheduling a separate worker to process RPC Replies,
promote the Receive completion handler to IB_POLL_WORKQUEUE, and
invoke rpcrdma_reply_handler directly.

Note that the poll workqueue is single-threaded. In order to keep
memory invalidation from serializing all RPC Replies, handle any
necessary invalidation tasks in a separate multi-threaded workqueue.

This provides a two-tier scheme, similar to OS I/O interrupt
handlers: A fast interrupt handler that schedules the slow handler
and re-enables the interrupt, and a slower handler that is invoked
for any needed heavy lifting.

Benefits include:
- One less context switch for RPCs that don't register memory
- Receive completion handling is moved out of soft IRQ context to
  make room for other users of soft IRQ
- The same CPU core now DMA syncs and XDR decodes the Receive buffer

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 46 +++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 18 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 418bcc6b3e1d..430f8b5a8c43 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1265,16 +1265,36 @@ out_badheader:
 	goto out;
 }
 
+/* Reply handling runs in the poll worker thread. Anything that
+ * might wait is deferred to a separate workqueue.
+ */
+void rpcrdma_deferred_completion(struct work_struct *work)
+{
+	struct rpcrdma_rep *rep =
+			container_of(work, struct rpcrdma_rep, rr_work);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
+	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+
+	/* Invalidate and unmap the data payloads before waking
+	 * the waiting application. This guarantees the memory
+	 * regions are properly fenced from the server before the
+	 * application accesses the data. It also ensures proper
+	 * send flow control: waking the next RPC waits until this
+	 * RPC has relinquished all its Send Queue entries.
+	 */
+	rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
+	r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered);
+
+	rpcrdma_complete_rqst(rep);
+}
+
 /* Process received RPC/RDMA messages.
  *
  * Errors must result in the RPC task either being awakened, or
  * allowed to timeout, to discover the errors at that time.
  */
-void
-rpcrdma_reply_handler(struct work_struct *work)
+void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 {
-	struct rpcrdma_rep *rep =
-			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
 	struct rpcrdma_req *req;
@@ -1320,20 +1340,10 @@ rpcrdma_reply_handler(struct work_struct *work)
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
 
-	/* Invalidate and unmap the data payloads before waking the
-	 * waiting application. This guarantees the memory regions
-	 * are properly fenced from the server before the application
-	 * accesses the data. It also ensures proper send flow control:
-	 * waking the next RPC waits until this RPC has relinquished
-	 * all its Send Queue entries.
-	 */
-	if (!list_empty(&req->rl_registered)) {
-		rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
-		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
-						    &req->rl_registered);
-	}
-
-	rpcrdma_complete_rqst(rep);
+	if (list_empty(&req->rl_registered))
+		rpcrdma_complete_rqst(rep);
+	else
+		queue_work(rpcrdma_receive_wq, &rep->rr_work);
 	return;
 
 out_badstatus:
-- 
cgit 


From be798f9082aa54524b209fac2c8164c81cd28f77 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 16 Oct 2017 15:01:39 -0400
Subject: xprtrdma: Decode credits field in rpcrdma_reply_handler

We need to decode and save the incoming rdma_credits field _after_
we know that the direction of the message is "forward direction
Reply". Otherwise, the credits value in reverse direction Calls is
also used to update the forward direction credits.

It is safe to decode the rdma_credits field in rpcrdma_reply_handler
now that rpcrdma_reply_handler is single-threaded. Receives complete
in the same order as they were sent on the NFS server.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 430f8b5a8c43..b8818c09a621 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1244,7 +1244,7 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 out:
 	spin_lock(&xprt->recv_lock);
 	cwnd = xprt->cwnd;
-	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+	xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT;
 	if (xprt->cwnd > cwnd)
 		xprt_release_rqst_cong(rqst->rq_task);
 
@@ -1297,8 +1297,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 {
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
+	u32 credits;
 	__be32 *p;
 
 	dprintk("RPC:       %s: incoming rep %p\n", __func__, rep);
@@ -1315,7 +1317,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 		goto out_shortreply;
 	rep->rr_xid = *p++;
 	rep->rr_vers = *p++;
-	p++;	/* credits */
+	credits = be32_to_cpu(*p++);
 	rep->rr_proc = *p++;
 
 	if (rep->rr_vers != rpcrdma_version)
@@ -1332,7 +1334,15 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	if (!rqst)
 		goto out_norqst;
 	xprt_pin_rqst(rqst);
+
+	if (credits == 0)
+		credits = 1;	/* don't deadlock */
+	else if (credits > buf->rb_max_requests)
+		credits = buf->rb_max_requests;
+	buf->rb_credits = credits;
+
 	spin_unlock(&xprt->recv_lock);
+
 	req = rpcr_to_rdmar(rqst);
 	req->rl_reply = rep;
 	rep->rr_rqst = rqst;
-- 
cgit 


From ad99f0530710af72b5bbecda9e770c736e92b328 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:47:39 -0400
Subject: xprtrdma: Clean up SGE accounting in rpcrdma_prepare_msg_sges()

Clean up. rpcrdma_prepare_hdr_sge() sets num_sge to one, then
rpcrdma_prepare_msg_sges() sets num_sge again to the count of SGEs
it added, plus one for the header SGE just mapped in
rpcrdma_prepare_hdr_sge(). This is confusing, and nails in an
assumption about when these functions are called.

Instead, maintain a running count that both functions can update
with just the number of SGEs they have added to the SGE array.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b8818c09a621..3c9255824d94 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -637,7 +637,7 @@ map_tail:
 	}
 
 out:
-	req->rl_send_wr.num_sge = sge_no + 1;
+	req->rl_send_wr.num_sge += sge_no;
 	return true;
 
 out_mapping_overflow:
-- 
cgit 


From 394b2c77cb761fb1382b0e97b7cdff2dd717b5ee Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:47:47 -0400
Subject: xprtrdma: Fix error handling in rpcrdma_prepare_msg_sges()

When this function fails, it needs to undo the DMA mappings it's
done so far. Otherwise these are leaked.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 3c9255824d94..4f6c5395d198 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -511,6 +511,28 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	return 0;
 }
 
+/**
+ * rpcrdma_unmap_sges - DMA-unmap Send buffers
+ * @ia: interface adapter (device)
+ * @req: req with possibly some SGEs to be DMA unmapped
+ *
+ */
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+	struct ib_sge *sge;
+	unsigned int count;
+
+	/* The first two SGEs contain the transport header and
+	 * the inline buffer. These are always left mapped so
+	 * they can be cheaply re-used.
+	 */
+	sge = &req->rl_send_sge[2];
+	for (count = req->rl_mapped_sges; count--; sge++)
+		ib_dma_unmap_page(ia->ri_device,
+				  sge->addr, sge->length, DMA_TO_DEVICE);
+}
+
 /* Prepare the RPC-over-RDMA header SGE.
  */
 static bool
@@ -641,10 +663,12 @@ out:
 	return true;
 
 out_mapping_overflow:
+	rpcrdma_unmap_sges(ia, req);
 	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
 	return false;
 
 out_mapping_err:
+	rpcrdma_unmap_sges(ia, req);
 	pr_err("rpcrdma: Send mapping error\n");
 	return false;
 }
@@ -671,20 +695,6 @@ out_map:
 	return false;
 }
 
-void
-rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
-{
-	struct ib_device *device = ia->ri_device;
-	struct ib_sge *sge;
-	int count;
-
-	sge = &req->rl_send_sge[2];
-	for (count = req->rl_mapped_sges; count--; sge++)
-		ib_dma_unmap_page(device, sge->addr, sge->length,
-				  DMA_TO_DEVICE);
-	req->rl_mapped_sges = 0;
-}
-
 /**
  * rpcrdma_marshal_req - Marshal and send one RPC request
  * @r_xprt: controlling transport
-- 
cgit 


From 857f9acab9343788fe59f7be3a4710131b705db4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:47:55 -0400
Subject: xprtrdma: Change return value of rpcrdma_prepare_send_sges()

Clean up: Make rpcrdma_prepare_send_sges() return a negative errno
instead of a bool. Soon callers will want distinct treatments of
different types of failures.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 52 ++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4f6c5395d198..e3ece9843f9d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -544,7 +544,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 
 	if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
 		if (!__rpcrdma_dma_map_regbuf(ia, rb))
-			return false;
+			goto out_regbuf;
 		sge->addr = rdmab_addr(rb);
 		sge->lkey = rdmab_lkey(rb);
 	}
@@ -554,6 +554,10 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 				      sge->length, DMA_TO_DEVICE);
 	req->rl_send_wr.num_sge++;
 	return true;
+
+out_regbuf:
+	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+	return false;
 }
 
 /* Prepare the Send SGEs. The head and tail iovec, and each entry
@@ -574,7 +578,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 	 * DMA-mapped. Sync the content that has changed.
 	 */
 	if (!rpcrdma_dma_map_regbuf(ia, rb))
-		return false;
+		goto out_regbuf;
 	sge_no = 1;
 	sge[sge_no].addr = rdmab_addr(rb);
 	sge[sge_no].length = xdr->head[0].iov_len;
@@ -662,6 +666,10 @@ out:
 	req->rl_send_wr.num_sge += sge_no;
 	return true;
 
+out_regbuf:
+	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+	return false;
+
 out_mapping_overflow:
 	rpcrdma_unmap_sges(ia, req);
 	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
@@ -673,26 +681,32 @@ out_mapping_err:
 	return false;
 }
 
-bool
-rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
-			  u32 hdrlen, struct xdr_buf *xdr,
-			  enum rpcrdma_chunktype rtype)
+/**
+ * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
+ * @r_xprt: controlling transport
+ * @req: context of RPC Call being marshalled
+ * @hdrlen: size of transport header, in bytes
+ * @xdr: xdr_buf containing RPC Call
+ * @rtype: chunk type being encoded
+ *
+ * Returns 0 on success; otherwise a negative errno is returned.
+ */
+int
+rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+			  struct rpcrdma_req *req, u32 hdrlen,
+			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
 {
 	req->rl_send_wr.num_sge = 0;
 	req->rl_mapped_sges = 0;
 
-	if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
-		goto out_map;
+	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
+		return -EIO;
 
 	if (rtype != rpcrdma_areadch)
-		if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
-			goto out_map;
-
-	return true;
+		if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype))
+			return -EIO;
 
-out_map:
-	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
-	return false;
+	return 0;
 }
 
 /**
@@ -843,12 +857,10 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 		transfertypes[rtype], transfertypes[wtype],
 		xdr_stream_pos(xdr));
 
-	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req,
-				       xdr_stream_pos(xdr),
-				       &rqst->rq_snd_buf, rtype)) {
-		ret = -EIO;
+	ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
+					&rqst->rq_snd_buf, rtype);
+	if (ret)
 		goto out_err;
-	}
 	return 0;
 
 out_err:
-- 
cgit 


From a062a2a3efc5fece106d96d4a5165f3f23b5cbda Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:48:03 -0400
Subject: xprtrdma: "Unoptimize" rpcrdma_prepare_hdr_sge()

Commit 655fec6987be ("xprtrdma: Use gathered Send for large inline
messages") assumed that, since the zeroeth element of the Send SGE
array always pointed to req->rl_rdmabuf, it needed to be initialized
just once. This was a valid assumption because the Send SGE array
and rl_rdmabuf both live in the same rpcrdma_req.

In a subsequent patch, the Send SGE array will be separated from the
rpcrdma_req, so the zeroeth element of the SGE array needs to be
initialized every time.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e3ece9843f9d..7fd102960a81 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -533,7 +533,7 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 				  sge->addr, sge->length, DMA_TO_DEVICE);
 }
 
-/* Prepare the RPC-over-RDMA header SGE.
+/* Prepare an SGE for the RPC-over-RDMA transport header.
  */
 static bool
 rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
@@ -542,13 +542,11 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
 	struct ib_sge *sge = &req->rl_send_sge[0];
 
-	if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
-		if (!__rpcrdma_dma_map_regbuf(ia, rb))
-			goto out_regbuf;
-		sge->addr = rdmab_addr(rb);
-		sge->lkey = rdmab_lkey(rb);
-	}
+	if (!rpcrdma_dma_map_regbuf(ia, rb))
+		goto out_regbuf;
+	sge->addr = rdmab_addr(rb);
 	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
 
 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
 				      sge->length, DMA_TO_DEVICE);
-- 
cgit 


From ae72950abf99fb250aca972b3451b6e06a096c68 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:48:12 -0400
Subject: xprtrdma: Add data structure to manage RDMA Send arguments

Problem statement:

Recently Sagi Grimberg <sagi@grimberg.me> observed that kernel RDMA-
enabled storage initiators don't handle delayed Send completion
correctly. If Send completion is delayed beyond the end of a ULP
transaction, the ULP may release resources that are still being used
by the HCA to complete a long-running Send operation.

This is a common design trait amongst our initiators. Most Send
operations are faster than the ULP transaction they are part of.
Waiting for a completion for these is typically unnecessary.

Infrequently, a network partition or some other problem crops up
where an ordering problem can occur. In NFS parlance, the RPC Reply
arrives and completes the RPC, but the HCA is still retrying the
Send WR that conveyed the RPC Call. In this case, the HCA can try
to use memory that has been invalidated or DMA unmapped, and the
connection is lost. If that memory has been re-used for something
else (possibly not related to NFS), and the Send retransmission
exposes that data on the wire.

Thus we cannot assume that it is safe to release Send-related
resources just because a ULP reply has arrived.

After some analysis, we have determined that the completion
housekeeping will not be difficult for xprtrdma:

 - Inline Send buffers are registered via the local DMA key, and
   are already left DMA mapped for the lifetime of a transport
   connection, thus no additional handling is necessary for those
 - Gathered Sends involving page cache pages _will_ need to
   DMA unmap those pages after the Send completes. But like
   inline send buffers, they are registered via the local DMA key,
   and thus will not need to be invalidated

In addition, RPC completion will need to wait for Send completion
in the latter case. However, nearly always, the Send that conveys
the RPC Call will have completed long before the RPC Reply
arrives, and thus no additional latency will be accrued.

Design notes:

In this patch, the rpcrdma_sendctx object is introduced, and a
lock-free circular queue is added to manage a set of them per
transport.

The RPC client's send path already prevents sending more than one
RPC Call at the same time. This allows us to treat the consumer
side of the queue (rpcrdma_sendctx_get_locked) as if there is a
single consumer thread.

The producer side of the queue (rpcrdma_sendctx_put_locked) is
invoked only from the Send completion handler, which is a single
thread of execution (soft IRQ).

The only care that needs to be taken is with the tail index, which
is shared between the producer and consumer. Only the producer
updates the tail index. The consumer compares the head with the
tail to ensure that the a sendctx that is in use is never handed
out again (or, expressed more conventionally, the queue is empty).

When the sendctx queue empties completely, there are enough Sends
outstanding that posting more Send operations can result in a Send
Queue overflow. In this case, the ULP is told to wait and try again.
This introduces strong Send Queue accounting to xprtrdma.

As a final touch, Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
suggested a mechanism that does not require signaling every Send.
We signal once every N Sends, and perform SGE unmapping of N Send
operations during that one completion.

Reported-by: Sagi Grimberg <sagi@grimberg.me>
Suggested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 7fd102960a81..9951c81b82ed 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -512,23 +512,26 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 }
 
 /**
- * rpcrdma_unmap_sges - DMA-unmap Send buffers
- * @ia: interface adapter (device)
- * @req: req with possibly some SGEs to be DMA unmapped
+ * rpcrdma_unmap_sendctx - DMA-unmap Send buffers
+ * @sc: sendctx containing SGEs to unmap
  *
  */
 void
-rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
 {
+	struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia;
 	struct ib_sge *sge;
 	unsigned int count;
 
+	dprintk("RPC:       %s: unmapping %u sges for sc=%p\n",
+		__func__, sc->sc_unmap_count, sc);
+
 	/* The first two SGEs contain the transport header and
 	 * the inline buffer. These are always left mapped so
 	 * they can be cheaply re-used.
 	 */
-	sge = &req->rl_send_sge[2];
-	for (count = req->rl_mapped_sges; count--; sge++)
+	sge = &sc->sc_sges[2];
+	for (count = sc->sc_unmap_count; count; ++sge, --count)
 		ib_dma_unmap_page(ia->ri_device,
 				  sge->addr, sge->length, DMA_TO_DEVICE);
 }
@@ -539,8 +542,9 @@ static bool
 rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 			u32 len)
 {
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
-	struct ib_sge *sge = &req->rl_send_sge[0];
+	struct ib_sge *sge = sc->sc_sges;
 
 	if (!rpcrdma_dma_map_regbuf(ia, rb))
 		goto out_regbuf;
@@ -550,7 +554,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 
 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
 				      sge->length, DMA_TO_DEVICE);
-	req->rl_send_wr.num_sge++;
+	sc->sc_wr.num_sge++;
 	return true;
 
 out_regbuf:
@@ -565,10 +569,11 @@ static bool
 rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
 {
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
 	unsigned int sge_no, page_base, len, remaining;
 	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
 	struct ib_device *device = ia->ri_device;
-	struct ib_sge *sge = req->rl_send_sge;
+	struct ib_sge *sge = sc->sc_sges;
 	u32 lkey = ia->ri_pd->local_dma_lkey;
 	struct page *page, **ppages;
 
@@ -631,7 +636,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
 			sge[sge_no].length = len;
 			sge[sge_no].lkey = lkey;
 
-			req->rl_mapped_sges++;
+			sc->sc_unmap_count++;
 			ppages++;
 			remaining -= len;
 			page_base = 0;
@@ -657,11 +662,11 @@ map_tail:
 			goto out_mapping_err;
 		sge[sge_no].length = len;
 		sge[sge_no].lkey = lkey;
-		req->rl_mapped_sges++;
+		sc->sc_unmap_count++;
 	}
 
 out:
-	req->rl_send_wr.num_sge += sge_no;
+	sc->sc_wr.num_sge += sge_no;
 	return true;
 
 out_regbuf:
@@ -669,12 +674,12 @@ out_regbuf:
 	return false;
 
 out_mapping_overflow:
-	rpcrdma_unmap_sges(ia, req);
+	rpcrdma_unmap_sendctx(sc);
 	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
 	return false;
 
 out_mapping_err:
-	rpcrdma_unmap_sges(ia, req);
+	rpcrdma_unmap_sendctx(sc);
 	pr_err("rpcrdma: Send mapping error\n");
 	return false;
 }
@@ -694,8 +699,11 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 			  struct rpcrdma_req *req, u32 hdrlen,
 			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
 {
-	req->rl_send_wr.num_sge = 0;
-	req->rl_mapped_sges = 0;
+	req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf);
+	if (!req->rl_sendctx)
+		return -ENOBUFS;
+	req->rl_sendctx->sc_wr.num_sge = 0;
+	req->rl_sendctx->sc_unmap_count = 0;
 
 	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
 		return -EIO;
-- 
cgit 


From 0ba6f37012db2f88f881cd818aec6e1886f61abb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:48:28 -0400
Subject: xprtrdma: Refactor rpcrdma_deferred_completion

Invoke a common routine for releasing hardware resources (for
example, invalidating MRs). This needs to be done whether an
RPC Reply has arrived or the RPC was terminated early.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 9951c81b82ed..853dede38900 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1293,6 +1293,20 @@ out_badheader:
 	goto out;
 }
 
+void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	/* Invalidate and unmap the data payloads before waking
+	 * the waiting application. This guarantees the memory
+	 * regions are properly fenced from the server before the
+	 * application accesses the data. It also ensures proper
+	 * send flow control: waking the next RPC waits until this
+	 * RPC has relinquished all its Send Queue entries.
+	 */
+	if (!list_empty(&req->rl_registered))
+		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
+						    &req->rl_registered);
+}
+
 /* Reply handling runs in the poll worker thread. Anything that
  * might wait is deferred to a separate workqueue.
  */
@@ -1301,18 +1315,9 @@ void rpcrdma_deferred_completion(struct work_struct *work)
 	struct rpcrdma_rep *rep =
 			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
-	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 
-	/* Invalidate and unmap the data payloads before waking
-	 * the waiting application. This guarantees the memory
-	 * regions are properly fenced from the server before the
-	 * application accesses the data. It also ensures proper
-	 * send flow control: waking the next RPC waits until this
-	 * RPC has relinquished all its Send Queue entries.
-	 */
 	rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
-	r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &req->rl_registered);
-
+	rpcrdma_release_rqst(rep->rr_rxprt, req);
 	rpcrdma_complete_rqst(rep);
 }
 
@@ -1374,6 +1379,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	req = rpcr_to_rdmar(rqst);
 	req->rl_reply = rep;
 	rep->rr_rqst = rqst;
+	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
 
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
-- 
cgit 


From 01bb35c89d90abe6fd1c0be001f84bbdfa7fa7d1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 20 Oct 2017 10:48:36 -0400
Subject: xprtrdma: RPC completion should wait for Send completion

When an RPC Call includes a file data payload, that payload can come
from pages in the page cache, or a user buffer (for direct I/O).

If the payload can fit inline, xprtrdma includes it in the Send
using a scatter-gather technique. xprtrdma mustn't allow the RPC
consumer to re-use the memory where that payload resides before the
Send completes. Otherwise, the new contents of that memory would be
exposed by an HCA retransmit of the Send operation.

So, block RPC completion on Send completion, but only in the case
where a separate file data payload is part of the Send. This
prevents the reuse of that memory while it is still part of a Send
operation without an undue cost to other cases.

Waiting is avoided in the common case because typically the Send
will have completed long before the RPC Reply arrives.

These days, an RPC timeout will trigger a disconnect, which tears
down the QP. The disconnect flushes all waiting Sends. This bounds
the amount of time the reply handler has to wait for a Send
completion.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 853dede38900..4fdeaac6ebe6 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -534,6 +534,11 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
 	for (count = sc->sc_unmap_count; count; ++sge, --count)
 		ib_dma_unmap_page(ia->ri_device,
 				  sge->addr, sge->length, DMA_TO_DEVICE);
+
+	if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
+		smp_mb__after_atomic();
+		wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
+	}
 }
 
 /* Prepare an SGE for the RPC-over-RDMA transport header.
@@ -667,6 +672,8 @@ map_tail:
 
 out:
 	sc->sc_wr.num_sge += sge_no;
+	if (sc->sc_unmap_count)
+		__set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
 	return true;
 
 out_regbuf:
@@ -704,6 +711,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 		return -ENOBUFS;
 	req->rl_sendctx->sc_wr.num_sge = 0;
 	req->rl_sendctx->sc_unmap_count = 0;
+	req->rl_sendctx->sc_req = req;
+	__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
 
 	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
 		return -EIO;
@@ -1305,6 +1314,20 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	if (!list_empty(&req->rl_registered))
 		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
 						    &req->rl_registered);
+
+	/* Ensure that any DMA mapped pages associated with
+	 * the Send of the RPC Call have been unmapped before
+	 * allowing the RPC to complete. This protects argument
+	 * memory not controlled by the RPC client from being
+	 * re-used before we're done with it.
+	 */
+	if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
+		r_xprt->rx_stats.reply_waits_for_send++;
+		out_of_line_wait_on_bit(&req->rl_flags,
+					RPCRDMA_REQ_F_TX_RESOURCES,
+					bit_wait,
+					TASK_UNINTERRUPTIBLE);
+	}
 }
 
 /* Reply handling runs in the poll worker thread. Anything that
@@ -1384,7 +1407,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
 
-	if (list_empty(&req->rl_registered))
+	if (list_empty(&req->rl_registered) &&
+	    !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags))
 		rpcrdma_complete_rqst(rep);
 	else
 		queue_work(rpcrdma_receive_wq, &rep->rr_work);
-- 
cgit 


From 2232df5ece121fd7049ccff95cbb3acfab278d75 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 30 Oct 2017 16:21:57 -0400
Subject: rpcrdma: Remove C structure definitions of XDR data items

Clean up: C-structure style XDR encoding and decoding logic has
been replaced over the past several merge windows on both the
client and server. These data structures are no longer used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4fdeaac6ebe6..45cb5497b37f 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -75,11 +75,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
 
 	/* Maximum Read list size */
 	maxsegs += 2;	/* segment for head and tail buffers */
-	size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+	size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
 
 	/* Minimal Read chunk size */
 	size += sizeof(__be32);	/* segment count */
-	size += sizeof(struct rpcrdma_segment);
+	size += rpcrdma_segment_maxsz * sizeof(__be32);
 	size += sizeof(__be32);	/* list discriminator */
 
 	dprintk("RPC:       %s: max call header size = %u\n",
@@ -102,7 +102,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
 	/* Maximum Write list size */
 	maxsegs += 2;	/* segment for head and tail buffers */
 	size = sizeof(__be32);		/* segment count */
-	size += maxsegs * sizeof(struct rpcrdma_segment);
+	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
 	size += sizeof(__be32);	/* list discriminator */
 
 	dprintk("RPC:       %s: max reply header size = %u\n",
-- 
cgit 


From 62b56a675565a2e40f2cdf50455977448fd87413 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 30 Oct 2017 16:22:14 -0400
Subject: xprtrdma: Update copyright notices

Credit work contributed by Oracle engineers since 2014.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 45cb5497b37f..ed34dc0f144c 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2014-2017 Oracle.  All rights reserved.
  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
-- 
cgit