diff options
Diffstat (limited to 'net/sunrpc/xprtrdma/rpc_rdma.c')
| -rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 960 |
1 files changed, 545 insertions, 415 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index d18614e02b4e..3aac1456e23e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -54,10 +54,6 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a @@ -71,15 +67,13 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); + size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max call header size = %u\n", - __func__, size); return size; } @@ -96,25 +90,29 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - size = sizeof(__be32); /* segment count */ + size += sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ - dprintk("RPC: %s: max reply header size = %u\n", - __func__, size); return size; } -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_set_max_header_sizes - Initialize inline payload sizes + * @ep: endpoint to initialize + * + * The max_inline fields contain the maximum size of an RPC message + * so the marshaling code doesn't have to repeat this calculation + * for every RPC. + */ +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - unsigned int maxsegs = ia->ri_max_segs; - - ia->ri_max_inline_write = cdata->inline_wsize - - rpcrdma_max_call_header_size(maxsegs); - ia->ri_max_inline_read = cdata->inline_rsize - - rpcrdma_max_reply_header_size(maxsegs); + unsigned int maxsegs = ep->re_max_rdma_segs; + + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -129,9 +127,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -142,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ia.ri_max_send_sges) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -159,14 +158,49 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; +} + +/* The client is required to provide a Reply chunk if the maximum + * size of the non-payload part of the RPC Reply is larger than + * the inline threshold. + */ +static bool +rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, + const struct rpc_rqst *rqst) +{ + const struct xdr_buf *buf = &rqst->rq_rcv_buf; + + return (buf->head[0].iov_len + buf->tail[0].iov_len) < + r_xprt->rx_ep->re_max_inline_recv; +} + +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } - return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; + return 0; } -/* Split @vec on page boundaries into SGEs. FMR registers pages, not - * a byte range. Other modes coalesce these SGEs into a single MR - * when they can. +/* Convert @vec to a single SGL element. * * Returns pointer to next available SGE, and bumps the total number * of SGEs consumed. @@ -175,22 +209,11 @@ static struct rpcrdma_mr_seg * rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, unsigned int *n) { - u32 remaining, page_offset; - char *base; - - base = vec->iov_base; - page_offset = offset_in_page(base); - remaining = vec->iov_len; - while (remaining) { - seg->mr_page = NULL; - seg->mr_offset = base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg->mr_len; - base += seg->mr_len; - ++seg; - ++(*n); - page_offset = 0; - } + seg->mr_page = virt_to_page(vec->iov_base); + seg->mr_offset = offset_in_page(vec->iov_base); + seg->mr_len = vec->iov_len; + ++seg; + ++(*n); return seg; } @@ -218,17 +241,8 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_ATOMIC); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; - seg->mr_offset = (char *)page_base; + seg->mr_offset = page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); len -= seg->mr_len; ++ppages; @@ -237,22 +251,11 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - /* When encoding a Read chunk, the tail iovec contains an - * XDR pad and may be omitted. - */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - goto out; - - /* When encoding a Write chunk, some servers need to see an - * extra segment for non-XDR-aligned Write chunks. The upper - * layer provides space in the tail iovec that may be used - * for this purpose. - */ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch || type == rpcrdma_writech) goto out; if (xdrbuf->tail[0].iov_len) - seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); out: if (unlikely(n > RPCRDMA_MAX_SEGS)) @@ -260,40 +263,6 @@ out: return n; } -static inline int -encode_item_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_one; - return 0; -} - -static inline int -encode_item_not_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_zero; - return 0; -} - -static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) -{ - *iptr++ = cpu_to_be32(mr->mr_handle); - *iptr++ = cpu_to_be32(mr->mr_length); - xdr_encode_hyper(iptr, mr->mr_offset); -} - static int encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { @@ -303,7 +272,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mr); + xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); return 0; } @@ -318,11 +287,35 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return -EMSGSIZE; *p++ = xdr_one; /* Item present */ - *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mr); + xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, + mr->mr_offset); return 0; } +static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, + struct rpcrdma_mr **mr) +{ + *mr = rpcrdma_mr_pop(&req->rl_free_mrs); + if (!*mr) { + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + (*mr)->mr_req = req; + } + + rpcrdma_mr_push(*mr, &req->rl_registered); + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + +out_getmr_err: + trace_xprtrdma_nomrs_err(r_xprt, req); + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); + rpcrdma_mrs_refresh(r_xprt); + return ERR_PTR(-EAGAIN); +} + /* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * @@ -337,9 +330,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, * * Only a single @pos value is currently supported. */ -static noinline int -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -347,6 +341,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, unsigned int pos; int nsegs; + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) + goto done; + pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; @@ -357,10 +354,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; @@ -370,6 +366,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nsegs -= mr->mr_nents; } while (nsegs); +done: + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; return 0; } @@ -388,16 +387,21 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * * Only a single Write chunk is currently supported. */ -static noinline int -rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr_seg *seg; struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; + if (wtype != rpcrdma_writech) + goto done; + seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, @@ -405,7 +409,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -414,10 +418,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -429,9 +432,24 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nsegs -= mr->mr_nents; } while (nsegs); + if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, + nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } + /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); +done: + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; return 0; } @@ -447,9 +465,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * Returns zero on success, or a negative errno if a failure occurred. * @xdr is advanced to the next position in the stream. */ -static noinline int -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -457,12 +476,18 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, int nsegs, nchunks; __be32 *segcount; + if (wtype != rpcrdma_replych) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return -EMSGSIZE; + return 0; + } + seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -471,10 +496,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -492,183 +516,267 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return 0; } +static void rpcrdma_sendctx_done(struct kref *kref) +{ + struct rpcrdma_req *req = + container_of(kref, struct rpcrdma_req, rl_kref); + struct rpcrdma_rep *rep = req->rl_reply; + + rpcrdma_complete_rqst(rep); + rep->rr_rxprt->rx_stats.reply_waits_for_send++; +} + /** - * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * rpcrdma_sendctx_unmap - DMA-unmap Send buffer * @sc: sendctx containing SGEs to unmap * */ -void -rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) { - struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; + struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; struct ib_sge *sge; - unsigned int count; + + if (!sc->sc_unmap_count) + return; /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. */ - sge = &sc->sc_sges[2]; - for (count = sc->sc_unmap_count; count; ++sge, --count) - ib_dma_unmap_page(ia->ri_device, - sge->addr, sge->length, DMA_TO_DEVICE); - - if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { - smp_mb__after_atomic(); - wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); - } + for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; + ++sge, --sc->sc_unmap_count) + ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); + + kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); } /* Prepare an SGE for the RPC-over-RDMA transport header. */ -static bool -rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 len) +static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = sc->sc_sges; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; - if (!rpcrdma_dma_map_regbuf(ia, rb)) - goto out_regbuf; sge->addr = rdmab_addr(rb); sge->length = len; sge->lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, - sge->length, DMA_TO_DEVICE); - sc->sc_wr.num_sge++; - return true; - -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); } -/* Prepare the Send SGEs. The head and tail iovec, and each entry - * in the page list, gets its own SGE. +/* The head iovec is straightforward, as it is usually already + * DMA-mapped. Sync the content that has changed. */ -static bool -rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, unsigned int len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; - unsigned int sge_no, page_base, len, remaining; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; struct rpcrdma_regbuf *rb = req->rl_sendbuf; - struct ib_device *device = ia->ri_device; - struct ib_sge *sge = sc->sc_sges; - u32 lkey = ia->ri_pd->local_dma_lkey; - struct page *page, **ppages; - /* The head iovec is straightforward, as it is already - * DMA-mapped. Sync the content that has changed. - */ - if (!rpcrdma_dma_map_regbuf(ia, rb)) - goto out_regbuf; - sge_no = 1; - sge[sge_no].addr = rdmab_addr(rb); - sge[sge_no].length = xdr->head[0].iov_len; - sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, - sge[sge_no].length, DMA_TO_DEVICE); - - /* If there is a Read chunk, the page list is being handled - * via explicit RDMA, and thus is skipped here. However, the - * tail iovec may include an XDR pad for the page list, as - * well as additional content, and may not reside in the - * same page as the head iovec. - */ - if (rtype == rpcrdma_readch) { - len = xdr->tail[0].iov_len; + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) + return false; - /* Do not include the tail if it is only an XDR pad */ - if (len < 4) - goto out; + sge->addr = rdmab_addr(rb); + sge->length = len; + sge->lkey = rdmab_lkey(rb); - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); + return true; +} - /* If the content in the page list is an odd length, - * xdr_write_pages() has added a pad at the beginning - * of the tail iovec. Force the tail's non-pad content - * to land at the next XDR position in the Send message. - */ - page_base += len & 3; - len -= len & 3; - goto map_tail; - } +/* If there is a page list present, DMA map and prepare an + * SGE for each page to be sent. + */ +static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + unsigned int page_base, len, remaining; + struct page **ppages; + struct ib_sge *sge; - /* If there is a page list present, temporarily DMA map - * and prepare an SGE for each page to be sent. - */ - if (xdr->page_len) { - ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining) { - sge_no++; - if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) - goto out_mapping_overflow; - - len = min_t(u32, PAGE_SIZE - page_base, remaining); - sge[sge_no].addr = ib_dma_map_page(device, *ppages, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = lkey; - - sc->sc_unmap_count++; - ppages++; - remaining -= len; - page_base = 0; - } - } + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + sge = &sc->sc_sges[req->rl_wr.num_sge++]; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, + page_base, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; - /* The tail iovec is not always constructed in the same - * page where the head iovec resides (see, for example, - * gss_wrap_req_priv). To neatly accommodate that case, - * DMA map it separately. - */ - if (xdr->tail[0].iov_len) { - page = virt_to_page(xdr->tail[0].iov_base); - page_base = offset_in_page(xdr->tail[0].iov_base); - len = xdr->tail[0].iov_len; + sge->length = len; + sge->lkey = rdmab_lkey(rb); -map_tail: - sge_no++; - sge[sge_no].addr = ib_dma_map_page(device, page, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) - goto out_mapping_err; - sge[sge_no].length = len; - sge[sge_no].lkey = lkey; sc->sc_unmap_count++; + ppages++; + remaining -= len; + page_base = 0; } -out: - sc->sc_wr.num_sge += sge_no; - if (sc->sc_unmap_count) - __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); return true; -out_regbuf: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); +out_mapping_err: + trace_xprtrdma_dma_maperr(sge->addr); return false; +} -out_mapping_overflow: - rpcrdma_unmap_sendctx(sc); - pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); - return false; +/* The tail iovec may include an XDR pad for the page list, + * as well as additional content, and may not reside in the + * same page as the head iovec. + */ +static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, + struct xdr_buf *xdr, + unsigned int page_base, unsigned int len) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct page *page = virt_to_page(xdr->tail[0].iov_base); + + sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) + goto out_mapping_err; + + sge->length = len; + sge->lkey = rdmab_lkey(rb); + ++sc->sc_unmap_count; + return true; out_mapping_err: - rpcrdma_unmap_sendctx(sc); - trace_xprtrdma_dma_maperr(sge[sge_no].addr); + trace_xprtrdma_dma_maperr(sge->addr); return false; } +/* Copy the tail to the end of the head buffer. + */ +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned char *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len + xdr->page_len; + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; +} + +/* Copy pagelist content into the head buffer. + */ +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned int len, page_base, remaining; + struct page **ppages; + unsigned char *src, *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + src = page_address(*ppages); + src += page_base; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + memcpy(dst, src, len); + r_xprt->rx_stats.pullup_copy_count += len; + + ppages++; + dst += len; + remaining -= len; + page_base = 0; + } +} + +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. + * When the head, pagelist, and tail are small, a pull-up copy + * is considerably less costly than DMA mapping the components + * of @xdr. + * + * Assumptions: + * - the caller has already verified that the total length + * of the RPC Call body will fit into @rl_sendbuf. + */ +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (unlikely(xdr->tail[0].iov_len)) + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); + + if (unlikely(xdr->page_len)) + rpcrdma_pullup_pagelist(r_xprt, req, xdr); + + /* The whole RPC message resides in the head iovec now */ + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); +} + +static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + struct kvec *tail = &xdr->tail[0]; + + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; + if (xdr->page_len) + if (!rpcrdma_prepare_pagelist(req, xdr)) + return false; + if (tail->iov_len) + if (!rpcrdma_prepare_tail_iov(req, xdr, + offset_in_page(tail->iov_base), + tail->iov_len)) + return false; + + if (req->rl_sendctx->sc_unmap_count) + kref_get(&req->rl_kref); + return true; +} + +static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) + return false; + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. + */ + + /* Do not include the tail if it is only an XDR pad */ + if (xdr->tail[0].iov_len > 3) { + unsigned int page_base, len; + + /* If the content in the page list is an odd length, + * xdr_write_pages() adds a pad at the beginning of + * the tail iovec. Force the tail's non-pad content to + * land at the next XDR position in the Send message. + */ + page_base = offset_in_page(xdr->tail[0].iov_base); + len = xdr->tail[0].iov_len; + page_base += len & 3; + len -= len & 3; + if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) + return false; + kref_get(&req->rl_kref); + } + + return true; +} + /** * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR * @r_xprt: controlling transport @@ -679,27 +787,54 @@ out_mapping_err: * * Returns 0 on success; otherwise a negative errno is returned. */ -int -rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, u32 hdrlen, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) { - req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + int ret; + + ret = -EAGAIN; + req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) - return -EAGAIN; - req->rl_sendctx->sc_wr.num_sge = 0; + goto out_nosc; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; - __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); - - if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) - return -EIO; - - if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) - return -EIO; + kref_init(&req->rl_kref); + req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; + req->rl_wr.sg_list = req->rl_sendctx->sc_sges; + req->rl_wr.num_sge = 0; + req->rl_wr.opcode = IB_WR_SEND; + + rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen); + + ret = -EIO; + switch (rtype) { + case rpcrdma_noch_pullup: + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_noch_mapped: + if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_readch: + if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_areadch: + break; + default: + goto out_unmap; + } return 0; + +out_unmap: + rpcrdma_sendctx_unmap(req->rl_sendctx); +out_nosc: + trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); + return ret; } /** @@ -727,13 +862,20 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; + struct xdr_buf *buf = &rqst->rq_snd_buf; bool ddp_allowed; __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); - xdr_init_encode(xdr, &req->rl_hdrbuf, - req->rl_rdmabuf->rg_base); + xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), + rqst); /* Fixed header fields */ ret = -EMSGSIZE; @@ -742,14 +884,14 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) goto out_err; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; - *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + *p++ = r_xprt->rx_buf.rb_max_requests; /* When the ULP employs a GSS flavor that guarantees integrity * or privacy, direct data placement of individual data items * is not allowed. */ - ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & - RPCAUTH_AUTH_DATATOUCH); + ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, + &rqst->rq_cred->cr_auth->au_flags); /* * Chunks needed for results? @@ -762,7 +904,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) */ if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; - else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) + else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) && + rpcrdma_nonpayload_inline(r_xprt, rqst)) wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -783,8 +926,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) */ if (rpcrdma_args_inline(r_xprt, rqst)) { *p++ = rdma_msg; - rtype = rpcrdma_noch; - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? + rpcrdma_noch_pullup : rpcrdma_noch_mapped; + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { *p++ = rdma_msg; rtype = rpcrdma_readch; } else { @@ -793,17 +937,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } - /* If this is a retransmit, discard previously registered - * chunks. Very likely the connection has been replaced, - * so these registrations are invalid and unusable. - */ - while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mr *mr; - - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_recycle(mr); - } - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -826,52 +959,65 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. */ - if (rtype != rpcrdma_noch) { - ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); - if (ret) - goto out_err; - } - ret = encode_item_not_present(xdr); + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); if (ret) goto out_err; - - if (wtype == rpcrdma_writech) { - ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); - if (ret) - goto out_err; - } - ret = encode_item_not_present(xdr); + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); if (ret) goto out_err; - - if (wtype != rpcrdma_replych) - ret = encode_item_not_present(xdr); - else - ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); if (ret) goto out_err; - trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); - - ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), - &rqst->rq_snd_buf, rtype); + ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, + buf, rtype); if (ret) goto out_err; + + trace_xprtrdma_marshal(req, rtype, wtype); return 0; out_err: - switch (ret) { - case -EAGAIN: - xprt_wait_for_buffer_space(rqst->rq_xprt); - break; - case -ENOBUFS: - break; - default: - r_xprt->rx_stats.failed_marshal_count++; - } + trace_xprtrdma_marshal_failed(rqst, ret); + r_xprt->rx_stats.failed_marshal_count++; + frwr_reset(req); return ret; } +static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, + struct rpcrdma_buffer *buf, + u32 grant) +{ + buf->rb_credits = grant; + xprt->cwnd = grant << RPC_CWNDSHIFT; +} + +static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); + spin_unlock(&xprt->transport_lock); +} + +/** + * rpcrdma_reset_cwnd - Reset the xprt's congestion window + * @r_xprt: controlling transport instance + * + * Prepare @r_xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ +void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); + xprt->cong = 0; + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); + spin_unlock(&xprt->transport_lock); +} + /** * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs * @rqst: controlling RPC request @@ -911,7 +1057,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -931,8 +1076,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - trace_xprtrdma_fixup_pg(rqst, i, srcp, - copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -964,6 +1107,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf.tail[0].iov_base = srcp; } + if (fixup_copy_count) + trace_xprtrdma_fixup(rqst, fixup_copy_count); return fixup_copy_count; } @@ -976,6 +1121,7 @@ static bool rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; @@ -986,11 +1132,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) p = xdr_inline_decode(xdr, 0); /* Chunk lists */ - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; /* RPC header */ @@ -999,19 +1145,19 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) if (*p != cpu_to_be32(RPC_CALL)) return false; + /* No bc service. */ + if (xprt->bc_serv == NULL) + return false; + /* Now that we are sure this is a backchannel call, * advance to the RPC header. */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - goto out_short; + return true; rpcrdma_bc_receive_call(r_xprt, rep); return true; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ { @@ -1029,10 +1175,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) if (unlikely(!p)) return -EIO; - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - + xdr_decode_rdma_segment(p, &handle, length, &offset); trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1068,7 +1211,7 @@ static int decode_read_list(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (unlikely(*p != xdr_zero)) + if (unlikely(xdr_item_is_present(p))) return -EIO; return 0; } @@ -1087,7 +1230,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (*p == xdr_zero) + if (xdr_item_is_absent(p)) break; if (!first) return -EIO; @@ -1109,7 +1252,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) return -EIO; *length = 0; - if (*p != xdr_zero) + if (xdr_item_is_present(p)) if (decode_write_chunk(xdr, length)) return -EIO; return 0; @@ -1186,29 +1329,47 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } - r_xprt->rx_stats.bad_reply_count++; - return -EREMOTEIO; + return -EIO; +} + +/** + * rpcrdma_unpin_rqst - Release rqst without completing it + * @rep: RPC/RDMA Receive context + * + * This is done when a connection is lost so that a Reply + * can be dropped and its matching Call can be subsequently + * retransmitted on a new connection. + */ +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + req->rl_reply = NULL; + rep->rr_rqst = NULL; + + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->queue_lock); } -/* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. +/** + * rpcrdma_complete_rqst - Pass completed rqst back to RPC + * @rep: RPC/RDMA Receive context + * + * Reconstruct the RPC reply and complete the transaction + * while @rqst is still pinned to ensure the rep, rqst, and + * rq_task pointers remain stable. */ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) { @@ -1217,8 +1378,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; - xprt->reestablish_timeout = 0; - switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1242,61 +1401,25 @@ out: spin_unlock(&xprt->queue_lock); return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; + rqst->rq_task->tk_status = status; + status = 0; goto out; } -void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +static void rpcrdma_reply_done(struct kref *kref) { - /* Invalidate and unmap the data payloads before waking - * the waiting application. This guarantees the memory - * regions are properly fenced from the server before the - * application accesses the data. It also ensures proper - * send flow control: waking the next RPC waits until this - * RPC has relinquished all its Send Queue entries. - */ - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, &req->rl_registered); + struct rpcrdma_req *req = + container_of(kref, struct rpcrdma_req, rl_kref); - /* Ensure that any DMA mapped pages associated with - * the Send of the RPC Call have been unmapped before - * allowing the RPC to complete. This protects argument - * memory not controlled by the RPC client from being - * re-used before we're done with it. - */ - if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { - r_xprt->rx_stats.reply_waits_for_send++; - out_of_line_wait_on_bit(&req->rl_flags, - RPCRDMA_REQ_F_TX_RESOURCES, - bit_wait, - TASK_UNINTERRUPTIBLE); - } + rpcrdma_complete_rqst(req->rl_reply); } -/* Reply handling runs in the poll worker thread. Anything that - * might wait is deferred to a separate workqueue. - */ -void rpcrdma_deferred_completion(struct work_struct *work) -{ - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); - struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - - trace_xprtrdma_defer_cmp(rep); - if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) - frwr_reminv(rep, &req->rl_registered); - rpcrdma_release_rqst(r_xprt, req); - rpcrdma_complete_rqst(rep); -} - -/* Process received RPC/RDMA messages. +/** + * rpcrdma_reply_handler - Process received RPC/RDMA messages + * @rep: Incoming rpcrdma_rep object to process * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. @@ -1311,9 +1434,15 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect. + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, - rep->rr_hdrbuf.head[0].iov_base); + rep->rr_hdrbuf.head[0].iov_base, NULL); p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); if (unlikely(!p)) goto out_shortreply; @@ -1340,40 +1469,41 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > buf->rb_max_requests) - credits = buf->rb_max_requests; - if (buf->rb_credits != credits) { - spin_lock_bh(&xprt->transport_lock); - buf->rb_credits = credits; - xprt->cwnd = credits << RPC_CWNDSHIFT; - spin_unlock_bh(&xprt->transport_lock); - } + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); + if (buf->rb_credits != credits) + rpcrdma_update_cwnd(r_xprt, credits); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); - rpcrdma_recv_buffer_put(req->rl_reply); - } + if (unlikely(req->rl_reply)) + rpcrdma_rep_put(buf, req->rl_reply); req->rl_reply = rep; rep->rr_rqst = rqst; - clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - queue_work(buf->rb_completion_wq, &rep->rr_work); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); + + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + frwr_reminv(rep, &req->rl_registered); + if (!list_empty(&req->rl_registered)) + frwr_unmap_async(r_xprt, req); + /* LocalInv completion will complete the RPC */ + else + kref_put(&req->rl_kref, rpcrdma_reply_done); return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); } |
