summaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/addr.c4
-rw-r--r--net/sunrpc/auth_gss/Makefile2
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c4
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h6
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c231
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c55
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_internal.h7
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c11
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_test.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c1
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c27
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c12
-rw-r--r--net/sunrpc/cache.c77
-rw-r--r--net/sunrpc/clnt.c64
-rw-r--r--net/sunrpc/debugfs.c15
-rw-r--r--net/sunrpc/rpc_pipe.c17
-rw-r--r--net/sunrpc/sched.c6
-rw-r--r--net/sunrpc/stats.c2
-rw-r--r--net/sunrpc/sunrpc.h4
-rw-r--r--net/sunrpc/svc.c357
-rw-r--r--net/sunrpc/svc_xprt.c219
-rw-r--r--net/sunrpc/svcauth.c29
-rw-r--r--net/sunrpc/svcauth_unix.c3
-rw-r--r--net/sunrpc/svcsock.c30
-rw-r--r--net/sunrpc/sysctl.c5
-rw-r--r--net/sunrpc/xdr.c6
-rw-r--r--net/sunrpc/xprt.c9
-rw-r--r--net/sunrpc/xprtmultipath.c17
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c9
-rw-r--r--net/sunrpc/xprtrdma/ib_client.c184
-rw-r--r--net/sunrpc/xprtrdma/module.c18
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c3
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c22
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c10
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c181
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c150
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c112
-rw-r--r--net/sunrpc/xprtrdma/transport.c1
-rw-r--r--net/sunrpc/xprtrdma/verbs.c96
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h5
-rw-r--r--net/sunrpc/xprtsock.c66
44 files changed, 1150 insertions, 935 deletions
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index d435bffc6199..97ff11973c49 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -284,10 +284,10 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
}
if (snprintf(portbuf, sizeof(portbuf),
- ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf))
+ ".%u.%u", port >> 8, port & 0xff) >= (int)sizeof(portbuf))
return NULL;
- if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf))
+ if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) >= sizeof(addrbuf))
return NULL;
return kstrdup(addrbuf, gfp_flags);
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index ad1736d93b76..452f67deebc6 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
-auth_rpcgss-y := auth_gss.o gss_generic_token.o \
+auth_rpcgss-y := auth_gss.o \
gss_mech_switch.o svcauth_gss.o \
gss_rpc_upcall.o gss_rpc_xdr.o trace.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index c7af0220f82f..369310909fc9 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1875,8 +1875,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
/* slack space should prevent this ever happening: */
- if (unlikely(snd_buf->len > snd_buf->buflen))
+ if (unlikely(snd_buf->len > snd_buf->buflen)) {
+ status = -EIO;
goto wrap_failed;
+ }
/* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
* done anyway, so it's safe to put the request on the wire: */
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
index c53b329092d4..4ebc1b7043d9 100644
--- a/net/sunrpc/auth_gss/auth_gss_internal.h
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len)
}
static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest)
{
const void *q;
unsigned int len;
@@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
if (unlikely(q > end || q < p))
return ERR_PTR(-EFAULT);
if (len) {
- dest->data = kmemdup(p, len, GFP_KERNEL);
+ dest->data = kmemdup_noprof(p, len, GFP_KERNEL);
if (unlikely(dest->data == NULL))
return ERR_PTR(-ENOMEM);
} else
@@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
dest->len = len;
return q;
}
+
+#define simple_get_netobj(...) alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__))
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
deleted file mode 100644
index 4a4082bb22ad..000000000000
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * linux/net/sunrpc/gss_generic_token.c
- *
- * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
- *
- * Copyright (c) 2000 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@umich.edu>
- */
-
-/*
- * Copyright 1993 by OpenVision Technologies, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software
- * and its documentation for any purpose is hereby granted without fee,
- * provided that the above copyright notice appears in all copies and
- * that both that copyright notice and this permission notice appear in
- * supporting documentation, and that the name of OpenVision not be used
- * in advertising or publicity pertaining to distribution of the software
- * without specific, written prior permission. OpenVision makes no
- * representations about the suitability of this software for any
- * purpose. It is provided "as is" without express or implied warranty.
- *
- * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
- * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/gss_asn1.h>
-
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
-
-/* TWRITE_STR from gssapiP_generic.h */
-#define TWRITE_STR(ptr, str, len) \
- memcpy((ptr), (char *) (str), (len)); \
- (ptr) += (len);
-
-/* XXXX this code currently makes the assumption that a mech oid will
- never be longer than 127 bytes. This assumption is not inherent in
- the interfaces, so the code can be fixed if the OSI namespace
- balloons unexpectedly. */
-
-/* Each token looks like this:
-
-0x60 tag for APPLICATION 0, SEQUENCE
- (constructed, definite-length)
- <length> possible multiple bytes, need to parse/generate
- 0x06 tag for OBJECT IDENTIFIER
- <moid_length> compile-time constant string (assume 1 byte)
- <moid_bytes> compile-time constant string
- <inner_bytes> the ANY containing the application token
- bytes 0,1 are the token type
- bytes 2,n are the token data
-
-For the purposes of this abstraction, the token "header" consists of
-the sequence tag and length octets, the mech OID DER encoding, and the
-first two inner bytes, which indicate the token type. The token
-"body" consists of everything else.
-
-*/
-
-static int
-der_length_size( int length)
-{
- if (length < (1<<7))
- return 1;
- else if (length < (1<<8))
- return 2;
-#if (SIZEOF_INT == 2)
- else
- return 3;
-#else
- else if (length < (1<<16))
- return 3;
- else if (length < (1<<24))
- return 4;
- else
- return 5;
-#endif
-}
-
-static void
-der_write_length(unsigned char **buf, int length)
-{
- if (length < (1<<7)) {
- *(*buf)++ = (unsigned char) length;
- } else {
- *(*buf)++ = (unsigned char) (der_length_size(length)+127);
-#if (SIZEOF_INT > 2)
- if (length >= (1<<24))
- *(*buf)++ = (unsigned char) (length>>24);
- if (length >= (1<<16))
- *(*buf)++ = (unsigned char) ((length>>16)&0xff);
-#endif
- if (length >= (1<<8))
- *(*buf)++ = (unsigned char) ((length>>8)&0xff);
- *(*buf)++ = (unsigned char) (length&0xff);
- }
-}
-
-/* returns decoded length, or < 0 on failure. Advances buf and
- decrements bufsize */
-
-static int
-der_read_length(unsigned char **buf, int *bufsize)
-{
- unsigned char sf;
- int ret;
-
- if (*bufsize < 1)
- return -1;
- sf = *(*buf)++;
- (*bufsize)--;
- if (sf & 0x80) {
- if ((sf &= 0x7f) > ((*bufsize)-1))
- return -1;
- if (sf > SIZEOF_INT)
- return -1;
- ret = 0;
- for (; sf; sf--) {
- ret = (ret<<8) + (*(*buf)++);
- (*bufsize)--;
- }
- } else {
- ret = sf;
- }
-
- return ret;
-}
-
-/* returns the length of a token, given the mech oid and the body size */
-
-int
-g_token_size(struct xdr_netobj *mech, unsigned int body_size)
-{
- /* set body_size to sequence contents size */
- body_size += 2 + (int) mech->len; /* NEED overflow check */
- return 1 + der_length_size(body_size) + body_size;
-}
-
-EXPORT_SYMBOL_GPL(g_token_size);
-
-/* fills in a buffer with the token header. The buffer is assumed to
- be the right size. buf is advanced past the token header */
-
-void
-g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf)
-{
- *(*buf)++ = 0x60;
- der_write_length(buf, 2 + mech->len + body_size);
- *(*buf)++ = 0x06;
- *(*buf)++ = (unsigned char) mech->len;
- TWRITE_STR(*buf, mech->data, ((int) mech->len));
-}
-
-EXPORT_SYMBOL_GPL(g_make_token_header);
-
-/*
- * Given a buffer containing a token, reads and verifies the token,
- * leaving buf advanced past the token header, and setting body_size
- * to the number of remaining bytes. Returns 0 on success,
- * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
- * mechanism in the token does not match the mech argument. buf and
- * *body_size are left unmodified on error.
- */
-u32
-g_verify_token_header(struct xdr_netobj *mech, int *body_size,
- unsigned char **buf_in, int toksize)
-{
- unsigned char *buf = *buf_in;
- int seqsize;
- struct xdr_netobj toid;
- int ret = 0;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- if (*buf++ != 0x60)
- return G_BAD_TOK_HEADER;
-
- if ((seqsize = der_read_length(&buf, &toksize)) < 0)
- return G_BAD_TOK_HEADER;
-
- if (seqsize != toksize)
- return G_BAD_TOK_HEADER;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- if (*buf++ != 0x06)
- return G_BAD_TOK_HEADER;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- toid.len = *buf++;
-
- if ((toksize-=toid.len) < 0)
- return G_BAD_TOK_HEADER;
- toid.data = buf;
- buf+=toid.len;
-
- if (! g_OID_equal(&toid, mech))
- ret = G_WRONG_MECH;
-
- /* G_WRONG_MECH is not returned immediately because it's more important
- to return G_BAD_TOK_HEADER if the token header is in fact bad */
-
- if ((toksize-=2) < 0)
- return G_BAD_TOK_HEADER;
-
- if (ret)
- return ret;
-
- *buf_in = buf;
- *body_size = toksize;
-
- return ret;
-}
-
-EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d2b02710ab07..9a27201638e2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -442,35 +442,6 @@ encryptor(struct scatterlist *sg, void *data)
return 0;
}
-int
-gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
- int offset, struct page **pages)
-{
- int ret;
- struct encryptor_desc desc;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
-
- memset(desc.iv, 0, sizeof(desc.iv));
- desc.req = req;
- desc.pos = offset;
- desc.outbuf = buf;
- desc.pages = pages;
- desc.fragno = 0;
- desc.fraglen = 0;
-
- sg_init_table(desc.infrags, 4);
- sg_init_table(desc.outfrags, 4);
-
- ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc);
- skcipher_request_zero(req);
- return ret;
-}
-
struct decryptor_desc {
u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
struct skcipher_request *req;
@@ -525,32 +496,6 @@ decryptor(struct scatterlist *sg, void *data)
return 0;
}
-int
-gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
- int offset)
-{
- int ret;
- struct decryptor_desc desc;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- /* XXXJBF: */
- BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
-
- memset(desc.iv, 0, sizeof(desc.iv));
- desc.req = req;
- desc.fragno = 0;
- desc.fraglen = 0;
-
- sg_init_table(desc.frags, 4);
-
- ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
- skcipher_request_zero(req);
- return ret;
-}
-
/*
* This function makes the assumption that it was ultimately called
* from gss_wrap().
diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h
index 3afd4065bf3d..a47e9ec228a5 100644
--- a/net/sunrpc/auth_gss/gss_krb5_internal.h
+++ b/net/sunrpc/auth_gss/gss_krb5_internal.h
@@ -172,13 +172,6 @@ u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in,
int xdr_extend_head(struct xdr_buf *buf, unsigned int base,
unsigned int shiftlen);
-int gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm,
- struct xdr_buf *outbuf, int offset,
- struct page **pages);
-
-int gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm,
- struct xdr_buf *inbuf, int offset);
-
u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
struct xdr_buf *buf, struct page **pages);
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 06d8ee0db000..4eb19c3a54c7 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -168,7 +168,7 @@ static int krb5_DK(const struct gss_krb5_enctype *gk5e,
goto err_return;
blocksize = crypto_sync_skcipher_blocksize(cipher);
if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len))
- goto err_return;
+ goto err_free_cipher;
ret = -ENOMEM;
inblockdata = kmalloc(blocksize, gfp_mask);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 64cff717c3d9..3366505bc669 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -398,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
u64 seq_send64;
int keylen;
u32 time32;
+ int ret;
p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
if (IS_ERR(p))
@@ -450,8 +451,16 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
}
ctx->mech_used.len = gss_kerberos_mech.gm_oid.len;
- return gss_krb5_import_ctx_v2(ctx, gfp_mask);
+ ret = gss_krb5_import_ctx_v2(ctx, gfp_mask);
+ if (ret) {
+ p = ERR_PTR(ret);
+ goto out_free;
+ }
+ return 0;
+
+out_free:
+ kfree(ctx->mech_used.data);
out_err:
return PTR_ERR(p);
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c
index 85625e3f3814..a5bff02cd7ba 100644
--- a/net/sunrpc/auth_gss/gss_krb5_test.c
+++ b/net/sunrpc/auth_gss/gss_krb5_test.c
@@ -17,7 +17,7 @@
#include "gss_krb5_internal.h"
-MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
struct gss_krb5_test_param {
const char *desc;
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index fae632da1058..c84d0cf61980 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/oid_registry.h>
#include <linux/sunrpc/msg_prot.h>
-#include <linux/sunrpc/gss_asn1.h>
#include <linux/sunrpc/auth_gss.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index d79f12c2550a..cb32ab9a8395 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
if (!creds) {
- kfree(oa->data);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto free_oa;
}
oa->data[0].option.data = CREDS_VALUE;
@@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
/* option buffer */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto free_creds;
+ }
length = be32_to_cpup(p);
p = xdr_inline_decode(xdr, length);
- if (unlikely(p == NULL))
- return -ENOSPC;
+ if (unlikely(p == NULL)) {
+ err = -ENOSPC;
+ goto free_creds;
+ }
if (length == sizeof(CREDS_VALUE) &&
memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
/* We have creds here. parse them */
err = gssx_dec_linux_creds(xdr, creds);
if (err)
- return err;
+ goto free_creds;
oa->data[0].value.len = 1; /* presence */
} else {
/* consume uninteresting buffer */
err = gssx_dec_buffer(xdr, &dummy);
if (err)
- return err;
+ goto free_creds;
}
}
return 0;
+
+free_creds:
+ kfree(creds);
+free_oa:
+ kfree(oa->data);
+ oa->data = NULL;
+ return err;
}
static int gssx_dec_status(struct xdr_stream *xdr,
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 24de94184700..73a90ad873fb 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1033,17 +1033,11 @@ null_verifier:
static void gss_free_in_token_pages(struct gssp_in_token *in_token)
{
- u32 inlen;
int i;
i = 0;
- inlen = in_token->page_len;
- while (inlen) {
- if (in_token->pages[i])
- put_page(in_token->pages[i]);
- inlen -= inlen > PAGE_SIZE ? PAGE_SIZE : inlen;
- }
-
+ while (in_token->pages[i])
+ put_page(in_token->pages[i++]);
kfree(in_token->pages);
in_token->pages = NULL;
}
@@ -1075,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
goto out_denied_free;
pages = DIV_ROUND_UP(inlen, PAGE_SIZE);
- in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL);
+ in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
if (!in_token->pages)
goto out_denied_free;
in_token->page_base = 0;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 95ff74706104..7ce5e28a6c03 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -281,21 +281,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
return rv;
}
-/*
- * This is the generic cache management routine for all
- * the authentication caches.
- * It checks the currency of a cache item and will (later)
- * initiate an upcall to fill it if needed.
- *
- *
- * Returns 0 if the cache_head can be used, or cache_puts it and returns
- * -EAGAIN if upcall is pending and request has been queued
- * -ETIMEDOUT if upcall failed or request could not be queue or
- * upcall completed but item is still invalid (implying that
- * the cache item has been replaced with a newer one).
- * -ENOENT if cache entry was negative
- */
-int cache_check(struct cache_detail *detail,
+int cache_check_rcu(struct cache_detail *detail,
struct cache_head *h, struct cache_req *rqstp)
{
int rv;
@@ -336,6 +322,31 @@ int cache_check(struct cache_detail *detail,
rv = -ETIMEDOUT;
}
}
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(cache_check_rcu);
+
+/*
+ * This is the generic cache management routine for all
+ * the authentication caches.
+ * It checks the currency of a cache item and will (later)
+ * initiate an upcall to fill it if needed.
+ *
+ *
+ * Returns 0 if the cache_head can be used, or cache_puts it and returns
+ * -EAGAIN if upcall is pending and request has been queued
+ * -ETIMEDOUT if upcall failed or request could not be queue or
+ * upcall completed but item is still invalid (implying that
+ * the cache item has been replaced with a newer one).
+ * -ENOENT if cache entry was negative
+ */
+int cache_check(struct cache_detail *detail,
+ struct cache_head *h, struct cache_req *rqstp)
+{
+ int rv;
+
+ rv = cache_check_rcu(detail, h, rqstp);
if (rv)
cache_put(h, detail);
return rv;
@@ -731,11 +742,10 @@ static bool cache_defer_req(struct cache_req *req, struct cache_head *item)
static void cache_revisit_request(struct cache_head *item)
{
struct cache_deferred_req *dreq;
- struct list_head pending;
struct hlist_node *tmp;
int hash = DFR_HASH(item);
+ LIST_HEAD(pending);
- INIT_LIST_HEAD(&pending);
spin_lock(&cache_defer_lock);
hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash)
@@ -756,10 +766,8 @@ static void cache_revisit_request(struct cache_head *item)
void cache_clean_deferred(void *owner)
{
struct cache_deferred_req *dreq, *tmp;
- struct list_head pending;
+ LIST_HEAD(pending);
-
- INIT_LIST_HEAD(&pending);
spin_lock(&cache_defer_lock);
list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) {
@@ -1085,9 +1093,8 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch)
{
struct cache_queue *cq, *tmp;
struct cache_request *cr;
- struct list_head dequeued;
+ LIST_HEAD(dequeued);
- INIT_LIST_HEAD(&dequeued);
spin_lock(&queue_lock);
list_for_each_entry_safe(cq, tmp, &detail->queue, list)
if (!cq->reader) {
@@ -1431,15 +1438,11 @@ static int c_show(struct seq_file *m, void *p)
seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
kref_read(&cp->ref), cp->flags);
- cache_get(cp);
- if (cache_check(cd, cp, NULL))
- /* cache_check does a cache_put on failure */
+
+ if (cache_check_rcu(cd, cp, NULL))
+ seq_puts(m, "# ");
+ else if (cache_is_expired(cd, cp))
seq_puts(m, "# ");
- else {
- if (cache_is_expired(cd, cp))
- seq_puts(m, "# ");
- cache_put(cp, cd);
- }
return cd->cache_show(m, cd, cp);
}
@@ -1596,7 +1599,6 @@ static int cache_release_procfs(struct inode *inode, struct file *filp)
}
static const struct proc_ops cache_channel_proc_ops = {
- .proc_lseek = no_llseek,
.proc_read = cache_read_procfs,
.proc_write = cache_write_procfs,
.proc_poll = cache_poll_procfs,
@@ -1662,7 +1664,6 @@ static const struct proc_ops cache_flush_proc_ops = {
.proc_read = read_flush_procfs,
.proc_write = write_flush_procfs,
.proc_release = release_flush_procfs,
- .proc_lseek = no_llseek,
};
static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1673,12 +1674,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd)
}
}
-#ifdef CONFIG_PROC_FS
static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
{
struct proc_dir_entry *p;
struct sunrpc_net *sn;
+ if (!IS_ENABLED(CONFIG_PROC_FS))
+ return 0;
+
sn = net_generic(net, sunrpc_net_id);
cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
if (cd->procfs == NULL)
@@ -1706,12 +1709,6 @@ out_nomem:
remove_cache_proc_entries(cd);
return -ENOMEM;
}
-#else /* CONFIG_PROC_FS */
-static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
-{
- return 0;
-}
-#endif
void __init cache_initialize(void)
{
@@ -1815,7 +1812,6 @@ static int cache_release_pipefs(struct inode *inode, struct file *filp)
const struct file_operations cache_file_operations_pipefs = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = cache_read_pipefs,
.write = cache_write_pipefs,
.poll = cache_poll_pipefs,
@@ -1881,7 +1877,6 @@ const struct file_operations cache_flush_operations_pipefs = {
.read = read_flush_pipefs,
.write = write_flush_pipefs,
.release = release_flush_pipefs,
- .llseek = no_llseek,
};
int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cda0935a68c9..2fe88ea79a70 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -48,13 +48,8 @@
# define RPCDBG_FACILITY RPCDBG_CALL
#endif
-/*
- * All RPC clients are linked into this list
- */
-
static DECLARE_WAIT_QUEUE_HEAD(destroy_wait);
-
static void call_start(struct rpc_task *task);
static void call_reserve(struct rpc_task *task);
static void call_reserveresult(struct rpc_task *task);
@@ -405,7 +400,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
clnt->cl_maxproc = version->nrprocs;
clnt->cl_prog = args->prognumber ? : program->number;
clnt->cl_vers = version->number;
- clnt->cl_stats = program->stats;
+ clnt->cl_stats = args->stats ? : program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects);
err = -ENOMEM;
@@ -546,7 +541,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
.connect_timeout = args->connect_timeout,
.reconnect_timeout = args->reconnect_timeout,
};
- char servername[48];
+ char servername[RPC_MAXNETNAMELEN];
struct rpc_clnt *clnt;
int i;
@@ -691,6 +686,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)
.version = clnt->cl_vers,
.authflavor = clnt->cl_auth->au_flavor,
.cred = clnt->cl_cred,
+ .stats = clnt->cl_stats,
};
return __rpc_clone_client(&args, clnt);
}
@@ -713,6 +709,7 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
.version = clnt->cl_vers,
.authflavor = flavor,
.cred = clnt->cl_cred,
+ .stats = clnt->cl_stats,
};
return __rpc_clone_client(&args, clnt);
}
@@ -961,12 +958,17 @@ void rpc_shutdown_client(struct rpc_clnt *clnt)
trace_rpc_clnt_shutdown(clnt);
+ clnt->cl_shutdown = 1;
while (!list_empty(&clnt->cl_tasks)) {
rpc_killall_tasks(clnt);
wait_event_timeout(destroy_wait,
list_empty(&clnt->cl_tasks), 1*HZ);
}
+ /* wait for tasks still in workqueue or waitqueue */
+ wait_event_timeout(destroy_wait,
+ atomic_read(&clnt->cl_task_count) == 0, 1 * HZ);
+
rpc_release_client(clnt);
}
EXPORT_SYMBOL_GPL(rpc_shutdown_client);
@@ -1068,6 +1070,8 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
.version = vers,
.authflavor = old->cl_auth->au_flavor,
.cred = old->cl_cred,
+ .stats = old->cl_stats,
+ .timeout = old->cl_timeout,
};
struct rpc_clnt *clnt;
int err;
@@ -1140,6 +1144,7 @@ void rpc_task_release_client(struct rpc_task *task)
list_del(&task->tk_task);
spin_unlock(&clnt->cl_lock);
task->tk_client = NULL;
+ atomic_dec(&clnt->cl_task_count);
rpc_release_client(clnt);
}
@@ -1190,10 +1195,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_flags |= RPC_TASK_TIMEOUT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
- /* Add to the client's list of all tasks */
- spin_lock(&clnt->cl_lock);
- list_add_tail(&task->tk_task, &clnt->cl_tasks);
- spin_unlock(&clnt->cl_lock);
+ atomic_inc(&clnt->cl_task_count);
}
static void
@@ -1788,9 +1790,14 @@ call_reserveresult(struct rpc_task *task)
if (status >= 0) {
if (task->tk_rqstp) {
task->tk_action = call_refresh;
+
+ /* Add to the client's list of all tasks */
+ spin_lock(&task->tk_client->cl_lock);
+ if (list_empty(&task->tk_task))
+ list_add_tail(&task->tk_task, &task->tk_client->cl_tasks);
+ spin_unlock(&task->tk_client->cl_lock);
return;
}
-
rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1855,13 +1862,13 @@ call_refreshresult(struct rpc_task *task)
fallthrough;
case -EAGAIN:
status = -EACCES;
- fallthrough;
- case -EKEYEXPIRED:
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
trace_rpc_retry_refresh_status(task);
return;
+ case -EKEYEXPIRED:
+ break;
case -ENOMEM:
rpc_delay(task, HZ >> 4);
return;
@@ -1889,12 +1896,6 @@ call_allocate(struct rpc_task *task)
if (req->rq_buffer)
return;
- if (proc->p_proc != 0) {
- BUG_ON(proc->p_arglen == 0);
- if (proc->p_decode != NULL)
- BUG_ON(proc->p_replen == 0);
- }
-
/*
* Calculate the size (in quads) of the RPC call
* and reply headers, and convert both values
@@ -2322,12 +2323,13 @@ call_transmit_status(struct rpc_task *task)
task->tk_action = call_transmit;
task->tk_status = 0;
break;
- case -ECONNREFUSED:
case -EHOSTDOWN:
case -ENETDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
case -EPERM:
+ break;
+ case -ECONNREFUSED:
if (RPC_IS_SOFTCONN(task)) {
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt,
@@ -2695,8 +2697,19 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr)
goto out_msg_denied;
error = rpcauth_checkverf(task, xdr);
- if (error)
+ if (error) {
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ if (!test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) {
+ rpcauth_invalcred(task);
+ if (!task->tk_cred_retry)
+ goto out_err;
+ task->tk_cred_retry--;
+ trace_rpc__stale_creds(task);
+ return -EKEYREJECTED;
+ }
goto out_verifier;
+ }
p = xdr_inline_decode(xdr, sizeof(*p));
if (!p)
@@ -3314,8 +3327,11 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static void rpc_show_header(void)
+static void rpc_show_header(struct rpc_clnt *clnt)
{
+ printk(KERN_INFO "clnt[%pISpc] RPC tasks[%d]\n",
+ (struct sockaddr *)&clnt->cl_xprt->addr,
+ atomic_read(&clnt->cl_task_count));
printk(KERN_INFO "-pid- flgs status -client- --rqstp- "
"-timeout ---ops--\n");
}
@@ -3347,7 +3363,7 @@ void rpc_show_tasks(struct net *net)
spin_lock(&clnt->cl_lock);
list_for_each_entry(task, &clnt->cl_tasks, tk_task) {
if (!header) {
- rpc_show_header();
+ rpc_show_header(clnt);
header++;
}
rpc_show_task(clnt, task);
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index a176d5a0b0ee..32417db340de 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -74,6 +74,9 @@ tasks_stop(struct seq_file *f, void *v)
{
struct rpc_clnt *clnt = f->private;
spin_unlock(&clnt->cl_lock);
+ seq_printf(f, "clnt[%pISpc] RPC tasks[%d]\n",
+ (struct sockaddr *)&clnt->cl_xprt->addr,
+ atomic_read(&clnt->cl_task_count));
}
static const struct seq_operations tasks_seq_operations = {
@@ -179,6 +182,18 @@ xprt_info_show(struct seq_file *f, void *v)
seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]);
seq_printf(f, "state: 0x%lx\n", xprt->state);
+ seq_printf(f, "netns: %u\n", xprt->xprt_net->ns.inum);
+
+ if (xprt->ops->get_srcaddr) {
+ int ret, buflen;
+ char buf[INET6_ADDRSTRLEN];
+
+ buflen = ARRAY_SIZE(buf);
+ ret = xprt->ops->get_srcaddr(xprt, buf, buflen);
+ if (ret < 0)
+ ret = sprintf(buf, "<closed>");
+ seq_printf(f, "saddr: %.*s\n", ret, buf);
+ }
return 0;
}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index dcc2b4f49e77..eadc00410ebc 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -385,7 +385,6 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static const struct file_operations rpc_pipe_fops = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = rpc_pipe_read,
.write = rpc_pipe_write,
.poll = rpc_pipe_poll,
@@ -631,7 +630,7 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
const char *name)
{
- struct qstr q = QSTR_INIT(name, strlen(name));
+ struct qstr q = QSTR(name);
struct dentry *dentry = d_hash_and_lookup(parent, &q);
if (!dentry) {
dentry = d_alloc(parent, &q);
@@ -1191,8 +1190,7 @@ static const struct rpc_filelist files[] = {
struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
const unsigned char *dir_name)
{
- struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name));
- return d_hash_and_lookup(sb->s_root, &dir);
+ return d_hash_and_lookup(sb->s_root, &QSTR(dir_name));
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
@@ -1301,11 +1299,9 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
struct dentry *gssd_dentry;
struct dentry *clnt_dentry = NULL;
struct dentry *pipe_dentry = NULL;
- struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name,
- strlen(files[RPCAUTH_gssd].name));
/* We should never get this far if "gssd" doesn't exist */
- gssd_dentry = d_hash_and_lookup(root, &q);
+ gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name));
if (!gssd_dentry)
return ERR_PTR(-ENOENT);
@@ -1315,9 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
goto out;
}
- q.name = gssd_dummy_clnt_dir[0].name;
- q.len = strlen(gssd_dummy_clnt_dir[0].name);
- clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
+ clnt_dentry = d_hash_and_lookup(gssd_dentry,
+ &QSTR(gssd_dummy_clnt_dir[0].name));
if (!clnt_dentry) {
__rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
pipe_dentry = ERR_PTR(-ENOENT);
@@ -1490,7 +1485,7 @@ int register_rpc_pipefs(void)
rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
sizeof(struct rpc_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
init_once);
if (!rpc_inode_cachep)
return -ENOMEM;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6debf4fd42d4..9b45fbdc90ca 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -369,8 +369,10 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule);
queue_work(wq, &task->u.tk_work);
- } else
+ } else {
+ smp_mb__after_atomic();
wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
+ }
}
/*
@@ -862,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task)
if (!rpc_task_set_rpc_status(task, -ERESTARTSYS))
return;
trace_rpc_task_signalled(task, task->tk_action);
- set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
- smp_mb__after_atomic();
queue = READ_ONCE(task->tk_waitqueue);
if (queue)
rpc_wake_up_queued_task(queue, task);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 65fc1297c6df..383860cb1d5b 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister);
struct proc_dir_entry *
svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops)
{
- return do_register(net, statp->program->pg_name, statp, proc_ops);
+ return do_register(net, statp->program->pg_name, net, proc_ops);
}
EXPORT_SYMBOL_GPL(svc_proc_register);
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index d4a362c9e4b3..e3c6e3b63f0b 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -36,7 +36,11 @@ static inline int sock_is_loopback(struct sock *sk)
return loopback;
}
+struct svc_serv;
+struct svc_rqst;
int rpc_clients_notifier_register(void);
void rpc_clients_notifier_unregister(void);
void auth_domain_cleanup(void);
+void svc_sock_update_bufs(struct svc_serv *serv);
+enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp);
#endif /* _NET_SUNRPC_SUNRPC_H */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b969e505c7b7..e7f9c295d13c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -32,6 +32,7 @@
#include <trace/events/sunrpc.h>
#include "fail.h"
+#include "sunrpc.h"
#define RPCDBG_FACILITY RPCDBG_SVCDSP
@@ -72,57 +73,100 @@ static struct svc_pool_map svc_pool_map = {
static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
static int
-param_set_pool_mode(const char *val, const struct kernel_param *kp)
+__param_set_pool_mode(const char *val, struct svc_pool_map *m)
{
- int *ip = (int *)kp->arg;
- struct svc_pool_map *m = &svc_pool_map;
- int err;
+ int err, mode;
mutex_lock(&svc_pool_map_mutex);
- err = -EBUSY;
- if (m->count)
- goto out;
-
err = 0;
if (!strncmp(val, "auto", 4))
- *ip = SVC_POOL_AUTO;
+ mode = SVC_POOL_AUTO;
else if (!strncmp(val, "global", 6))
- *ip = SVC_POOL_GLOBAL;
+ mode = SVC_POOL_GLOBAL;
else if (!strncmp(val, "percpu", 6))
- *ip = SVC_POOL_PERCPU;
+ mode = SVC_POOL_PERCPU;
else if (!strncmp(val, "pernode", 7))
- *ip = SVC_POOL_PERNODE;
+ mode = SVC_POOL_PERNODE;
else
err = -EINVAL;
+ if (err)
+ goto out;
+
+ if (m->count == 0)
+ m->mode = mode;
+ else if (mode != m->mode)
+ err = -EBUSY;
out:
mutex_unlock(&svc_pool_map_mutex);
return err;
}
static int
-param_get_pool_mode(char *buf, const struct kernel_param *kp)
+param_set_pool_mode(const char *val, const struct kernel_param *kp)
+{
+ struct svc_pool_map *m = kp->arg;
+
+ return __param_set_pool_mode(val, m);
+}
+
+int sunrpc_set_pool_mode(const char *val)
+{
+ return __param_set_pool_mode(val, &svc_pool_map);
+}
+EXPORT_SYMBOL(sunrpc_set_pool_mode);
+
+/**
+ * sunrpc_get_pool_mode - get the current pool_mode for the host
+ * @buf: where to write the current pool_mode
+ * @size: size of @buf
+ *
+ * Grab the current pool_mode from the svc_pool_map and write
+ * the resulting string to @buf. Returns the length the string would
+ * have had if @size were large enough, excluding the NUL (as snprintf() does).
+ */
+int
+sunrpc_get_pool_mode(char *buf, size_t size)
{
- int *ip = (int *)kp->arg;
+ struct svc_pool_map *m = &svc_pool_map;
- switch (*ip)
+ switch (m->mode)
{
case SVC_POOL_AUTO:
- return sysfs_emit(buf, "auto\n");
+ return snprintf(buf, size, "auto");
case SVC_POOL_GLOBAL:
- return sysfs_emit(buf, "global\n");
+ return snprintf(buf, size, "global");
case SVC_POOL_PERCPU:
- return sysfs_emit(buf, "percpu\n");
+ return snprintf(buf, size, "percpu");
case SVC_POOL_PERNODE:
- return sysfs_emit(buf, "pernode\n");
+ return snprintf(buf, size, "pernode");
default:
- return sysfs_emit(buf, "%d\n", *ip);
+ return snprintf(buf, size, "%d", m->mode);
}
}
+EXPORT_SYMBOL(sunrpc_get_pool_mode);
+
+static int
+param_get_pool_mode(char *buf, const struct kernel_param *kp)
+{
+ char str[16];
+ int len;
+
+ len = sunrpc_get_pool_mode(str, ARRAY_SIZE(str));
+
+ /* Ensure we have room for newline and NUL */
+ len = min_t(int, len, ARRAY_SIZE(str) - 2);
+
+ /* tack on the newline */
+ str[len] = '\n';
+ str[len + 1] = '\0';
+
+ return sysfs_emit(buf, "%s", str);
+}
module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
- &svc_pool_map.mode, 0644);
+ &svc_pool_map, 0644);
/*
* Detect best pool mapping mode heuristically,
@@ -250,10 +294,8 @@ svc_pool_map_get(void)
int npools = -1;
mutex_lock(&svc_pool_map_mutex);
-
if (m->count++) {
mutex_unlock(&svc_pool_map_mutex);
- WARN_ON_ONCE(m->npools <= 1);
return m->npools;
}
@@ -275,32 +317,21 @@ svc_pool_map_get(void)
m->mode = SVC_POOL_GLOBAL;
}
m->npools = npools;
-
- if (npools == 1)
- /* service is unpooled, so doesn't hold a reference */
- m->count--;
-
mutex_unlock(&svc_pool_map_mutex);
return npools;
}
/*
- * Drop a reference to the global map of cpus to pools, if
- * pools were in use, i.e. if npools > 1.
+ * Drop a reference to the global map of cpus to pools.
* When the last reference is dropped, the map data is
- * freed; this allows the sysadmin to change the pool
- * mode using the pool_mode module option without
- * rebooting or re-loading sunrpc.ko.
+ * freed; this allows the sysadmin to change the pool mode.
*/
static void
-svc_pool_map_put(int npools)
+svc_pool_map_put(void)
{
struct svc_pool_map *m = &svc_pool_map;
- if (npools <= 1)
- return;
mutex_lock(&svc_pool_map_mutex);
-
if (!--m->count) {
kfree(m->to_pool);
m->to_pool = NULL;
@@ -308,7 +339,6 @@ svc_pool_map_put(int npools)
m->pool_to = NULL;
m->npools = 0;
}
-
mutex_unlock(&svc_pool_map_mutex);
}
@@ -388,7 +418,7 @@ struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv)
return &serv->sv_pools[pidx % serv->sv_nrpools];
}
-int svc_rpcb_setup(struct svc_serv *serv, struct net *net)
+static int svc_rpcb_setup(struct svc_serv *serv, struct net *net)
{
int err;
@@ -400,7 +430,6 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net)
svc_unregister(serv, net);
return 0;
}
-EXPORT_SYMBOL_GPL(svc_rpcb_setup);
void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net)
{
@@ -411,10 +440,11 @@ EXPORT_SYMBOL_GPL(svc_rpcb_cleanup);
static int svc_uses_rpcbind(struct svc_serv *serv)
{
- struct svc_program *progp;
- unsigned int i;
+ unsigned int p, i;
+
+ for (p = 0; p < serv->sv_nprogs; p++) {
+ struct svc_program *progp = &serv->sv_programs[p];
- for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
@@ -451,8 +481,8 @@ __svc_init_bc(struct svc_serv *serv)
* Create an RPC service
*/
static struct svc_serv *
-__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- int (*threadfn)(void *data))
+__svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats,
+ unsigned int bufsize, int npools, int (*threadfn)(void *data))
{
struct svc_serv *serv;
unsigned int vers;
@@ -462,25 +492,27 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
serv->sv_name = prog->pg_name;
- serv->sv_program = prog;
- serv->sv_stats = prog->pg_stats;
+ serv->sv_programs = prog;
+ serv->sv_nprogs = nprogs;
+ serv->sv_stats = stats;
if (bufsize > RPCSVC_MAXPAYLOAD)
bufsize = RPCSVC_MAXPAYLOAD;
serv->sv_max_payload = bufsize? bufsize : 4096;
serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
serv->sv_threadfn = threadfn;
xdrsize = 0;
- while (prog) {
- prog->pg_lovers = prog->pg_nvers-1;
- for (vers=0; vers<prog->pg_nvers ; vers++)
- if (prog->pg_vers[vers]) {
- prog->pg_hivers = vers;
- if (prog->pg_lovers > vers)
- prog->pg_lovers = vers;
- if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
- xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+ for (i = 0; i < nprogs; i++) {
+ struct svc_program *progp = &prog[i];
+
+ progp->pg_lovers = progp->pg_nvers-1;
+ for (vers = 0; vers < progp->pg_nvers ; vers++)
+ if (progp->pg_vers[vers]) {
+ progp->pg_hivers = vers;
+ if (progp->pg_lovers > vers)
+ progp->pg_lovers = vers;
+ if (progp->pg_vers[vers]->vs_xdrsize > xdrsize)
+ xdrsize = progp->pg_vers[vers]->vs_xdrsize;
}
- prog = prog->pg_next;
}
serv->sv_xdrsize = xdrsize;
INIT_LIST_HEAD(&serv->sv_tempsocks);
@@ -529,31 +561,36 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize,
int (*threadfn)(void *data))
{
- return __svc_create(prog, bufsize, 1, threadfn);
+ return __svc_create(prog, 1, NULL, bufsize, 1, threadfn);
}
EXPORT_SYMBOL_GPL(svc_create);
/**
* svc_create_pooled - Create an RPC service with pooled threads
- * @prog: the RPC program the new service will handle
+ * @prog: Array of RPC programs the new service will handle
+ * @nprogs: Number of programs in the array
+ * @stats: the stats struct if desired
* @bufsize: maximum message size for @prog
* @threadfn: a function to service RPC requests for @prog
*
* Returns an instantiated struct svc_serv object or NULL.
*/
struct svc_serv *svc_create_pooled(struct svc_program *prog,
+ unsigned int nprogs,
+ struct svc_stat *stats,
unsigned int bufsize,
int (*threadfn)(void *data))
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, threadfn);
+ serv = __svc_create(prog, nprogs, stats, bufsize, npools, threadfn);
if (!serv)
goto out_err;
+ serv->sv_is_pooled = true;
return serv;
out_err:
- svc_pool_map_put(npools);
+ svc_pool_map_put();
return NULL;
}
EXPORT_SYMBOL_GPL(svc_create_pooled);
@@ -570,20 +607,21 @@ svc_destroy(struct svc_serv **servp)
*servp = NULL;
- dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name);
+ dprintk("svc: svc_destroy(%s)\n", serv->sv_programs->pg_name);
timer_shutdown_sync(&serv->sv_temptimer);
/*
* Remaining transports at this point are not expected.
*/
WARN_ONCE(!list_empty(&serv->sv_permsocks),
- "SVC: permsocks remain for %s\n", serv->sv_program->pg_name);
+ "SVC: permsocks remain for %s\n", serv->sv_programs->pg_name);
WARN_ONCE(!list_empty(&serv->sv_tempsocks),
- "SVC: tempsocks remain for %s\n", serv->sv_program->pg_name);
+ "SVC: tempsocks remain for %s\n", serv->sv_programs->pg_name);
cache_clean_deferred(serv);
- svc_pool_map_put(serv->sv_nrpools);
+ if (serv->sv_is_pooled)
+ svc_pool_map_put();
for (i = 0; i < serv->sv_nrpools; i++) {
struct svc_pool *pool = &serv->sv_pools[i];
@@ -613,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
if (pages > RPCSVC_MAXPAGES)
pages = RPCSVC_MAXPAGES;
- ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages,
- rqstp->rq_pages);
+ ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages,
+ rqstp->rq_pages);
return ret == pages;
}
@@ -631,8 +669,21 @@ svc_release_buffer(struct svc_rqst *rqstp)
put_page(rqstp->rq_pages[i]);
}
-struct svc_rqst *
-svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
+static void
+svc_rqst_free(struct svc_rqst *rqstp)
+{
+ folio_batch_release(&rqstp->rq_fbatch);
+ svc_release_buffer(rqstp);
+ if (rqstp->rq_scratch_page)
+ put_page(rqstp->rq_scratch_page);
+ kfree(rqstp->rq_resp);
+ kfree(rqstp->rq_argp);
+ kfree(rqstp->rq_auth_data);
+ kfree_rcu(rqstp, rq_rcu_head);
+}
+
+static struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
{
struct svc_rqst *rqstp;
@@ -660,27 +711,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
goto out_enomem;
- return rqstp;
-out_enomem:
- svc_rqst_free(rqstp);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(svc_rqst_alloc);
-
-static struct svc_rqst *
-svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
-{
- struct svc_rqst *rqstp;
+ rqstp->rq_err = -EAGAIN; /* No error yet */
- rqstp = svc_rqst_alloc(serv, pool, node);
- if (!rqstp)
- return ERR_PTR(-ENOMEM);
-
- spin_lock_bh(&serv->sv_lock);
serv->sv_nrthreads += 1;
- spin_unlock_bh(&serv->sv_lock);
-
- atomic_inc(&pool->sp_nrthreads);
+ pool->sp_nrthreads += 1;
/* Protected by whatever lock the service uses when calling
* svc_set_num_threads()
@@ -688,6 +722,10 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
return rqstp;
+
+out_enomem:
+ svc_rqst_free(rqstp);
+ return NULL;
}
/**
@@ -735,31 +773,22 @@ svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool,
struct svc_pool *pool;
unsigned int i;
-retry:
pool = target_pool;
- if (pool != NULL) {
- if (atomic_inc_not_zero(&pool->sp_nrthreads))
- goto found_pool;
- return NULL;
- } else {
+ if (!pool) {
for (i = 0; i < serv->sv_nrpools; i++) {
pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
- if (atomic_inc_not_zero(&pool->sp_nrthreads))
- goto found_pool;
+ if (pool->sp_nrthreads)
+ break;
}
- return NULL;
}
-found_pool:
- set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
- set_bit(SP_NEED_VICTIM, &pool->sp_flags);
- if (!atomic_dec_and_test(&pool->sp_nrthreads))
+ if (pool && pool->sp_nrthreads) {
+ set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+ set_bit(SP_NEED_VICTIM, &pool->sp_flags);
return pool;
- /* Nothing left in this pool any more */
- clear_bit(SP_NEED_VICTIM, &pool->sp_flags);
- clear_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
- goto retry;
+ }
+ return NULL;
}
static int
@@ -770,6 +799,7 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
struct svc_pool *chosen_pool;
unsigned int state = serv->sv_nrthreads-1;
int node;
+ int err;
do {
nrservs--;
@@ -777,8 +807,8 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (IS_ERR(rqstp))
- return PTR_ERR(rqstp);
+ if (!rqstp)
+ return -ENOMEM;
task = kthread_create_on_node(serv->sv_threadfn, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
@@ -792,6 +822,13 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
svc_sock_update_bufs(serv);
wake_up_process(task);
+
+ wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
+ err = rqstp->rq_err;
+ if (err) {
+ svc_exit_thread(rqstp);
+ return err;
+ }
} while (nrservs > 0);
return 0;
@@ -838,7 +875,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
if (!pool)
nrservs -= serv->sv_nrthreads;
else
- nrservs -= atomic_read(&pool->sp_nrthreads);
+ nrservs -= pool->sp_nrthreads;
if (nrservs > 0)
return svc_start_kthreads(serv, pool, nrservs);
@@ -900,25 +937,21 @@ void svc_rqst_release_pages(struct svc_rqst *rqstp)
}
}
-/*
- * Called from a server thread as it's exiting. Caller must hold the "service
- * mutex" for the service.
+/**
+ * svc_exit_thread - finalise the termination of a sunrpc server thread
+ * @rqstp: the svc_rqst which represents the thread.
+ *
+ * When a thread started with svc_new_thread() exits it must call
+ * svc_exit_thread() as its last act. This must be done with the
+ * service mutex held. Normally this is held by a DIFFERENT thread, the
+ * one that is calling svc_set_num_threads() and which will wait for
+ * SP_VICTIM_REMAINS to be cleared before dropping the mutex. If the
+ * thread exits for any reason other than svc_thread_should_stop()
+ * returning %true (which indicates that svc_set_num_threads() is
+ * waiting for it to exit), then it must take the service mutex itself,
+ * which can only safely be done using mutex_trylock().
*/
void
-svc_rqst_free(struct svc_rqst *rqstp)
-{
- folio_batch_release(&rqstp->rq_fbatch);
- svc_release_buffer(rqstp);
- if (rqstp->rq_scratch_page)
- put_page(rqstp->rq_scratch_page);
- kfree(rqstp->rq_resp);
- kfree(rqstp->rq_argp);
- kfree(rqstp->rq_auth_data);
- kfree_rcu(rqstp, rq_rcu_head);
-}
-EXPORT_SYMBOL_GPL(svc_rqst_free);
-
-void
svc_exit_thread(struct svc_rqst *rqstp)
{
struct svc_serv *serv = rqstp->rq_server;
@@ -926,11 +959,8 @@ svc_exit_thread(struct svc_rqst *rqstp)
list_del_rcu(&rqstp->rq_all);
- atomic_dec(&pool->sp_nrthreads);
-
- spin_lock_bh(&serv->sv_lock);
+ pool->sp_nrthreads -= 1;
serv->sv_nrthreads -= 1;
- spin_unlock_bh(&serv->sv_lock);
svc_sock_update_bufs(serv);
svc_rqst_free(rqstp);
@@ -1065,6 +1095,7 @@ static int __svc_register(struct net *net, const char *progname,
return error;
}
+static
int svc_rpcbind_set_version(struct net *net,
const struct svc_program *progp,
u32 version, int family,
@@ -1075,7 +1106,6 @@ int svc_rpcbind_set_version(struct net *net,
version, family, proto, port);
}
-EXPORT_SYMBOL_GPL(svc_rpcbind_set_version);
int svc_generic_rpcbind_set(struct net *net,
const struct svc_program *progp,
@@ -1123,15 +1153,16 @@ int svc_register(const struct svc_serv *serv, struct net *net,
const int family, const unsigned short proto,
const unsigned short port)
{
- struct svc_program *progp;
- unsigned int i;
+ unsigned int p, i;
int error = 0;
WARN_ON_ONCE(proto == 0 && port == 0);
if (proto == 0 && port == 0)
return -EINVAL;
- for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (p = 0; p < serv->sv_nprogs; p++) {
+ struct svc_program *progp = &serv->sv_programs[p];
+
for (i = 0; i < progp->pg_nvers; i++) {
error = progp->pg_rpcbind_set(net, progp, i,
@@ -1183,13 +1214,14 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi
static void svc_unregister(const struct svc_serv *serv, struct net *net)
{
struct sighand_struct *sighand;
- struct svc_program *progp;
unsigned long flags;
- unsigned int i;
+ unsigned int p, i;
clear_thread_flag(TIF_SIGPENDING);
- for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (p = 0; p < serv->sv_nprogs; p++) {
+ struct svc_program *progp = &serv->sv_programs[p];
+
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
@@ -1263,8 +1295,6 @@ svc_generic_init_request(struct svc_rqst *rqstp,
if (rqstp->rq_proc >= versp->vs_nproc)
goto err_bad_proc;
rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc];
- if (!procp)
- goto err_bad_proc;
/* Initialize storage for argp and resp */
memset(rqstp->rq_argp, 0, procp->pc_argzero);
@@ -1291,13 +1321,13 @@ static int
svc_process_common(struct svc_rqst *rqstp)
{
struct xdr_stream *xdr = &rqstp->rq_res_stream;
- struct svc_program *progp;
+ struct svc_program *progp = NULL;
const struct svc_procedure *procp = NULL;
struct svc_serv *serv = rqstp->rq_server;
struct svc_process_info process;
enum svc_auth_status auth_res;
unsigned int aoffset;
- int rc;
+ int pr, rc;
__be32 *p;
/* Will be turned off only when NFSv4 Sessions are used */
@@ -1321,9 +1351,9 @@ svc_process_common(struct svc_rqst *rqstp)
rqstp->rq_vers = be32_to_cpup(p++);
rqstp->rq_proc = be32_to_cpup(p);
- for (progp = serv->sv_program; progp; progp = progp->pg_next)
- if (rqstp->rq_prog == progp->pg_prog)
- break;
+ for (pr = 0; pr < serv->sv_nprogs; pr++)
+ if (rqstp->rq_prog == serv->sv_programs[pr].pg_prog)
+ progp = &serv->sv_programs[pr];
/*
* Decode auth data, and add verifier to reply buffer.
@@ -1375,7 +1405,8 @@ svc_process_common(struct svc_rqst *rqstp)
goto err_bad_proc;
/* Syntactic check complete */
- serv->sv_stats->rpccnt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpccnt++;
trace_svc_process(rqstp, progp->pg_name);
aoffset = xdr_stream_pos(xdr);
@@ -1427,7 +1458,8 @@ err_short_len:
goto close_xprt;
err_bad_rpc:
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
xdr_stream_encode_u32(xdr, RPC_MISMATCH);
/* Only RPCv2 supported */
@@ -1438,7 +1470,8 @@ err_bad_rpc:
err_bad_auth:
dprintk("svc: authentication failed (%d)\n",
be32_to_cpu(rqstp->rq_auth_stat));
- serv->sv_stats->rpcbadauth++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadauth++;
/* Restore write pointer to location of reply status: */
xdr_truncate_encode(xdr, XDR_UNIT * 2);
xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
@@ -1448,7 +1481,8 @@ err_bad_auth:
err_bad_prog:
dprintk("svc: unknown program %d\n", rqstp->rq_prog);
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_prog_unavail;
goto sendit;
@@ -1456,7 +1490,8 @@ err_bad_vers:
svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
rqstp->rq_vers, rqstp->rq_prog, progp->pg_name);
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_prog_mismatch;
/*
@@ -1470,23 +1505,34 @@ err_bad_vers:
err_bad_proc:
svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc);
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_proc_unavail;
goto sendit;
err_garbage_args:
svc_printk(rqstp, "failed to decode RPC header\n");
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_garbage_args;
goto sendit;
err_system_err:
- serv->sv_stats->rpcbadfmt++;
+ if (serv->sv_stats)
+ serv->sv_stats->rpcbadfmt++;
*rqstp->rq_accept_statp = rpc_system_err;
goto sendit;
}
+/*
+ * Drop request
+ */
+static void svc_drop(struct svc_rqst *rqstp)
+{
+ trace_svc_drop(rqstp);
+}
+
/**
* svc_process - Execute one RPC transaction
* @rqstp: RPC transaction context
@@ -1534,7 +1580,8 @@ void svc_process(struct svc_rqst *rqstp)
out_baddir:
svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
be32_to_cpu(*p));
- rqstp->rq_server->sv_stats->rpcbadfmt++;
+ if (rqstp->rq_server->sv_stats)
+ rqstp->rq_server->sv_stats->rpcbadfmt++;
out_drop:
svc_drop(rqstp);
}
@@ -1548,9 +1595,11 @@ out_drop:
*/
void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
{
+ struct rpc_timeout timeout = {
+ .to_increment = 0,
+ };
struct rpc_task *task;
int proc_error;
- struct rpc_timeout timeout;
/* Build the svc_rqst used by the common processing routine */
rqstp->rq_xid = req->rq_xid;
@@ -1603,6 +1652,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
timeout.to_initval = req->rq_xprt->timeout->to_initval;
timeout.to_retries = req->rq_xprt->timeout->to_retries;
}
+ timeout.to_maxval = timeout.to_initval;
memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
task = rpc_run_bc_task(req, &timeout);
@@ -1612,7 +1662,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
rpc_put_task(task);
}
-EXPORT_SYMBOL_GPL(svc_process_bc);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
/**
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b4a85a227bd7..ae25405d8bd2 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -46,7 +46,6 @@ static LIST_HEAD(svc_xprt_class_list);
/* SMP locking strategy:
*
- * svc_pool->sp_lock protects most of the fields of that pool.
* svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
* when both need to be taken (rare), svc_serv->sv_lock is first.
* The "service mutex" protects svc_serv->sv_nrthread.
@@ -158,6 +157,7 @@ int svc_print_xprts(char *buf, int maxlen)
*/
void svc_xprt_deferred_close(struct svc_xprt *xprt)
{
+ trace_svc_xprt_close(xprt);
if (!test_and_set_bit(XPT_CLOSE, &xprt->xpt_flags))
svc_xprt_enqueue(xprt);
}
@@ -211,51 +211,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
-static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
- struct svc_serv *serv,
- struct net *net,
- const int family,
- const unsigned short port,
- int flags)
-{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- .sin_addr.s_addr = htonl(INADDR_ANY),
- .sin_port = htons(port),
- };
-#if IS_ENABLED(CONFIG_IPV6)
- struct sockaddr_in6 sin6 = {
- .sin6_family = AF_INET6,
- .sin6_addr = IN6ADDR_ANY_INIT,
- .sin6_port = htons(port),
- };
-#endif
- struct svc_xprt *xprt;
- struct sockaddr *sap;
- size_t len;
-
- switch (family) {
- case PF_INET:
- sap = (struct sockaddr *)&sin;
- len = sizeof(sin);
- break;
-#if IS_ENABLED(CONFIG_IPV6)
- case PF_INET6:
- sap = (struct sockaddr *)&sin6;
- len = sizeof(sin6);
- break;
-#endif
- default:
- return ERR_PTR(-EAFNOSUPPORT);
- }
-
- xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
- if (IS_ERR(xprt))
- trace_svc_xprt_create_err(serv->sv_program->pg_name,
- xcl->xcl_name, sap, len, xprt);
- return xprt;
-}
-
/**
* svc_xprt_received - start next receiver thread
* @xprt: controlling transport
@@ -294,9 +249,8 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
}
static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
- struct net *net, const int family,
- const unsigned short port, int flags,
- const struct cred *cred)
+ struct net *net, struct sockaddr *sap,
+ size_t len, int flags, const struct cred *cred)
{
struct svc_xprt_class *xcl;
@@ -312,8 +266,11 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
goto err;
spin_unlock(&svc_xprt_class_lock);
- newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags);
+ newxprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
if (IS_ERR(newxprt)) {
+ trace_svc_xprt_create_err(serv->sv_programs->pg_name,
+ xcl->xcl_name, sap, len,
+ newxprt);
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
}
@@ -330,6 +287,48 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
}
/**
+ * svc_xprt_create_from_sa - Add a new listener to @serv from socket address
+ * @serv: target RPC service
+ * @xprt_name: transport class name
+ * @net: network namespace
+ * @sap: socket address pointer
+ * @flags: SVC_SOCK flags
+ * @cred: credential to bind to this transport
+ *
+ * Return local xprt port on success, or a negative errno (e.g. %-EAFNOSUPPORT, %-EPROTONOSUPPORT) on failure
+ */
+int svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, struct sockaddr *sap,
+ int flags, const struct cred *cred)
+{
+ size_t len;
+ int err;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ len = sizeof(struct sockaddr_in);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ len = sizeof(struct sockaddr_in6);
+ break;
+#endif
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, cred);
+ if (err == -EPROTONOSUPPORT) {
+ request_module("svc%s", xprt_name);
+ err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags,
+ cred);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_create_from_sa);
+
+/**
* svc_xprt_create - Add a new listener to @serv
* @serv: target RPC service
* @xprt_name: transport class name
@@ -339,23 +338,41 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
* @flags: SVC_SOCK flags
* @cred: credential to bind to this transport
*
- * Return values:
- * %0: New listener added successfully
- * %-EPROTONOSUPPORT: Requested transport type not supported
+ * Return local xprt port on success, or a negative errno (e.g. %-EAFNOSUPPORT, %-EPROTONOSUPPORT) on failure
*/
int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags,
const struct cred *cred)
{
- int err;
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+#endif
+ struct sockaddr *sap;
- err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
- if (err == -EPROTONOSUPPORT) {
- request_module("svc%s", xprt_name);
- err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
+ switch (family) {
+ case PF_INET:
+ sap = (struct sockaddr *)&sin;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6:
+ sap = (struct sockaddr *)&sin6;
+ break;
+#endif
+ default:
+ return -EAFNOSUPPORT;
}
- return err;
+
+ return svc_xprt_create_from_sa(serv, xprt_name, net, sap, flags, cred);
}
EXPORT_SYMBOL_GPL(svc_xprt_create);
@@ -589,7 +606,8 @@ int svc_port_is_privileged(struct sockaddr *sin)
}
/*
- * Make sure that we don't have too many active connections. If we have,
+ * Make sure that we don't have too many connections that have not yet
+ * demonstrated that they have access to the NFS server. If we have,
* something must be dropped. It's not clear what will happen if we allow
* "too many" connections, but when dealing with network-facing software,
* we have to code defensively. Here we do that by imposing hard limits.
@@ -601,34 +619,26 @@ int svc_port_is_privileged(struct sockaddr *sin)
* The only somewhat efficient mechanism would be if drop old
* connections from the same IP first. But right now we don't even
* record the client IP in svc_sock.
- *
- * single-threaded services that expect a lot of clients will probably
- * need to set sv_maxconn to override the default value which is based
- * on the number of threads
*/
static void svc_check_conn_limits(struct svc_serv *serv)
{
- unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn :
- (serv->sv_nrthreads+3) * 20;
-
- if (serv->sv_tmpcnt > limit) {
- struct svc_xprt *xprt = NULL;
+ if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) {
+ struct svc_xprt *xprt = NULL, *xprti;
spin_lock_bh(&serv->sv_lock);
if (!list_empty(&serv->sv_tempsocks)) {
- /* Try to help the admin */
- net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n",
- serv->sv_name, serv->sv_maxconn ?
- "max number of connections" :
- "number of threads");
/*
* Always select the oldest connection. It's not fair,
- * but so is life
+ * but nor is life.
*/
- xprt = list_entry(serv->sv_tempsocks.prev,
- struct svc_xprt,
- xpt_list);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_get(xprt);
+ list_for_each_entry_reverse(xprti, &serv->sv_tempsocks,
+ xpt_list) {
+ if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) {
+ xprt = xprti;
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_get(xprt);
+ break;
+ }
+ }
}
spin_unlock_bh(&serv->sv_lock);
@@ -654,8 +664,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
}
for (filled = 0; filled < pages; filled = ret) {
- ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
- rqstp->rq_pages);
+ ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);
if (ret > filled)
/* Made progress, don't sleep yet */
continue;
@@ -888,15 +897,6 @@ void svc_recv(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL_GPL(svc_recv);
-/*
- * Drop request
- */
-void svc_drop(struct svc_rqst *rqstp)
-{
- trace_svc_drop(rqstp);
-}
-EXPORT_SYMBOL_GPL(svc_drop);
-
/**
* svc_send - Return reply to client
* @rqstp: RPC transaction context
@@ -1031,7 +1031,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
spin_lock_bh(&serv->sv_lock);
list_del_init(&xprt->xpt_list);
- if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+ if (test_bit(XPT_TEMP, &xprt->xpt_flags) &&
+ !test_bit(XPT_PEER_VALID, &xprt->xpt_flags))
serv->sv_tmpcnt--;
spin_unlock_bh(&serv->sv_lock);
@@ -1260,6 +1261,40 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
}
/**
+ * svc_find_listener - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @net: owner net pointer
+ * @sa: sockaddr containing address
+ *
+ * Return a referenced transport instance pointer for the endpoint accepting
+ * connections/peer traffic from the specified transport class whose local
+ * address and port match @sa, or NULL if there is no such endpoint.
+ */
+struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name,
+ struct net *net, const struct sockaddr *sa)
+{
+ struct svc_xprt *xprt;
+ struct svc_xprt *found = NULL;
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+ if (xprt->xpt_net != net)
+ continue;
+ if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
+ continue;
+ if (!rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local))
+ continue;
+ found = xprt;
+ svc_xprt_get(xprt);
+ break;
+ }
+ spin_unlock_bh(&serv->sv_lock);
+ return found;
+}
+EXPORT_SYMBOL_GPL(svc_find_listener);
+
+/**
* svc_find_xprt - find an RPC transport instance
* @serv: pointer to svc_serv to search
* @xcl_name: C string containing transport's class name
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 1619211f0960..55b4d2874188 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -18,6 +18,7 @@
#include <linux/sunrpc/svcauth.h>
#include <linux/err.h>
#include <linux/hash.h>
+#include <linux/user_namespace.h>
#include <trace/events/sunrpc.h>
@@ -98,7 +99,6 @@ enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp)
rqstp->rq_authop = aops;
return aops->accept(rqstp);
}
-EXPORT_SYMBOL_GPL(svc_authenticate);
/**
* svc_set_client - Assign an appropriate 'auth_domain' as the client
@@ -176,6 +176,33 @@ rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL_GPL(svc_auth_flavor);
+/**
+ * svcauth_map_clnt_to_svc_cred_local - maps a generic cred
+ * to a svc_cred suitable for use in nfsd.
+ * @clnt: rpc_clnt associated with nfs client
+ * @cred: generic cred associated with nfs client
+ * @svc: returned svc_cred that is suitable for use in nfsd
+ */
+void svcauth_map_clnt_to_svc_cred_local(struct rpc_clnt *clnt,
+ const struct cred *cred,
+ struct svc_cred *svc)
+{
+ struct user_namespace *userns = clnt->cl_cred ?
+ clnt->cl_cred->user_ns : &init_user_ns;
+
+ memset(svc, 0, sizeof(struct svc_cred));
+
+ svc->cr_uid = KUIDT_INIT(from_kuid_munged(userns, cred->fsuid));
+ svc->cr_gid = KGIDT_INIT(from_kgid_munged(userns, cred->fsgid));
+ svc->cr_flavor = clnt->cl_auth->au_flavor;
+ if (cred->group_info)
+ svc->cr_group_info = get_group_info(cred->group_info);
+ /* These aren't relevant for local (network is bypassed) */
+ svc->cr_principal = NULL;
+ svc->cr_gss_mech = NULL;
+}
+EXPORT_SYMBOL_GPL(svcauth_map_clnt_to_svc_cred_local);
+
/**************************************************
* 'auth_domains' are stored in a hash table indexed by name.
* When the last reference to an 'auth_domain' is dropped,
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 04b45588ae6f..8ca98b146ec8 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -697,7 +697,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
rqstp->rq_auth_stat = rpc_autherr_badcred;
ipm = ip_map_cached_get(xprt);
if (ipm == NULL)
- ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
+ ipm = __ip_map_lookup(sn->ip_map_cache,
+ rqstp->rq_server->sv_programs->pg_class,
&sin6->sin6_addr);
if (ipm == NULL)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 545017a3daa4..72e5a01df3d3 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1083,9 +1083,6 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk)
/* If we have more data, signal svc_xprt_enqueue() to try again */
svsk->sk_tcplen = 0;
svsk->sk_marker = xdr_zero;
-
- smp_wmb();
- tcp_set_rcvlowat(svsk->sk_sk, 1);
}
/**
@@ -1175,17 +1172,10 @@ err_incomplete:
goto err_delete;
if (len == want)
svc_tcp_fragment_received(svsk);
- else {
- /* Avoid more ->sk_data_ready() calls until the rest
- * of the message has arrived. This reduces service
- * thread wake-ups on large incoming messages. */
- tcp_set_rcvlowat(svsk->sk_sk,
- svc_sock_reclen(svsk) - svsk->sk_tcplen);
-
+ else
trace_svcsock_tcp_recv_short(&svsk->sk_xprt,
svc_sock_reclen(svsk),
svsk->sk_tcplen - sizeof(rpc_fraghdr));
- }
goto err_noclose;
error:
if (len != -EAGAIN)
@@ -1206,15 +1196,6 @@ err_noclose:
* MSG_SPLICE_PAGES is used exclusively to reduce the number of
* copy operations in this path. Therefore the caller must ensure
* that the pages backing @xdr are unchanging.
- *
- * Note that the send is non-blocking. The caller has incremented
- * the reference count on each page backing the RPC message, and
- * the network layer will "put" these pages when transmission is
- * complete.
- *
- * This is safe for our RPC services because the memory backing
- * the head and tail components is never kmalloc'd. These always
- * come from pages in the svc_rqst::rq_pages array.
*/
static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
rpc_fraghdr marker, unsigned int *sentp)
@@ -1244,6 +1225,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
1 + count, sizeof(marker) + rqstp->rq_res.len);
ret = sock_sendmsg(svsk->sk_sock, &msg);
+ page_frag_free(buf);
if (ret < 0)
return ret;
*sentp += ret;
@@ -1386,7 +1368,6 @@ void svc_sock_update_bufs(struct svc_serv *serv)
set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
spin_unlock_bh(&serv->sv_lock);
}
-EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
/*
* Initialize socket for RPC use and create svc_sock struct
@@ -1560,6 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
newlen = error;
if (protocol == IPPROTO_TCP) {
+ sk_net_refcnt_upgrade(sock->sk);
if ((error = kernel_listen(sock, 64)) < 0)
goto bummer;
}
@@ -1617,7 +1599,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
static void svc_sock_free(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
trace_svcsock_free(svsk, sock);
@@ -1627,8 +1608,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
sockfd_put(sock);
else
sock_release(sock);
- if (pfc->va)
- __page_frag_cache_drain(virt_to_head_page(pfc->va),
- pfc->pagecnt_bias);
+
+ page_frag_cache_drain(&svsk->sk_frag_cache);
kfree(svsk);
}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 93941ab12549..bdb587a72422 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(nlm_debug);
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static int proc_do_xprt(struct ctl_table *table, int write,
+static int proc_do_xprt(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
@@ -62,7 +62,7 @@ static int proc_do_xprt(struct ctl_table *table, int write,
}
static int
-proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
+proc_dodebug(const struct ctl_table *table, int write, void *buffer, size_t *lenp,
loff_t *ppos)
{
char tmpbuf[20], *s = NULL;
@@ -160,7 +160,6 @@ static struct ctl_table debug_table[] = {
.mode = 0444,
.proc_handler = proc_do_xprt,
},
- { }
};
void
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 62e07c330a66..4e003cb516fe 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1097,6 +1097,12 @@ out_overflow:
* Checks that we have enough buffer space to encode 'nbytes' more
* bytes of data. If so, update the total xdr_buf length, and
* adjust the length of the current kvec.
+ *
+ * The returned pointer is valid only until the next call to
+ * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current
+ * implementation of this API guarantees that space reserved for a
+ * four-byte data item remains valid until @xdr is destroyed, but
+ * that might not always be true in the future.
*/
__be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
{
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index af13fdfa6672..09f245cda526 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1398,6 +1398,12 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
return;
if (!list_empty(&req->rq_xmit)) {
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (list_is_first(&req->rq_xmit, &xprt->xmit_queue) &&
+ xprt->ops->abort_send_request)
+ xprt->ops->abort_send_request(req);
+
list_del(&req->rq_xmit);
if (!list_empty(&req->rq_xmit2)) {
struct rpc_rqst *next = list_first_entry(&req->rq_xmit2,
@@ -1541,6 +1547,9 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
int is_retrans = RPC_WAS_SENT(task);
int status;
+ if (test_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ return -ENOTCONN;
+
if (!req->rq_bytes_sent) {
if (xprt_request_data_received(task)) {
status = 0;
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 720d3ba742ec..7e98d4dd9f10 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -603,23 +603,6 @@ struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi,
}
/**
- * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor
- * @xpi: pointer to rpc_xprt_iter
- *
- * Returns a reference to the struct rpc_xprt that is currently
- * pointed to by the cursor.
- */
-struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi)
-{
- struct rpc_xprt *xprt;
-
- rcu_read_lock();
- xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt);
- rcu_read_unlock();
- return xprt;
-}
-
-/**
* xprt_iter_get_next - Returns the next rpc_xprt following the cursor
* @xpi: pointer to rpc_xprt_iter
*
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 55b21bae866d..3232aa23cdb4 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
svc_rdma_pcl.o module.o
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ffbf99894970..31434aeb8e29 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep,
cid->ci_completion_id = mr->mr_ibmr->res.id;
}
-static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_unmap(struct rpcrdma_mr *mr)
{
if (mr->mr_device) {
trace_xprtrdma_mr_unmap(mr);
@@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)
{
int rc;
- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);
rc = ib_dereg_mr(mr->mr_ibmr);
if (rc)
@@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr)
static void frwr_mr_put(struct rpcrdma_mr *mr)
{
- frwr_mr_unmap(mr->mr_xprt, mr);
+ frwr_mr_unmap(mr);
/* The MR is returned to the req's MR free list instead
* of to the xprt's MR free list. No spinlock is needed.
@@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr)
rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}
-/* frwr_reset - Place MRs back on the free list
+/**
+ * frwr_reset - Place MRs back on @req's free list
* @req: request to reset
*
* Used after a failed marshal. For FRWR, this means the MRs
diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c
new file mode 100644
index 000000000000..28c68b5f6823
--- /dev/null
+++ b/net/sunrpc/xprtrdma/ib_client.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2024 Oracle. All rights reserved.
+ */
+
+/* #include <linux/module.h>
+#include <linux/slab.h> */
+#include <linux/xarray.h>
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rdma_rn.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/* Per-ib_device private data for rpcrdma */
+struct rpcrdma_device {
+ struct kref rd_kref;
+ unsigned long rd_flags;
+ struct ib_device *rd_device;
+ struct xarray rd_xa;
+ struct completion rd_done;
+};
+
+#define RPCRDMA_RD_F_REMOVING (0)
+
+static struct ib_client rpcrdma_ib_client;
+
+/*
+ * Listeners have no associated device, so we never register them.
+ * Note that ib_get_client_data() does not check if @device is
+ * NULL for us.
+ */
+static struct rpcrdma_device *rpcrdma_get_client_data(struct ib_device *device)
+{
+ if (!device)
+ return NULL;
+ return ib_get_client_data(device, &rpcrdma_ib_client);
+}
+
+/**
+ * rpcrdma_rn_register - register to get device removal notifications
+ * @device: device to monitor
+ * @rn: notification object that wishes to be notified
+ * @done: callback to notify caller of device removal
+ *
+ * Returns zero on success. The callback in rn_done is guaranteed
+ * to be invoked when the device is removed, unless this notification
+ * is unregistered first.
+ *
+ * On failure, a negative errno is returned.
+ */
+int rpcrdma_rn_register(struct ib_device *device,
+ struct rpcrdma_notification *rn,
+ void (*done)(struct rpcrdma_notification *rn))
+{
+ struct rpcrdma_device *rd = rpcrdma_get_client_data(device);
+
+ if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags))
+ return -ENETUNREACH;
+
+ if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ kref_get(&rd->rd_kref);
+ rn->rn_done = done;
+ trace_rpcrdma_client_register(device, rn);
+ return 0;
+}
+
+static void rpcrdma_rn_release(struct kref *kref)
+{
+ struct rpcrdma_device *rd = container_of(kref, struct rpcrdma_device,
+ rd_kref);
+
+ trace_rpcrdma_client_completion(rd->rd_device);
+ complete(&rd->rd_done);
+}
+
+/**
+ * rpcrdma_rn_unregister - stop device removal notifications
+ * @device: monitored device
+ * @rn: notification object that no longer wishes to be notified
+ */
+void rpcrdma_rn_unregister(struct ib_device *device,
+ struct rpcrdma_notification *rn)
+{
+ struct rpcrdma_device *rd = rpcrdma_get_client_data(device);
+
+ if (!rd)
+ return;
+
+ trace_rpcrdma_client_unregister(device, rn);
+ xa_erase(&rd->rd_xa, rn->rn_index);
+ kref_put(&rd->rd_kref, rpcrdma_rn_release);
+}
+
+/**
+ * rpcrdma_add_one - ib_client device insertion callback
+ * @device: device about to be inserted
+ *
+ * Returns zero on success. xprtrdma private data has been allocated
+ * for this device. On failure, a negative errno is returned.
+ */
+static int rpcrdma_add_one(struct ib_device *device)
+{
+ struct rpcrdma_device *rd;
+
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ kref_init(&rd->rd_kref);
+ xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC);
+ rd->rd_device = device;
+ init_completion(&rd->rd_done);
+ ib_set_client_data(device, &rpcrdma_ib_client, rd);
+
+ trace_rpcrdma_client_add_one(device);
+ return 0;
+}
+
+/**
+ * rpcrdma_remove_one - ib_client device removal callback
+ * @device: device about to be removed
+ * @client_data: this module's private per-device data
+ *
+ * Upon return, all transports associated with @device have divested
+ * themselves from IB hardware resources.
+ */
+static void rpcrdma_remove_one(struct ib_device *device,
+ void *client_data)
+{
+ struct rpcrdma_device *rd = client_data;
+ struct rpcrdma_notification *rn;
+ unsigned long index;
+
+ trace_rpcrdma_client_remove_one(device);
+
+ set_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags);
+ xa_for_each(&rd->rd_xa, index, rn)
+ rn->rn_done(rn);
+
+ /*
+ * Wait only if there are still outstanding notification
+ * registrants for this device.
+ */
+ if (!refcount_dec_and_test(&rd->rd_kref.refcount)) {
+ trace_rpcrdma_client_wait_on(device);
+ wait_for_completion(&rd->rd_done);
+ }
+
+ trace_rpcrdma_client_remove_one_done(device);
+ xa_destroy(&rd->rd_xa);
+ kfree(rd);
+}
+
+static struct ib_client rpcrdma_ib_client = {
+ .name = "rpcrdma",
+ .add = rpcrdma_add_one,
+ .remove = rpcrdma_remove_one,
+};
+
+/**
+ * rpcrdma_ib_client_unregister - unregister ib_client for xprtrdma
+ *
+ * cel: watch for orphaned rpcrdma_device objects on module unload
+ */
+void rpcrdma_ib_client_unregister(void)
+{
+ ib_unregister_client(&rpcrdma_ib_client);
+}
+
+/**
+ * rpcrdma_ib_client_register - register ib_client for rpcrdma
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int rpcrdma_ib_client_register(void)
+{
+ return ib_register_client(&rpcrdma_ib_client);
+}
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 45c5b41ac8dc..697f571d4c01 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rdma_rn.h>
#include <asm/swab.h>
@@ -30,21 +31,32 @@ static void __exit rpc_rdma_cleanup(void)
{
xprt_rdma_cleanup();
svc_rdma_cleanup();
+ rpcrdma_ib_client_unregister();
}
static int __init rpc_rdma_init(void)
{
int rc;
+ rc = rpcrdma_ib_client_register();
+ if (rc)
+ goto out_rc;
+
rc = svc_rdma_init();
if (rc)
- goto out;
+ goto out_ib_client;
rc = xprt_rdma_init();
if (rc)
- svc_rdma_cleanup();
+ goto out_svc_rdma;
-out:
+ return 0;
+
+out_svc_rdma:
+ svc_rdma_cleanup();
+out_ib_client:
+ rpcrdma_ib_client_unregister();
+out_rc:
return rc;
}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 190a4de239c8..1478c41c7e9d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_ep->re_max_requests)
credits = r_xprt->rx_ep->re_max_requests;
- rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
- false);
+ rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1));
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index f86970733eb0..415c0310101f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -74,7 +74,7 @@ enum {
SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long),
};
-static int svcrdma_counter_handler(struct ctl_table *table, int write,
+static int svcrdma_counter_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct percpu_counter *stat = (struct percpu_counter *)table->data;
@@ -209,7 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = {
.extra1 = &zero,
.extra2 = &zero,
},
- { },
};
static void svc_rdma_proc_cleanup(void)
@@ -234,25 +233,34 @@ static int svc_rdma_proc_init(void)
rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL);
if (rc)
- goto out_err;
+ goto err;
rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL);
if (rc)
- goto out_err;
+ goto err_read;
rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL);
if (rc)
- goto out_err;
+ goto err_recv;
rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL);
if (rc)
- goto out_err;
+ goto err_sq;
svcrdma_table_header = register_sysctl("sunrpc/svc_rdma",
svcrdma_parm_table);
+ if (!svcrdma_table_header)
+ goto err_write;
+
return 0;
-out_err:
+err_write:
+ rc = -ENOMEM;
+ percpu_counter_destroy(&svcrdma_stat_write);
+err_sq:
percpu_counter_destroy(&svcrdma_stat_sq_starve);
+err_recv:
percpu_counter_destroy(&svcrdma_stat_recv);
+err_read:
percpu_counter_destroy(&svcrdma_stat_read);
+err:
return rc;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index c9be6778643b..e5a78b761012 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
*/
get_page(virt_to_page(rqst->rq_buffer));
sctxt->sc_send_wr.opcode = IB_WR_SEND;
- return svc_rdma_send(rdma, sctxt);
+ return svc_rdma_post_send(rdma, sctxt);
}
/* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index d72953f29258..292022f0976e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -94,7 +94,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -493,7 +493,13 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
return false;
- /* A bogus segcount causes this buffer overflow check to fail. */
+ /* Before trusting the segcount value enough to use it in
+ * a computation, perform a simple range check. This is an
+ * arbitrary but sensible limit (i.e., not architectural).
+ */
+ if (unlikely(segcount > RPCSVC_MAXPAGES))
+ return false;
+
p = xdr_inline_decode(&rctxt->rc_stream,
segcount * rpcrdma_segment_maxsz * sizeof(*p));
return p != NULL;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index c00fcce61d1e..40797114d50a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -197,28 +197,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
llist_add_batch(first, last, &rdma->sc_rw_ctxts);
}
-/* State for sending a Write or Reply chunk.
- * - Tracks progress of writing one chunk over all its segments
- * - Stores arguments for the SGL constructor functions
- */
-struct svc_rdma_write_info {
- struct svcxprt_rdma *wi_rdma;
-
- const struct svc_rdma_chunk *wi_chunk;
-
- /* write state of this chunk */
- unsigned int wi_seg_off;
- unsigned int wi_seg_no;
-
- /* SGL constructor arguments */
- const struct xdr_buf *wi_xdr;
- unsigned char *wi_base;
- unsigned int wi_next_off;
-
- struct svc_rdma_chunk_ctxt wi_cc;
- struct work_struct wi_work;
-};
-
static struct svc_rdma_write_info *
svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
const struct svc_rdma_chunk *chunk)
@@ -253,6 +231,49 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
}
/**
+ * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
+ * @rdma: controlling transport
+ * @ctxt: Send context that is being released
+ */
+void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
+{
+ struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc;
+
+ if (!cc->cc_sqecount)
+ return;
+ svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE);
+}
+
+/**
+ * svc_rdma_reply_done - Reply chunk Write completion handler
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion report
+ *
+ * Pages under I/O are released by a subsequent Send completion.
+ */
+static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_chunk_ctxt *cc =
+ container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+ struct svcxprt_rdma *rdma = cq->cq_context;
+
+ switch (wc->status) {
+ case IB_WC_SUCCESS:
+ trace_svcrdma_wc_reply(&cc->cc_cid);
+ return;
+ case IB_WC_WR_FLUSH_ERR:
+ trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid);
+ break;
+ default:
+ trace_svcrdma_wc_reply_err(wc, &cc->cc_cid);
+ }
+
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+}
+
+/**
* svc_rdma_write_done - Write chunk completion
* @cq: controlling Completion Queue
* @wc: Work Completion
@@ -580,41 +601,33 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
return xdr->len;
}
-/**
- * svc_rdma_send_write_chunk - Write all segments in a Write chunk
- * @rdma: controlling RDMA transport
- * @chunk: Write chunk provided by the client
- * @xdr: xdr_buf containing the data payload
- *
- * Returns a non-negative number of bytes the chunk consumed, or
- * %-E2BIG if the payload was larger than the Write chunk,
- * %-EINVAL if client provided too many segments,
- * %-ENOMEM if rdma_rw context pool was exhausted,
- * %-ENOTCONN if posting failed (connection is lost),
- * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
- */
-int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
- const struct svc_rdma_chunk *chunk,
- const struct xdr_buf *xdr)
+static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
struct svc_rdma_chunk_ctxt *cc;
+ struct xdr_buf payload;
int ret;
+ if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
+ chunk->ch_payload_length))
+ return -EMSGSIZE;
+
info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
cc = &info->wi_cc;
- ret = svc_rdma_xb_write(xdr, info);
- if (ret != xdr->len)
+ ret = svc_rdma_xb_write(&payload, info);
+ if (ret != payload.len)
goto out_err;
trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
ret = svc_rdma_post_chunk_ctxt(rdma, cc);
if (ret < 0)
goto out_err;
- return xdr->len;
+ return 0;
out_err:
svc_rdma_write_info_free(info);
@@ -622,9 +635,37 @@ out_err:
}
/**
- * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * svc_rdma_send_write_list - Send all chunks on the Write list
* @rdma: controlling RDMA transport
- * @rctxt: Write and Reply chunks from client
+ * @rctxt: Write list provisioned by the client
+ * @xdr: xdr_buf containing an RPC Reply message
+ *
+ * Returns zero on success, or a negative errno if one or more
+ * Write chunks could not be sent.
+ */
+int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
+{
+ struct svc_rdma_chunk *chunk;
+ int ret;
+
+ pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
+ if (!chunk->ch_payload_length)
+ break;
+ ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
+ if (ret < 0)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @write_pcl: Write chunk list provided by client
+ * @reply_pcl: Reply chunk provided by client
+ * @sctxt: Send WR resources
* @xdr: xdr_buf containing an RPC Reply
*
* Returns a non-negative number of bytes the chunk consumed, or
@@ -634,39 +675,45 @@ out_err:
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
- const struct svc_rdma_recv_ctxt *rctxt,
- const struct xdr_buf *xdr)
+int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_pcl *write_pcl,
+ const struct svc_rdma_pcl *reply_pcl,
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct xdr_buf *xdr)
{
- struct svc_rdma_write_info *info;
- struct svc_rdma_chunk_ctxt *cc;
- struct svc_rdma_chunk *chunk;
+ struct svc_rdma_write_info *info = &sctxt->sc_reply_info;
+ struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+ struct ib_send_wr *first_wr;
+ struct list_head *pos;
+ struct ib_cqe *cqe;
int ret;
- if (pcl_is_empty(&rctxt->rc_reply_pcl))
- return 0;
-
- chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
- info = svc_rdma_write_info_alloc(rdma, chunk);
- if (!info)
- return -ENOMEM;
- cc = &info->wi_cc;
+ info->wi_rdma = rdma;
+ info->wi_chunk = pcl_first_chunk(reply_pcl);
+ info->wi_seg_off = 0;
+ info->wi_seg_no = 0;
+ info->wi_cc.cc_cqe.done = svc_rdma_reply_done;
- ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ ret = pcl_process_nonpayloads(write_pcl, xdr,
svc_rdma_xb_write, info);
if (ret < 0)
- goto out_err;
+ return ret;
- trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
- ret = svc_rdma_post_chunk_ctxt(rdma, cc);
- if (ret < 0)
- goto out_err;
+ first_wr = sctxt->sc_wr_chain;
+ cqe = &cc->cc_cqe;
+ list_for_each(pos, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *rwc;
- return xdr->len;
+ rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+ sctxt->sc_wr_chain = first_wr;
+ sctxt->sc_sqecount += cc->cc_sqecount;
-out_err:
- svc_rdma_write_info_free(info);
- return ret;
+ trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
+ return xdr->len;
}
/**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 1a49b7f02041..96154a2367a1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -100,7 +100,7 @@
*/
#include <linux/spinlock.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -205,9 +205,13 @@ out:
xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
ctxt->sc_xprt_buf, NULL);
+ svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc);
ctxt->sc_send_wr.num_sge = 0;
ctxt->sc_cur_sge_no = 0;
ctxt->sc_page_count = 0;
+ ctxt->sc_wr_chain = &ctxt->sc_send_wr;
+ ctxt->sc_sqecount = 1;
+
return ctxt;
out_empty:
@@ -223,6 +227,8 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
struct ib_device *device = rdma->sc_cm_id->device;
unsigned int i;
+ svc_rdma_reply_chunk_release(rdma, ctxt);
+
if (ctxt->sc_page_count)
release_pages(ctxt->sc_pages, ctxt->sc_page_count);
@@ -293,7 +299,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
struct svc_rdma_send_ctxt *ctxt =
container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
- svc_rdma_wake_send_waiters(rdma, 1);
+ svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount);
if (unlikely(wc->status != IB_WC_SUCCESS))
goto flushed;
@@ -312,51 +318,76 @@ flushed:
}
/**
- * svc_rdma_send - Post a single Send WR
- * @rdma: transport on which to post the WR
- * @ctxt: send ctxt with a Send WR ready to post
+ * svc_rdma_post_send - Post a WR chain to the Send Queue
+ * @rdma: transport context
+ * @ctxt: WR chain to post
+ *
+ * Copy fields in @ctxt to stack variables in order to guarantee
+ * that these values remain available after the ib_post_send() call.
+ * In some error flow cases, svc_rdma_wc_send() releases @ctxt.
+ *
+ * Note there is potential for starvation when the Send Queue is
+ * full because there is no order to when waiting threads are
+ * awoken. The transport is typically provisioned with a deep
+ * enough Send Queue that SQ exhaustion should be a rare event.
*
- * Returns zero if the Send WR was posted successfully. Otherwise, a
- * negative errno is returned.
+ * Return values:
+ * %0: @ctxt's WR chain was posted successfully
+ * %-ENOTCONN: The connection was lost
*/
-int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
+int svc_rdma_post_send(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt)
{
- struct ib_send_wr *wr = &ctxt->sc_send_wr;
- int ret;
+ struct ib_send_wr *first_wr = ctxt->sc_wr_chain;
+ struct ib_send_wr *send_wr = &ctxt->sc_send_wr;
+ const struct ib_send_wr *bad_wr = first_wr;
+ struct rpc_rdma_cid cid = ctxt->sc_cid;
+ int ret, sqecount = ctxt->sc_sqecount;
might_sleep();
/* Sync the transport header buffer */
ib_dma_sync_single_for_device(rdma->sc_pd->device,
- wr->sg_list[0].addr,
- wr->sg_list[0].length,
+ send_wr->sg_list[0].addr,
+ send_wr->sg_list[0].length,
DMA_TO_DEVICE);
/* If the SQ is full, wait until an SQ entry is available */
- while (1) {
- if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
+ while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
+ if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
+ svc_rdma_wake_send_waiters(rdma, sqecount);
+
+ /* When the transport is torn down, assume
+ * ib_drain_sq() will trigger enough Send
+ * completions to wake us. The XPT_CLOSE test
+ * above should then cause the while loop to
+ * exit.
+ */
percpu_counter_inc(&svcrdma_stat_sq_starve);
- trace_svcrdma_sq_full(rdma, &ctxt->sc_cid);
- atomic_inc(&rdma->sc_sq_avail);
+ trace_svcrdma_sq_full(rdma, &cid);
wait_event(rdma->sc_send_wait,
- atomic_read(&rdma->sc_sq_avail) > 1);
- if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
- return -ENOTCONN;
- trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid);
+ atomic_read(&rdma->sc_sq_avail) > 0);
+ trace_svcrdma_sq_retry(rdma, &cid);
continue;
}
trace_svcrdma_post_send(ctxt);
- ret = ib_post_send(rdma->sc_qp, wr, NULL);
- if (ret)
- break;
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret) {
+ trace_svcrdma_sq_post_err(rdma, &cid, ret);
+ svc_xprt_deferred_close(&rdma->sc_xprt);
+
+ /* If even one WR was posted, there will be a
+ * Send completion that bumps sc_sq_avail.
+ */
+ if (bad_wr == first_wr) {
+ svc_rdma_wake_send_waiters(rdma, sqecount);
+ break;
+ }
+ }
return 0;
}
-
- trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret);
- svc_xprt_deferred_close(&rdma->sc_xprt);
- wake_up(&rdma->sc_send_wait);
- return ret;
+ return -ENOTCONN;
}
/**
@@ -839,16 +870,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
* in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
*
* Depending on whether a Write list or Reply chunk is present,
- * the server may send all, a portion of, or none of the xdr_buf.
+ * the server may Send all, a portion of, or none of the xdr_buf.
* In the latter case, only the transport header (sc_sges[0]) is
* transmitted.
*
- * RDMA Send is the last step of transmitting an RPC reply. Pages
- * involved in the earlier RDMA Writes are here transferred out
- * of the rqstp and into the sctxt's page array. These pages are
- * DMA unmapped by each Write completion, but the subsequent Send
- * completion finally releases these pages.
- *
* Assumptions:
* - The Reply's transport header will never be larger than a page.
*/
@@ -857,6 +882,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
const struct svc_rdma_recv_ctxt *rctxt,
struct svc_rqst *rqstp)
{
+ struct ib_send_wr *send_wr = &sctxt->sc_send_wr;
int ret;
ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl,
@@ -864,16 +890,19 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
if (ret < 0)
return ret;
+ /* Transfer pages involved in RDMA Writes to the sctxt's
+ * page array. Completion handling releases these pages.
+ */
svc_rdma_save_io_pages(rqstp, sctxt);
if (rctxt->rc_inv_rkey) {
- sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
- sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
+ send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey;
} else {
- sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ send_wr->opcode = IB_WR_SEND;
}
- return svc_rdma_send(rdma, sctxt);
+ return svc_rdma_post_send(rdma, sctxt);
}
/**
@@ -937,7 +966,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
sctxt->sc_send_wr.num_sge = 1;
sctxt->sc_send_wr.opcode = IB_WR_SEND;
sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
- if (svc_rdma_send(rdma, sctxt))
+ if (svc_rdma_post_send(rdma, sctxt))
goto put_ctxt;
return;
@@ -984,10 +1013,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (!p)
goto put_ctxt;
- ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+ ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res);
if (ret < 0)
- goto reply_chunk;
- rc_size = ret;
+ goto put_ctxt;
+
+ rc_size = 0;
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl)) {
+ ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl,
+ &rctxt->rc_reply_pcl, sctxt,
+ &rqstp->rq_res);
+ if (ret < 0)
+ goto reply_chunk;
+ rc_size = ret;
+ }
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
@@ -1030,45 +1068,33 @@ drop_connection:
/**
* svc_rdma_result_payload - special processing for a result payload
- * @rqstp: svc_rqst to operate on
- * @offset: payload's byte offset in @xdr
+ * @rqstp: RPC transaction context
+ * @offset: payload's byte offset in @rqstp->rq_res
* @length: size of payload, in bytes
*
+ * Assign the passed-in result payload to the current Write chunk,
+ * and advance cur_result_payload to the next Write chunk, if
+ * there is one.
+ *
* Return values:
* %0 if successful or nothing needed to be done
- * %-EMSGSIZE on XDR buffer overflow
* %-E2BIG if the payload was larger than the Write chunk
- * %-EINVAL if client provided too many segments
- * %-ENOMEM if rdma_rw context pool was exhausted
- * %-ENOTCONN if posting failed (connection is lost)
- * %-EIO if rdma_rw initialization failed (DMA mapping, etc)
*/
int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
unsigned int length)
{
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
struct svc_rdma_chunk *chunk;
- struct svcxprt_rdma *rdma;
- struct xdr_buf subbuf;
- int ret;
chunk = rctxt->rc_cur_result_payload;
if (!length || !chunk)
return 0;
rctxt->rc_cur_result_payload =
pcl_next_chunk(&rctxt->rc_write_pcl, chunk);
+
if (length > chunk->ch_length)
return -E2BIG;
-
chunk->ch_position = offset;
chunk->ch_payload_length = length;
-
- if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
- return -EMSGSIZE;
-
- rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
- ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
- if (ret < 0)
- return ret;
return 0;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4f27325ace4a..c3fbf0779d4a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -65,6 +65,8 @@
static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
struct net *net, int node);
+static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -122,6 +124,41 @@ static void qp_event_handler(struct ib_event *event, void *context)
}
}
+static struct rdma_cm_id *
+svc_rdma_create_listen_id(struct net *net, struct sockaddr *sap,
+ void *context)
+{
+ struct rdma_cm_id *listen_id;
+ int ret;
+
+ listen_id = rdma_create_id(net, svc_rdma_listen_handler, context,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(listen_id))
+ return listen_id;
+
+ /* Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = rdma_set_afonly(listen_id, 1);
+ if (ret)
+ goto out_destroy;
+#endif
+ ret = rdma_bind_addr(listen_id, sap);
+ if (ret)
+ goto out_destroy;
+
+ ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
+ if (ret)
+ goto out_destroy;
+
+ return listen_id;
+
+out_destroy:
+ rdma_destroy_id(listen_id);
+ return ERR_PTR(ret);
+}
+
static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
struct net *net, int node)
{
@@ -247,17 +284,31 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
*
* Return values:
* %0: Do not destroy @cma_id
- * %1: Destroy @cma_id (never returned here)
+ * %1: Destroy @cma_id
*
* NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners.
*/
static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id,
struct rdma_cm_event *event)
{
+ struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
+ struct svcxprt_rdma *cma_xprt = cma_id->context;
+ struct svc_xprt *cma_rdma = &cma_xprt->sc_xprt;
+ struct rdma_cm_id *listen_id;
+
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
handle_connect_req(cma_id, &event->param.conn);
break;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ listen_id = svc_rdma_create_listen_id(cma_rdma->xpt_net,
+ sap, cma_xprt);
+ if (IS_ERR(listen_id)) {
+ pr_err("Listener dead, address change failed for device %s\n",
+ cma_id->device->name);
+ } else
+ cma_xprt->sc_cm_id = listen_id;
+ return 1;
default:
break;
}
@@ -288,7 +339,6 @@ static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id,
svc_xprt_enqueue(xprt);
break;
case RDMA_CM_EVENT_DISCONNECTED:
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
svc_xprt_deferred_close(xprt);
break;
default:
@@ -307,7 +357,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
{
struct rdma_cm_id *listen_id;
struct svcxprt_rdma *cma_xprt;
- int ret;
if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
return ERR_PTR(-EAFNOSUPPORT);
@@ -317,30 +366,13 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
- listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt,
- RDMA_PS_TCP, IB_QPT_RC);
+ listen_id = svc_rdma_create_listen_id(net, sa, cma_xprt);
if (IS_ERR(listen_id)) {
- ret = PTR_ERR(listen_id);
- goto err0;
+ kfree(cma_xprt);
+ return ERR_CAST(listen_id);
}
-
- /* Allow both IPv4 and IPv6 sockets to bind a single port
- * at the same time.
- */
-#if IS_ENABLED(CONFIG_IPV6)
- ret = rdma_set_afonly(listen_id, 1);
- if (ret)
- goto err1;
-#endif
- ret = rdma_bind_addr(listen_id, sa);
- if (ret)
- goto err1;
cma_xprt->sc_cm_id = listen_id;
- ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
- if (ret)
- goto err1;
-
/*
* We need to use the address from the cm_id in case the
* caller specified 0 for the port number.
@@ -349,12 +381,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
return &cma_xprt->sc_xprt;
+}
- err1:
- rdma_destroy_id(listen_id);
- err0:
- kfree(cma_xprt);
- return ERR_PTR(ret);
+static void svc_rdma_xprt_done(struct rpcrdma_notification *rn)
+{
+ struct svcxprt_rdma *rdma = container_of(rn, struct svcxprt_rdma,
+ sc_rn);
+ struct rdma_cm_id *id = rdma->sc_cm_id;
+
+ trace_svcrdma_device_removal(id);
+ svc_xprt_close(&rdma->sc_xprt);
}
/*
@@ -398,6 +434,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dev = newxprt->sc_cm_id->device;
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
+ if (rpcrdma_rn_register(dev, &newxprt->sc_rn, svc_rdma_xprt_done))
+ goto errout;
+
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = svcrdma_max_requests;
newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
@@ -415,15 +454,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
- newxprt->sc_recv_batch;
+ newxprt->sc_recv_batch + 1 /* drain */;
if (rq_depth > dev->attrs.max_qp_wr) {
rq_depth = dev->attrs.max_qp_wr;
newxprt->sc_recv_batch = 1;
newxprt->sc_max_requests = rq_depth - 2;
newxprt->sc_max_bc_requests = 2;
}
- ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
- ctxts *= newxprt->sc_max_requests;
+
+ /* Arbitrarily estimate the number of rw_ctxs needed for
+ * this transport. This is enough rw_ctxs to make forward
+ * progress even if the client is using one rkey per page
+ * in each Read chunk.
+ */
+ ctxts = 3 * RPCSVC_MAXPAGES;
newxprt->sc_sq_depth = rq_depth + ctxts;
if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
@@ -460,12 +504,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
-
+ dprintk(" send CQ depth = %u, recv CQ depth = %u\n",
+ newxprt->sc_sq_depth, rq_depth);
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
trace_svcrdma_qp_err(newxprt, ret);
goto errout;
}
+ newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge;
newxprt->sc_qp = newxprt->sc_cm_id->qp;
if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
@@ -546,6 +592,7 @@ static void __svc_rdma_free(struct work_struct *work)
{
struct svcxprt_rdma *rdma =
container_of(work, struct svcxprt_rdma, sc_work);
+ struct ib_device *device = rdma->sc_cm_id->device;
/* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -574,6 +621,7 @@ static void __svc_rdma_free(struct work_struct *work)
/* Destroy the CM ID */
rdma_destroy_id(rdma->sc_cm_id);
+ rpcrdma_rn_unregister(device, &rdma->sc_rn);
kfree(rdma);
}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 29b0562d62e7..9a8ce5df83ca 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -137,7 +137,6 @@ static struct ctl_table xr_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { },
};
#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4f8d7efa469f..63262ef0c2e3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -49,14 +49,14 @@
* o buffer memory
*/
+#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/log2.h>
-#include <asm-generic/barrier.h>
-#include <asm/bitops.h>
+#include <asm/barrier.h>
#include <rdma/ib_cm.h>
@@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_sendctx *sc);
static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node);
+static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
@@ -222,7 +224,6 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
struct rpcrdma_ep *ep = id->context;
might_sleep();
@@ -241,10 +242,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ep->re_async_rc = -ENETUNREACH;
complete(&ep->re_done);
return 0;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- pr_info("rpcrdma: removing device %s for %pISpc\n",
- ep->re_id->device->name, sap);
- fallthrough;
case RDMA_CM_EVENT_ADDR_CHANGE:
ep->re_connect_status = -ENODEV;
goto disconnected;
@@ -280,6 +277,14 @@ disconnected:
return 0;
}
+static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn)
+{
+ struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn);
+
+ trace_xprtrdma_device_removal(ep->re_id);
+ xprt_force_disconnect(ep->re_xprt);
+}
+
static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_ep *ep)
{
@@ -319,6 +324,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
if (rc)
goto out;
+ rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done);
+ if (rc)
+ goto out;
+
return id;
out:
@@ -346,6 +355,8 @@ static void rpcrdma_ep_destroy(struct kref *kref)
ib_dealloc_pd(ep->re_pd);
ep->re_pd = NULL;
+ rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn);
+
kfree(ep);
module_put(THIS_MODULE);
}
@@ -501,7 +512,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
* outstanding Receives.
*/
rpcrdma_ep_get(ep);
- rpcrdma_post_recvs(r_xprt, 1, true);
+ rpcrdma_post_recvs(r_xprt, 1);
rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
@@ -893,6 +904,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
static void rpcrdma_req_reset(struct rpcrdma_req *req)
{
+ struct rpcrdma_mr *mr;
+
/* Credits are valid for only one connection */
req->rl_slot.rq_cong = 0;
@@ -902,7 +915,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req)
rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
- frwr_reset(req);
+ /* The verbs consumer can't know the state of an MR on the
+ * req->rl_registered list unless a successful completion
+ * has occurred, so such MRs cannot be re-used.
+ */
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+ struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+
+ spin_lock(&buf->rb_lock);
+ list_del(&mr->mr_all);
+ spin_unlock(&buf->rb_lock);
+
+ frwr_mr_release(mr);
+ }
}
/* ASSUMPTION: the rb_allreqs list is stable for the duration,
@@ -920,18 +945,20 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
}
static noinline
-struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
- bool temp)
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_device *device = ep->re_id->device;
struct rpcrdma_rep *rep;
rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS);
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
- DMA_FROM_DEVICE);
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv,
+ DMA_FROM_DEVICE,
+ ibdev_to_node(device));
if (!rep->rr_rdmabuf)
goto out_free;
@@ -946,7 +973,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
- rep->rr_temp = temp;
spin_lock(&buf->rb_lock);
list_add(&rep->rr_all, &buf->rb_all_reps);
@@ -965,17 +991,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
kfree(rep);
}
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
-{
- struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
-
- spin_lock(&buf->rb_lock);
- list_del(&rep->rr_all);
- spin_unlock(&buf->rb_lock);
-
- rpcrdma_rep_free(rep);
-}
-
static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
{
struct llist_node *node;
@@ -1007,10 +1022,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
+ list_for_each_entry(rep, &buf->rb_all_reps, rr_all)
rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
- rep->rr_temp = true; /* Mark this rep for destruction */
- }
}
static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
@@ -1227,14 +1240,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
* or Replies they may be registered externally via frwr_map.
*/
static struct rpcrdma_regbuf *
-rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction,
+ int node)
{
struct rpcrdma_regbuf *rb;
- rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS);
+ rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node);
if (!rb)
return NULL;
- rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS);
+ rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node);
if (!rb->rg_data) {
kfree(rb);
return NULL;
@@ -1246,6 +1260,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
return rb;
}
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction)
+{
+ return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE);
+}
+
/**
* rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer
* @rb: regbuf to reallocate
@@ -1323,10 +1343,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
* @needed: current credit grant
- * @temp: mark Receive buffers to be deleted after one use
*
*/
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = r_xprt->rx_ep;
@@ -1340,8 +1359,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
if (likely(ep->re_receive_count > needed))
goto out;
needed -= ep->re_receive_count;
- if (!temp)
- needed += RPCRDMA_MAX_RECV_BATCH;
+ needed += RPCRDMA_MAX_RECV_BATCH;
if (atomic_inc_return(&ep->re_receiving) > 1)
goto out;
@@ -1350,12 +1368,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
wr = NULL;
while (needed) {
rep = rpcrdma_rep_get_locked(buf);
- if (rep && rep->rr_temp) {
- rpcrdma_rep_destroy(rep);
- continue;
- }
if (!rep)
- rep = rpcrdma_rep_create(r_xprt, temp);
+ rep = rpcrdma_rep_create(r_xprt);
if (!rep)
break;
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index da409450dfc0..8147d2b41494 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -56,6 +56,7 @@
#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
+#include <linux/sunrpc/rdma_rn.h> /* removal notifications */
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
@@ -92,6 +93,7 @@ struct rpcrdma_ep {
struct rpcrdma_connect_private
re_cm_private;
struct rdma_conn_param re_remote_cma;
+ struct rpcrdma_notification re_rn;
int re_receive_count;
unsigned int re_max_requests; /* depends on device */
unsigned int re_inline_send; /* negotiated */
@@ -198,7 +200,6 @@ struct rpcrdma_rep {
__be32 rr_proc;
int rr_wc_flags;
u32 rr_inv_rkey;
- bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct rpc_rqst *rr_rqst;
@@ -466,7 +467,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed);
/*
* Buffer calls - xprtrdma/verbs.c
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 58f3dc8d0d71..83cc095846d3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -62,6 +62,7 @@
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
+static void xs_reset_srcport(struct sock_xprt *transport);
static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock);
static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
struct socket *sock);
@@ -159,7 +160,6 @@ static struct ctl_table xs_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { },
};
/*
@@ -883,6 +883,17 @@ static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf)
return xdr_alloc_bvec(buf, rpc_task_gfp_mask());
}
+static void xs_stream_abort_send_request(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+
+ if (transport->xmit.offset != 0 &&
+ !test_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ xprt_force_disconnect(xprt);
+}
+
/*
* Determine if the previous message in the stream was aborted before it
* could complete transmission.
@@ -1187,6 +1198,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
+ clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
}
static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
@@ -1267,6 +1279,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
transport->file = NULL;
sk->sk_user_data = NULL;
+ sk->sk_sndtimeo = 0;
xs_restore_old_callbacks(transport, sk);
xprt_clear_connected(xprt);
@@ -1565,8 +1578,10 @@ static void xs_tcp_state_change(struct sock *sk)
break;
case TCP_CLOSE:
if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
- &transport->sock_state))
+ &transport->sock_state)) {
+ xs_reset_srcport(transport);
xprt_clear_connecting(xprt);
+ }
clear_bit(XPRT_CLOSING, &xprt->state);
/* Trigger the socket release */
xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
@@ -1722,6 +1737,11 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
xs_update_peer_port(xprt);
}
+static void xs_reset_srcport(struct sock_xprt *transport)
+{
+ transport->srcport = 0;
+}
+
static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
{
if (transport->srcport == 0 && transport->xprt.reuseport)
@@ -1921,6 +1941,9 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
goto out;
}
+ if (protocol == IPPROTO_TCP)
+ sk_net_refcnt_upgrade(sock->sk);
+
filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
if (IS_ERR(filp))
return ERR_CAST(filp);
@@ -2423,6 +2446,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
transport->srcport = 0;
status = -EAGAIN;
break;
+ case -EPERM:
+ /* Happens, for instance, if a BPF program is preventing
+ * the connect. Remap the error so upper layers can better
+ * deal with it.
+ */
+ status = -ECONNREFUSED;
+ fallthrough;
case -EINVAL:
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.
@@ -2434,6 +2464,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EHOSTUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
+ case -ENOTCONN:
break;
default:
printk("%s: connect returned unhandled error %d\n",
@@ -2546,7 +2577,15 @@ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid)
struct sock_xprt *lower_transport =
container_of(lower_xprt, struct sock_xprt, xprt);
- lower_transport->xprt_err = status ? -EACCES : 0;
+ switch (status) {
+ case 0:
+ case -EACCES:
+ case -ETIMEDOUT:
+ lower_transport->xprt_err = status;
+ break;
+ default:
+ lower_transport->xprt_err = -EACCES;
+ }
complete(&lower_transport->handshake_done);
xprt_put(lower_xprt);
}
@@ -2588,11 +2627,10 @@ static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_par
rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done,
XS_TLS_HANDSHAKE_TO);
if (rc <= 0) {
- if (!tls_handshake_cancel(sk)) {
- if (rc == 0)
- rc = -ETIMEDOUT;
- goto out_put_xprt;
- }
+ tls_handshake_cancel(sk);
+ if (rc == 0)
+ rc = -ETIMEDOUT;
+ goto out_put_xprt;
}
rc = lower_transport->xprt_err;
@@ -2645,6 +2683,7 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work)
.xprtsec = {
.policy = RPC_XPRTSEC_NONE,
},
+ .stats = upper_clnt->cl_stats,
};
unsigned int pflags = current->flags;
struct rpc_clnt *lower_clnt;
@@ -2987,20 +3026,11 @@ static int bc_send_request(struct rpc_rqst *req)
return len;
}
-/*
- * The close routine. Since this is client initiated, we do nothing
- */
-
static void bc_close(struct rpc_xprt *xprt)
{
xprt_disconnect_done(xprt);
}
-/*
- * The xprt destroy routine. Again, because this connection is client
- * initiated, we do nothing
- */
-
static void bc_destroy(struct rpc_xprt *xprt)
{
dprintk("RPC: bc_destroy xprt %p\n", xprt);
@@ -3021,6 +3051,7 @@ static const struct rpc_xprt_ops xs_local_ops = {
.buf_free = rpc_free,
.prepare_request = xs_stream_prepare_request,
.send_request = xs_local_send_request,
+ .abort_send_request = xs_stream_abort_send_request,
.wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = xs_close,
.destroy = xs_destroy,
@@ -3068,6 +3099,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
.buf_free = rpc_free,
.prepare_request = xs_stream_prepare_request,
.send_request = xs_tcp_send_request,
+ .abort_send_request = xs_stream_abort_send_request,
.wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = xs_tcp_shutdown,
.destroy = xs_destroy,