diff options
Diffstat (limited to 'net/sunrpc')
61 files changed, 8387 insertions, 5036 deletions
diff --git a/net/sunrpc/.kunitconfig b/net/sunrpc/.kunitconfig new file mode 100644 index 000000000000..eb02b906c295 --- /dev/null +++ b/net/sunrpc/.kunitconfig @@ -0,0 +1,29 @@ +CONFIG_KUNIT=y +CONFIG_UBSAN=y +CONFIG_STACKTRACE=y +CONFIG_NET=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_FILE_LOCKING=y +CONFIG_MULTIUSER=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CTS=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_DES=y +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_CAMELLIA=y +CONFIG_NFS_FS=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1=y +CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA=y +CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2=y +CONFIG_RPCSEC_GSS_KRB5_KUNIT_TEST=y diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index bbbb5af0af13..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -19,10 +19,10 @@ config SUNRPC_SWAP config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" depends on SUNRPC && CRYPTO - depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS - depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES default y select SUNRPC_GSS + select CRYPTO_SKCIPHER + select CRYPTO_HASH help Choose Y here to enable Secure RPC using the Kerberos version 5 GSS-API mechanism (RFC 1964). @@ -34,21 +34,58 @@ config RPCSEC_GSS_KRB5 If unsure, say Y. 
-config SUNRPC_DISABLE_INSECURE_ENCTYPES - bool "Secure RPC: Disable insecure Kerberos encryption types" +config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1 + bool "Enable Kerberos enctypes based on AES and SHA-1" depends on RPCSEC_GSS_KRB5 + depends on CRYPTO_CBC && CRYPTO_CTS + depends on CRYPTO_HMAC && CRYPTO_SHA1 + depends on CRYPTO_AES + default y + help + Choose Y to enable the use of Kerberos 5 encryption types + that utilize Advanced Encryption Standard (AES) ciphers and + SHA-1 digests. These include aes128-cts-hmac-sha1-96 and + aes256-cts-hmac-sha1-96. + +config RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA + bool "Enable Kerberos encryption types based on Camellia and CMAC" + depends on RPCSEC_GSS_KRB5 + depends on CRYPTO_CBC && CRYPTO_CTS && CRYPTO_CAMELLIA + depends on CRYPTO_CMAC default n help - Choose Y here to disable the use of deprecated encryption types - with the Kerberos version 5 GSS-API mechanism (RFC 1964). The - deprecated encryption types include DES-CBC-MD5, DES-CBC-CRC, - and DES-CBC-MD4. These types were deprecated by RFC 6649 because - they were found to be insecure. + Choose Y to enable the use of Kerberos 5 encryption types + that utilize Camellia ciphers (RFC 3713) and CMAC digests + (NIST Special Publication 800-38B). These include + camellia128-cts-cmac and camellia256-cts-cmac. - N is the default because many sites have deployed KDCs and - keytabs that contain only these deprecated encryption types. - Choosing Y prevents the use of known-insecure encryption types - but might result in compatibility problems. +config RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2 + bool "Enable Kerberos enctypes based on AES and SHA-2" + depends on RPCSEC_GSS_KRB5 + depends on CRYPTO_CBC && CRYPTO_CTS + depends on CRYPTO_HMAC && CRYPTO_SHA256 && CRYPTO_SHA512 + depends on CRYPTO_AES + default n + help + Choose Y to enable the use of Kerberos 5 encryption types + that utilize Advanced Encryption Standard (AES) ciphers and + SHA-2 digests. 
These include aes128-cts-hmac-sha256-128 and + aes256-cts-hmac-sha384-192. + +config RPCSEC_GSS_KRB5_KUNIT_TEST + tristate "KUnit tests for RPCSEC GSS Kerberos" if !KUNIT_ALL_TESTS + depends on RPCSEC_GSS_KRB5 && KUNIT + default KUNIT_ALL_TESTS + help + This builds the KUnit tests for RPCSEC GSS Kerberos 5. + + KUnit tests run during boot and output the results to the debug + log in TAP format (https://testanything.org/). Only useful for + kernel devs running KUnit test harness and are not for inclusion + into a production build. + + For more information on KUnit and unit tests in general, refer + to the KUnit documentation in Documentation/dev-tools/kunit/. config SUNRPC_DEBUG bool "RPC: Enable dprintk debugging" @@ -64,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. 
+ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 1c8de397d6ad..f89c10fe7e6a 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ - auth.o auth_null.o auth_unix.o \ + auth.o auth_null.o auth_tls.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o sysfs.o \ diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index d435bffc6199..97ff11973c49 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -284,10 +284,10 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags) } if (snprintf(portbuf, sizeof(portbuf), - ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf)) + ".%u.%u", port >> 8, port & 0xff) >= (int)sizeof(portbuf)) return NULL; - if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf)) + if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) >= sizeof(addrbuf)) return NULL; return kstrdup(addrbuf, gfp_flags); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index fb75a883503f..5a827afd8e3b 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -32,7 +32,7 @@ static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, - NULL, /* others can be loadable modules */ + [RPC_AUTH_TLS] = (const struct rpc_authops __force __rcu *)&authtls_ops, }; static LIST_HEAD(cred_unused); @@ -40,9 +40,6 @@ static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), -#ifdef CONFIG_DEBUG_CREDENTIALS - .magic = CRED_MAGIC, 
-#endif }; /* @@ -492,7 +489,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return number_cred_unused * sysctl_vfs_cache_pressure / 100; + return number_cred_unused; } static void @@ -769,9 +766,14 @@ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * - * On success, @xdr is updated to point past the verifier and - * zero is returned. Otherwise, @xdr is in an undefined state - * and a negative errno is returned. + * Return values: + * %0: Verifier is valid. @xdr now points past the verifier. + * %-EIO: Verifier is corrupted or message ended early. + * %-EACCES: Verifier is intact but not valid. + * %-EPROTONOSUPPORT: Server does not support the requested auth type. + * + * When a negative errno is returned, @xdr is left in an undefined + * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) @@ -861,11 +863,7 @@ rpcauth_uptodatecred(struct rpc_task *task) test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } -static struct shrinker rpc_cred_shrinker = { - .count_objects = rpcauth_cache_shrink_count, - .scan_objects = rpcauth_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { @@ -874,9 +872,17 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); - if (err < 0) + rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); + if (!rpc_cred_shrinker) { + err = -ENOMEM; goto out2; + } + + rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; + rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; + + shrinker_register(rpc_cred_shrinker); + return 0; out2: rpc_destroy_authunix(); @@ -887,5 +893,5 @@ out1: void rpcauth_remove_module(void) { rpc_destroy_authunix(); - 
unregister_shrinker(&rpc_cred_shrinker); + shrinker_free(rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index 4a29f4c5dac4..452f67deebc6 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,11 +5,13 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o -auth_rpcgss-y := auth_gss.o gss_generic_token.o \ +auth_rpcgss-y := auth_gss.o \ gss_mech_switch.o svcauth_gss.o \ gss_rpc_upcall.o gss_rpc_xdr.o trace.o obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o + +obj-$(CONFIG_RPCSEC_GSS_KRB5_KUNIT_TEST) += gss_krb5_test.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 2d7b1e03110a..5c095cb8cb20 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -49,6 +49,22 @@ static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; # define RPCDBG_FACILITY RPCDBG_AUTH #endif +/* + * This compile-time check verifies that we will not exceed the + * slack space allotted by the client and server auth_gss code + * before they call gss_wrap(). 
+ */ +#define GSS_KRB5_MAX_SLACK_NEEDED \ + (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */ \ + + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + + XDR_UNIT * 2 /* RPC verifier */ \ + + GSS_KRB5_TOK_HDR_LEN \ + + GSS_KRB5_MAX_CKSUM_LEN) + #define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ @@ -871,25 +887,16 @@ static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; - struct rpc_pipe *pipe = gss_pipe->pipe; - if (pipe->dentry != NULL) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(gss_pipe->pipe); } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - p->pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { @@ -1042,6 +1049,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; + BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; @@ -1528,6 +1536,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) struct kvec iov; struct xdr_buf verf_buf; int status; + u32 seqno; /* Credential */ @@ -1539,15 +1548,16 @@ static int gss_marshal(struct rpc_task *task, struct 
xdr_stream *xdr) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); - if (req->rq_seqno == MAXSEQ) + if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); - *p++ = cpu_to_be32(req->rq_seqno); + *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); @@ -1661,17 +1671,31 @@ gss_refresh_null(struct rpc_task *task) return 0; } +static u32 +gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) +{ + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + + *seq = cpu_to_be32(seqno); + iov.iov_base = seq; + iov.iov_len = 4; + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); +} + static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; - struct kvec iov; - struct xdr_buf verf_buf; - struct xdr_netobj mic; u32 len, maj_stat; int status; + int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) @@ -1688,13 +1712,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; - *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); - iov.iov_base = seq; - iov.iov_len = 4; - xdr_buf_from_iov(&iov, &verf_buf); - mic.data = (u8 *)p; - mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); + /* RFC 2203 5.3.3.1 - compute 
the checksum of each sequence number in the cache */ + while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) @@ -1733,7 +1754,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; integ_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1830,7 +1851,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; opaque_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1858,8 +1879,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ - if (unlikely(snd_buf->len > snd_buf->buflen)) + if (unlikely(snd_buf->len > snd_buf->buflen)) { + status = -EIO; goto wrap_failed; + } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) @@ -1982,7 +2005,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (seqno != rqstp->rq_seqno) + if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; @@ -2026,7 +2049,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto 
out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); @@ -2058,7 +2081,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ - if (be32_to_cpup(p++) != rqstp->rq_seqno) + if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. @@ -2074,7 +2097,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); @@ -2099,14 +2122,14 @@ gss_xmit_need_reencode(struct rpc_task *task) if (!ctx) goto out; - if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); - while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; - seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; @@ -2115,7 +2138,7 @@ gss_xmit_need_reencode(struct rpc_task *task) win = ctx->gc_win; if (win > 0) - ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); + ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); @@ -2263,6 +2286,7 @@ static void __exit exit_rpcsec_gss(void) } MODULE_ALIAS("rpc-auth-6"); +MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h index c53b329092d4..4ebc1b7043d9 100644 --- a/net/sunrpc/auth_gss/auth_gss_internal.h +++ 
b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len) } static inline const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest) { const void *q; unsigned int len; @@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); if (len) { - dest->data = kmemdup(p, len, GFP_KERNEL); + dest->data = kmemdup_noprof(p, len, GFP_KERNEL); if (unlikely(dest->data == NULL)) return ERR_PTR(-ENOMEM); } else @@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) dest->len = len; return q; } + +#define simple_get_netobj(...) alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__)) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c deleted file mode 100644 index 4a4082bb22ad..000000000000 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ /dev/null @@ -1,231 +0,0 @@ -/* - * linux/net/sunrpc/gss_generic_token.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. 
OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. - * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/sunrpc/sched.h> -#include <linux/sunrpc/gss_asn1.h> - - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - - -/* TWRITE_STR from gssapiP_generic.h */ -#define TWRITE_STR(ptr, str, len) \ - memcpy((ptr), (char *) (str), (len)); \ - (ptr) += (len); - -/* XXXX this code currently makes the assumption that a mech oid will - never be longer than 127 bytes. This assumption is not inherent in - the interfaces, so the code can be fixed if the OSI namespace - balloons unexpectedly. */ - -/* Each token looks like this: - -0x60 tag for APPLICATION 0, SEQUENCE - (constructed, definite-length) - <length> possible multiple bytes, need to parse/generate - 0x06 tag for OBJECT IDENTIFIER - <moid_length> compile-time constant string (assume 1 byte) - <moid_bytes> compile-time constant string - <inner_bytes> the ANY containing the application token - bytes 0,1 are the token type - bytes 2,n are the token data - -For the purposes of this abstraction, the token "header" consists of -the sequence tag and length octets, the mech OID DER encoding, and the -first two inner bytes, which indicate the token type. The token -"body" consists of everything else. 
- -*/ - -static int -der_length_size( int length) -{ - if (length < (1<<7)) - return 1; - else if (length < (1<<8)) - return 2; -#if (SIZEOF_INT == 2) - else - return 3; -#else - else if (length < (1<<16)) - return 3; - else if (length < (1<<24)) - return 4; - else - return 5; -#endif -} - -static void -der_write_length(unsigned char **buf, int length) -{ - if (length < (1<<7)) { - *(*buf)++ = (unsigned char) length; - } else { - *(*buf)++ = (unsigned char) (der_length_size(length)+127); -#if (SIZEOF_INT > 2) - if (length >= (1<<24)) - *(*buf)++ = (unsigned char) (length>>24); - if (length >= (1<<16)) - *(*buf)++ = (unsigned char) ((length>>16)&0xff); -#endif - if (length >= (1<<8)) - *(*buf)++ = (unsigned char) ((length>>8)&0xff); - *(*buf)++ = (unsigned char) (length&0xff); - } -} - -/* returns decoded length, or < 0 on failure. Advances buf and - decrements bufsize */ - -static int -der_read_length(unsigned char **buf, int *bufsize) -{ - unsigned char sf; - int ret; - - if (*bufsize < 1) - return -1; - sf = *(*buf)++; - (*bufsize)--; - if (sf & 0x80) { - if ((sf &= 0x7f) > ((*bufsize)-1)) - return -1; - if (sf > SIZEOF_INT) - return -1; - ret = 0; - for (; sf; sf--) { - ret = (ret<<8) + (*(*buf)++); - (*bufsize)--; - } - } else { - ret = sf; - } - - return ret; -} - -/* returns the length of a token, given the mech oid and the body size */ - -int -g_token_size(struct xdr_netobj *mech, unsigned int body_size) -{ - /* set body_size to sequence contents size */ - body_size += 2 + (int) mech->len; /* NEED overflow check */ - return 1 + der_length_size(body_size) + body_size; -} - -EXPORT_SYMBOL_GPL(g_token_size); - -/* fills in a buffer with the token header. The buffer is assumed to - be the right size. 
buf is advanced past the token header */ - -void -g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) -{ - *(*buf)++ = 0x60; - der_write_length(buf, 2 + mech->len + body_size); - *(*buf)++ = 0x06; - *(*buf)++ = (unsigned char) mech->len; - TWRITE_STR(*buf, mech->data, ((int) mech->len)); -} - -EXPORT_SYMBOL_GPL(g_make_token_header); - -/* - * Given a buffer containing a token, reads and verifies the token, - * leaving buf advanced past the token header, and setting body_size - * to the number of remaining bytes. Returns 0 on success, - * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the - * mechanism in the token does not match the mech argument. buf and - * *body_size are left unmodified on error. - */ -u32 -g_verify_token_header(struct xdr_netobj *mech, int *body_size, - unsigned char **buf_in, int toksize) -{ - unsigned char *buf = *buf_in; - int seqsize; - struct xdr_netobj toid; - int ret = 0; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x60) - return G_BAD_TOK_HEADER; - - if ((seqsize = der_read_length(&buf, &toksize)) < 0) - return G_BAD_TOK_HEADER; - - if (seqsize != toksize) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x06) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - toid.len = *buf++; - - if ((toksize-=toid.len) < 0) - return G_BAD_TOK_HEADER; - toid.data = buf; - buf+=toid.len; - - if (! 
g_OID_equal(&toid, mech)) - ret = G_WRONG_MECH; - - /* G_WRONG_MECH is not returned immediately because it's more important - to return G_BAD_TOK_HEADER if the token header is in fact bad */ - - if ((toksize-=2) < 0) - return G_BAD_TOK_HEADER; - - if (ret) - return ret; - - *buf_in = buf; - *body_size = toksize; - - return ret; -} - -EXPORT_SYMBOL_GPL(g_verify_token_header); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 3ea58175e159..16dcf115de1e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -34,9 +34,9 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ -#include <crypto/algapi.h> #include <crypto/hash.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <linux/err.h> #include <linux/types.h> #include <linux/mm.h> @@ -46,11 +46,59 @@ #include <linux/random.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> +#include <kunit/visibility.h> + +#include "gss_krb5_internal.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif +/** + * krb5_make_confounder - Generate a confounder string + * @p: memory location into which to write the string + * @conflen: string length to write, in octets + * + * RFCs 1964 and 3961 mention only "a random confounder" without going + * into detail about its function or cryptographic requirements. The + * assumed purpose is to prevent repeated encryption of a plaintext with + * the same key from generating the same ciphertext. It is also used to + * pad minimum plaintext length to at least a single cipher block. + * + * However, in situations like the GSS Kerberos 5 mechanism, where the + * encryption IV is always all zeroes, the confounder also effectively + * functions like an IV. Thus, not only must it be unique from message + * to message, but it must also be difficult to predict. 
Otherwise an + * attacker can correlate the confounder to previous or future values, + * making the encryption easier to break. + * + * Given that the primary consumer of this encryption mechanism is a + * network storage protocol, a type of traffic that often carries + * predictable payloads (eg, all zeroes when reading unallocated blocks + * from a file), our confounder generation has to be cryptographically + * strong. + */ +void krb5_make_confounder(u8 *p, int conflen) +{ + get_random_bytes(p, conflen); +} + +/** + * krb5_encrypt - simple encryption of an RPCSEC GSS payload + * @tfm: initialized cipher transform + * @iv: pointer to an IV + * @in: plaintext to encrypt + * @out: OUT: ciphertext + * @length: length of input and output buffers, in bytes + * + * @iv may be NULL to force the use of an all-zero IV. + * The buffer containing the IV must be as large as the + * cipher's ivsize. + * + * Return values: + * %0: @in successfully encrypted into @out + * negative errno: @in not encrypted + */ u32 krb5_encrypt( struct crypto_sync_skcipher *tfm, @@ -90,44 +138,6 @@ out: return ret; } -u32 -krb5_decrypt( - struct crypto_sync_skcipher *tfm, - void * iv, - void * in, - void * out, - int length) -{ - u32 ret = -EINVAL; - struct scatterlist sg[1]; - u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - if (length % crypto_sync_skcipher_blocksize(tfm) != 0) - goto out; - - if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { - dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_sync_skcipher_ivsize(tfm)); - goto out; - } - if (iv) - memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); - - memcpy(out, in, length); - sg_init_one(sg, out, length); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, length, local_iv); - - ret = crypto_skcipher_decrypt(req); - skcipher_request_zero(req); -out: - dprintk("RPC: 
gss_k5decrypt returns %d\n",ret); - return ret; -} - static int checksummer(struct scatterlist *sg, void *data) { @@ -138,182 +148,78 @@ checksummer(struct scatterlist *sg, void *data) return crypto_ahash_update(req); } -/* - * checksum the plaintext data and hdrlen bytes of the token header - * The checksum is performed over the first 8 bytes of the - * gss token header and then over the data body +/** + * gss_krb5_checksum - Compute the MAC for a GSS Wrap or MIC token + * @tfm: an initialized hash transform + * @header: pointer to a buffer containing the token header, or NULL + * @hdrlen: number of octets in @header + * @body: xdr_buf containing an RPC message (body.len is the message length) + * @body_offset: byte offset into @body to start checksumming + * @cksumout: OUT: a buffer to be filled in with the computed HMAC + * + * Usually expressed as H = HMAC(K, message)[1..h] . + * + * Caller provides the truncation length of the output token (h) in + * cksumout.len. + * + * Return values: + * %GSS_S_COMPLETE: Digest computed, @cksumout filled in + * %GSS_S_FAILURE: Call failed */ u32 -make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout) +gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, + const struct xdr_buf *body, int body_offset, + struct xdr_netobj *cksumout) { - struct crypto_ahash *tfm; struct ahash_request *req; - struct scatterlist sg[1]; - int err = -1; + int err = -ENOMEM; u8 *checksumdata; - unsigned int checksumlen; - - if (cksumout->len < kctx->gk5e->cksumlength) { - dprintk("%s: checksum buffer length, %u, too small for %s\n", - __func__, cksumout->len, kctx->gk5e->name); - return GSS_S_FAILURE; - } - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); - if (checksumdata == NULL) + checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL); + if (!checksumdata) return GSS_S_FAILURE; - tfm = 
crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto out_free_cksum; - req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) - goto out_free_ahash; - + goto out_free_cksum; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - checksumlen = crypto_ahash_digestsize(tfm); - - if (cksumkey != NULL) { - err = crypto_ahash_setkey(tfm, cksumkey, - kctx->gk5e->keylength); - if (err) - goto out; - } - err = crypto_ahash_init(req); if (err) - goto out; - sg_init_one(sg, header, hdrlen); - ahash_request_set_crypt(req, sg, NULL, hdrlen); - err = crypto_ahash_update(req); - if (err) - goto out; - err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, req); - if (err) - goto out; - ahash_request_set_crypt(req, NULL, checksumdata, 0); - err = crypto_ahash_final(req); - if (err) - goto out; - - switch (kctx->gk5e->ctype) { - case CKSUMTYPE_RSA_MD5: - err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata, - checksumdata, checksumlen); - if (err) - goto out; - memcpy(cksumout->data, - checksumdata + checksumlen - kctx->gk5e->cksumlength, - kctx->gk5e->cksumlength); - break; - case CKSUMTYPE_HMAC_SHA1_DES3: - memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); - break; - default: - BUG(); - break; - } - cksumout->len = kctx->gk5e->cksumlength; -out: - ahash_request_free(req); -out_free_ahash: - crypto_free_ahash(tfm); -out_free_cksum: - kfree(checksumdata); - return err ? GSS_S_FAILURE : 0; -} - -/* - * checksum the plaintext data and hdrlen bytes of the token header - * Per rfc4121, sec. 4.2.4, the checksum is performed over the data - * body then over the first 16 octets of the MIC token - * Inclusion of the header data in the calculation of the - * checksum is optional. 
- */ -u32 -make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout) -{ - struct crypto_ahash *tfm; - struct ahash_request *req; - struct scatterlist sg[1]; - int err = -1; - u8 *checksumdata; - - if (kctx->gk5e->keyed_cksum == 0) { - dprintk("%s: expected keyed hash for %s\n", - __func__, kctx->gk5e->name); - return GSS_S_FAILURE; - } - if (cksumkey == NULL) { - dprintk("%s: no key supplied for %s\n", - __func__, kctx->gk5e->name); - return GSS_S_FAILURE; - } - - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); - if (!checksumdata) - return GSS_S_FAILURE; - - tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto out_free_cksum; - - req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) goto out_free_ahash; - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - err = crypto_ahash_setkey(tfm, cksumkey, kctx->gk5e->keylength); - if (err) - goto out; - - err = crypto_ahash_init(req); - if (err) - goto out; + /* + * Per RFC 4121 Section 4.2.4, the checksum is performed over the + * data body first, then over the octets in "header". 
+ */ err = xdr_process_buf(body, body_offset, body->len - body_offset, checksummer, req); if (err) - goto out; - if (header != NULL) { + goto out_free_ahash; + if (header) { + struct scatterlist sg[1]; + sg_init_one(sg, header, hdrlen); ahash_request_set_crypt(req, sg, NULL, hdrlen); err = crypto_ahash_update(req); if (err) - goto out; + goto out_free_ahash; } + ahash_request_set_crypt(req, NULL, checksumdata, 0); err = crypto_ahash_final(req); if (err) - goto out; + goto out_free_ahash; + + memcpy(cksumout->data, checksumdata, + min_t(int, cksumout->len, crypto_ahash_digestsize(tfm))); - cksumout->len = kctx->gk5e->cksumlength; - - switch (kctx->gk5e->ctype) { - case CKSUMTYPE_HMAC_SHA1_96_AES128: - case CKSUMTYPE_HMAC_SHA1_96_AES256: - /* note that this truncates the hash */ - memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); - break; - default: - BUG(); - break; - } -out: - ahash_request_free(req); out_free_ahash: - crypto_free_ahash(tfm); + ahash_request_free(req); out_free_cksum: - kfree(checksumdata); - return err ? GSS_S_FAILURE : 0; + kfree_sensitive(checksumdata); + return err ? 
GSS_S_FAILURE : GSS_S_COMPLETE; } +EXPORT_SYMBOL_IF_KUNIT(gss_krb5_checksum); struct encryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; @@ -392,35 +298,6 @@ encryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset, struct page **pages) -{ - int ret; - struct encryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.pos = offset; - desc.outbuf = buf; - desc.pages = pages; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.infrags, 4); - sg_init_table(desc.outfrags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc); - skcipher_request_zero(req); - return ret; -} - struct decryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; struct skcipher_request *req; @@ -475,32 +352,6 @@ decryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset) -{ - int ret; - struct decryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.frags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); - skcipher_request_zero(req); - return ret; -} - /* * This function makes the assumption that it was ultimately called * from gss_wrap(). 
@@ -526,7 +377,6 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) if (shiftlen == 0) return 0; - BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE); p = buf->head[0].iov_base + base; @@ -590,45 +440,172 @@ gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf, ret = write_bytes_to_xdr_buf(buf, offset, data, len); +#if IS_ENABLED(CONFIG_KUNIT) + /* + * CBC-CTS does not define an output IV but RFC 3962 defines it as the + * penultimate block of ciphertext, so copy that into the IV buffer + * before returning. + */ + if (encrypt) + memcpy(iv, data, crypto_sync_skcipher_ivsize(cipher)); +#endif + out: kfree(data); return ret; } +/** + * krb5_cbc_cts_encrypt - encrypt in CBC mode with CTS + * @cts_tfm: CBC cipher with CTS + * @cbc_tfm: base CBC cipher + * @offset: starting byte offset for plaintext + * @buf: OUT: output buffer + * @pages: plaintext + * @iv: output CBC initialization vector, or NULL + * @ivsize: size of @iv, in octets + * + * To provide confidentiality, encrypt using cipher block chaining + * with ciphertext stealing. Message integrity is handled separately. + * + * Return values: + * %0: encryption successful + * negative errno: encryption could not be completed + */ +VISIBLE_IF_KUNIT +int krb5_cbc_cts_encrypt(struct crypto_sync_skcipher *cts_tfm, + struct crypto_sync_skcipher *cbc_tfm, + u32 offset, struct xdr_buf *buf, struct page **pages, + u8 *iv, unsigned int ivsize) +{ + u32 blocksize, nbytes, nblocks, cbcbytes; + struct encryptor_desc desc; + int err; + + blocksize = crypto_sync_skcipher_blocksize(cts_tfm); + nbytes = buf->len - offset; + nblocks = (nbytes + blocksize - 1) / blocksize; + cbcbytes = 0; + if (nblocks > 2) + cbcbytes = (nblocks - 2) * blocksize; + + memset(desc.iv, 0, sizeof(desc.iv)); + + /* Handle block-sized chunks of plaintext with CBC. 
*/ + if (cbcbytes) { + SYNC_SKCIPHER_REQUEST_ON_STACK(req, cbc_tfm); + + desc.pos = offset; + desc.fragno = 0; + desc.fraglen = 0; + desc.pages = pages; + desc.outbuf = buf; + desc.req = req; + + skcipher_request_set_sync_tfm(req, cbc_tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + sg_init_table(desc.infrags, 4); + sg_init_table(desc.outfrags, 4); + + err = xdr_process_buf(buf, offset, cbcbytes, encryptor, &desc); + skcipher_request_zero(req); + if (err) + return err; + } + + /* Remaining plaintext is handled with CBC-CTS. */ + err = gss_krb5_cts_crypt(cts_tfm, buf, offset + cbcbytes, + desc.iv, pages, 1); + if (err) + return err; + + if (unlikely(iv)) + memcpy(iv, desc.iv, ivsize); + return 0; +} +EXPORT_SYMBOL_IF_KUNIT(krb5_cbc_cts_encrypt); + +/** + * krb5_cbc_cts_decrypt - decrypt in CBC mode with CTS + * @cts_tfm: CBC cipher with CTS + * @cbc_tfm: base CBC cipher + * @offset: starting byte offset for ciphertext (currently unused; decryption starts at byte 0 of @buf) + * @buf: IN: ciphertext; OUT: decrypted plaintext + * + * Return values: + * %0: decryption successful + * negative errno: decryption could not be completed + */ +VISIBLE_IF_KUNIT +int krb5_cbc_cts_decrypt(struct crypto_sync_skcipher *cts_tfm, + struct crypto_sync_skcipher *cbc_tfm, + u32 offset, struct xdr_buf *buf) +{ + u32 blocksize, nblocks, cbcbytes; + struct decryptor_desc desc; + int err; + + blocksize = crypto_sync_skcipher_blocksize(cts_tfm); + nblocks = (buf->len + blocksize - 1) / blocksize; + cbcbytes = 0; + if (nblocks > 2) + cbcbytes = (nblocks - 2) * blocksize; + + memset(desc.iv, 0, sizeof(desc.iv)); + + /* Handle block-sized chunks of ciphertext with CBC. 
+ */ + if (cbcbytes) { + SYNC_SKCIPHER_REQUEST_ON_STACK(req, cbc_tfm); + + desc.fragno = 0; + desc.fraglen = 0; + desc.req = req; + + skcipher_request_set_sync_tfm(req, cbc_tfm); + skcipher_request_set_callback(req, 0, NULL, NULL); + + sg_init_table(desc.frags, 4); + + err = xdr_process_buf(buf, 0, cbcbytes, decryptor, &desc); + skcipher_request_zero(req); + if (err) + return err; + } + + /* Remaining ciphertext is handled with CBC-CTS. */ + return gss_krb5_cts_crypt(cts_tfm, buf, cbcbytes, desc.iv, NULL, 0); +} +EXPORT_SYMBOL_IF_KUNIT(krb5_cbc_cts_decrypt); + u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages) { u32 err; struct xdr_netobj hmac; - u8 *cksumkey; u8 *ecptr; struct crypto_sync_skcipher *cipher, *aux_cipher; - int blocksize; + struct crypto_ahash *ahash; struct page **save_pages; - int nblocks, nbytes; - struct encryptor_desc desc; - u32 cbcbytes; - unsigned int usage; + unsigned int conflen; if (kctx->initiate) { cipher = kctx->initiator_enc; aux_cipher = kctx->initiator_enc_aux; - cksumkey = kctx->initiator_integ; - usage = KG_USAGE_INITIATOR_SEAL; + ahash = kctx->initiator_integ; } else { cipher = kctx->acceptor_enc; aux_cipher = kctx->acceptor_enc_aux; - cksumkey = kctx->acceptor_integ; - usage = KG_USAGE_ACCEPTOR_SEAL; + ahash = kctx->acceptor_integ; } - blocksize = crypto_sync_skcipher_blocksize(cipher); + conflen = crypto_sync_skcipher_blocksize(cipher); /* hide the gss token header and insert the confounder */ offset += GSS_KRB5_TOK_HDR_LEN; - if (xdr_extend_head(buf, offset, kctx->gk5e->conflen)) + if (xdr_extend_head(buf, offset, conflen)) return GSS_S_FAILURE; - gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen); + krb5_make_confounder(buf->head[0].iov_base + offset, conflen); offset -= GSS_KRB5_TOK_HDR_LEN; if (buf->tail[0].iov_base != NULL) { @@ -645,8 +622,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; 
buf->len += GSS_KRB5_TOK_HDR_LEN; - /* Do the HMAC */ - hmac.len = GSS_KRB5_MAX_CKSUM_LEN; + hmac.len = kctx->gk5e->cksumlength; hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; /* @@ -659,152 +635,321 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, save_pages = buf->pages; buf->pages = pages; - err = make_checksum_v2(kctx, NULL, 0, buf, - offset + GSS_KRB5_TOK_HDR_LEN, - cksumkey, usage, &hmac); + err = gss_krb5_checksum(ahash, NULL, 0, buf, + offset + GSS_KRB5_TOK_HDR_LEN, &hmac); buf->pages = save_pages; if (err) return GSS_S_FAILURE; - nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; - nblocks = (nbytes + blocksize - 1) / blocksize; - cbcbytes = 0; - if (nblocks > 2) - cbcbytes = (nblocks - 2) * blocksize; - - memset(desc.iv, 0, sizeof(desc.iv)); - - if (cbcbytes) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); - - desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; - desc.fragno = 0; - desc.fraglen = 0; - desc.pages = pages; - desc.outbuf = buf; - desc.req = req; - - skcipher_request_set_sync_tfm(req, aux_cipher); - skcipher_request_set_callback(req, 0, NULL, NULL); - - sg_init_table(desc.infrags, 4); - sg_init_table(desc.outfrags, 4); - - err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, - cbcbytes, encryptor, &desc); - skcipher_request_zero(req); - if (err) - goto out_err; - } - - /* Make sure IV carries forward from any CBC results. 
*/ - err = gss_krb5_cts_crypt(cipher, buf, - offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes, - desc.iv, pages, 1); - if (err) { - err = GSS_S_FAILURE; - goto out_err; - } + err = krb5_cbc_cts_encrypt(cipher, aux_cipher, + offset + GSS_KRB5_TOK_HDR_LEN, + buf, pages, NULL, 0); + if (err) + return GSS_S_FAILURE; /* Now update buf to account for HMAC */ buf->tail[0].iov_len += kctx->gk5e->cksumlength; buf->len += kctx->gk5e->cksumlength; -out_err: - if (err) - err = GSS_S_FAILURE; - return err; + return GSS_S_COMPLETE; } u32 gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, struct xdr_buf *buf, u32 *headskip, u32 *tailskip) { - struct xdr_buf subbuf; - u32 ret = 0; - u8 *cksum_key; struct crypto_sync_skcipher *cipher, *aux_cipher; + struct crypto_ahash *ahash; struct xdr_netobj our_hmac_obj; u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; - int nblocks, blocksize, cbcbytes; - struct decryptor_desc desc; - unsigned int usage; + struct xdr_buf subbuf; + u32 ret = 0; if (kctx->initiate) { cipher = kctx->acceptor_enc; aux_cipher = kctx->acceptor_enc_aux; - cksum_key = kctx->acceptor_integ; - usage = KG_USAGE_ACCEPTOR_SEAL; + ahash = kctx->acceptor_integ; } else { cipher = kctx->initiator_enc; aux_cipher = kctx->initiator_enc_aux; - cksum_key = kctx->initiator_integ; - usage = KG_USAGE_INITIATOR_SEAL; + ahash = kctx->initiator_integ; } - blocksize = crypto_sync_skcipher_blocksize(cipher); - /* create a segment skipping the header and leaving out the checksum */ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, (len - offset - GSS_KRB5_TOK_HDR_LEN - kctx->gk5e->cksumlength)); - nblocks = (subbuf.len + blocksize - 1) / blocksize; + ret = krb5_cbc_cts_decrypt(cipher, aux_cipher, 0, &subbuf); + if (ret) + goto out_err; - cbcbytes = 0; - if (nblocks > 2) - cbcbytes = (nblocks - 2) * blocksize; + our_hmac_obj.len = kctx->gk5e->cksumlength; + our_hmac_obj.data = our_hmac; + ret = gss_krb5_checksum(ahash, NULL, 0, &subbuf, 0, 
&our_hmac_obj); + if (ret) + goto out_err; - memset(desc.iv, 0, sizeof(desc.iv)); + /* Get the packet's hmac value */ + ret = read_bytes_from_xdr_buf(buf, len - kctx->gk5e->cksumlength, + pkt_hmac, kctx->gk5e->cksumlength); + if (ret) + goto out_err; - if (cbcbytes) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + if (crypto_memneq(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { + ret = GSS_S_BAD_SIG; + goto out_err; + } + *headskip = crypto_sync_skcipher_blocksize(cipher); + *tailskip = kctx->gk5e->cksumlength; +out_err: + if (ret && ret != GSS_S_BAD_SIG) + ret = GSS_S_FAILURE; + return ret; +} - desc.fragno = 0; - desc.fraglen = 0; - desc.req = req; +/** + * krb5_etm_checksum - Compute a MAC for a GSS Wrap token + * @cipher: an initialized cipher transform + * @tfm: an initialized hash transform + * @body: xdr_buf containing an RPC message (body.len is the message length) + * @body_offset: byte offset into @body to start checksumming + * @cksumout: OUT: a buffer to be filled in with the computed HMAC + * + * Usually expressed as H = HMAC(K, IV | ciphertext)[1..h] . + * + * Caller provides the truncation length of the output token (h) in + * cksumout.len. + * + * Return values: + * %GSS_S_COMPLETE: Digest computed, @cksumout filled in + * %GSS_S_FAILURE: Call failed + */ +VISIBLE_IF_KUNIT +u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, + struct crypto_ahash *tfm, const struct xdr_buf *body, + int body_offset, struct xdr_netobj *cksumout) +{ + unsigned int ivsize = crypto_sync_skcipher_ivsize(cipher); + struct ahash_request *req; + struct scatterlist sg[1]; + u8 *iv, *checksumdata; + int err = -ENOMEM; - skcipher_request_set_sync_tfm(req, aux_cipher); - skcipher_request_set_callback(req, 0, NULL, NULL); + checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL); + if (!checksumdata) + return GSS_S_FAILURE; + /* For RPCSEC, the "initial cipher state" is always all zeroes. 
*/ + iv = kzalloc(ivsize, GFP_KERNEL); + if (!iv) + goto out_free_mem; - sg_init_table(desc.frags, 4); + req = ahash_request_alloc(tfm, GFP_KERNEL); + if (!req) + goto out_free_mem; + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + err = crypto_ahash_init(req); + if (err) + goto out_free_ahash; - ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); - skcipher_request_zero(req); - if (ret) - goto out_err; + sg_init_one(sg, iv, ivsize); + ahash_request_set_crypt(req, sg, NULL, ivsize); + err = crypto_ahash_update(req); + if (err) + goto out_free_ahash; + err = xdr_process_buf(body, body_offset, body->len - body_offset, + checksummer, req); + if (err) + goto out_free_ahash; + + ahash_request_set_crypt(req, NULL, checksumdata, 0); + err = crypto_ahash_final(req); + if (err) + goto out_free_ahash; + memcpy(cksumout->data, checksumdata, cksumout->len); + +out_free_ahash: + ahash_request_free(req); +out_free_mem: + kfree(iv); + kfree_sensitive(checksumdata); + return err ? GSS_S_FAILURE : GSS_S_COMPLETE; +} +EXPORT_SYMBOL_IF_KUNIT(krb5_etm_checksum); + +/** + * krb5_etm_encrypt - Encrypt using the RFC 8009 rules + * @kctx: Kerberos context + * @offset: starting offset of the payload, in bytes + * @buf: OUT: send buffer to contain the encrypted payload + * @pages: plaintext payload + * + * The main difference with aes_encrypt is that "The HMAC is + * calculated over the cipher state concatenated with the AES + * output, instead of being calculated over the confounder and + * plaintext. This allows the message receiver to verify the + * integrity of the message before decrypting the message." + * + * RFC 8009 Section 5: + * + * encryption function: as follows, where E() is AES encryption in + * CBC-CS3 mode, and h is the size of truncated HMAC (128 bits or + * 192 bits as described above). 
+ * + * N = random value of length 128 bits (the AES block size) + * IV = cipher state + * C = E(Ke, N | plaintext, IV) + * H = HMAC(Ki, IV | C) + * ciphertext = C | H[1..h] + * + * This encryption formula provides AEAD EtM with key separation. + * + * Return values: + * %GSS_S_COMPLETE: Encryption successful + * %GSS_S_FAILURE: Encryption failed + */ +u32 +krb5_etm_encrypt(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, struct page **pages) +{ + struct crypto_sync_skcipher *cipher, *aux_cipher; + struct crypto_ahash *ahash; + struct xdr_netobj hmac; + unsigned int conflen; + u8 *ecptr; + u32 err; + + if (kctx->initiate) { + cipher = kctx->initiator_enc; + aux_cipher = kctx->initiator_enc_aux; + ahash = kctx->initiator_integ; + } else { + cipher = kctx->acceptor_enc; + aux_cipher = kctx->acceptor_enc_aux; + ahash = kctx->acceptor_integ; } + conflen = crypto_sync_skcipher_blocksize(cipher); - /* Make sure IV carries forward from any CBC results. */ - ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0); - if (ret) + offset += GSS_KRB5_TOK_HDR_LEN; + if (xdr_extend_head(buf, offset, conflen)) + return GSS_S_FAILURE; + krb5_make_confounder(buf->head[0].iov_base + offset, conflen); + offset -= GSS_KRB5_TOK_HDR_LEN; + + if (buf->tail[0].iov_base) { + ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len; + } else { + buf->tail[0].iov_base = buf->head[0].iov_base + + buf->head[0].iov_len; + buf->tail[0].iov_len = 0; + ecptr = buf->tail[0].iov_base; + } + + memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN); + buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; + buf->len += GSS_KRB5_TOK_HDR_LEN; + + err = krb5_cbc_cts_encrypt(cipher, aux_cipher, + offset + GSS_KRB5_TOK_HDR_LEN, + buf, pages, NULL, 0); + if (err) + return GSS_S_FAILURE; + + hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; + hmac.len = kctx->gk5e->cksumlength; + err = krb5_etm_checksum(cipher, ahash, + buf, offset + GSS_KRB5_TOK_HDR_LEN, &hmac); + if (err) goto 
out_err; + buf->tail[0].iov_len += kctx->gk5e->cksumlength; + buf->len += kctx->gk5e->cksumlength; + return GSS_S_COMPLETE; - /* Calculate our hmac over the plaintext data */ - our_hmac_obj.len = sizeof(our_hmac); - our_hmac_obj.data = our_hmac; +out_err: + return GSS_S_FAILURE; +} + +/** + * krb5_etm_decrypt - Decrypt using the RFC 8009 rules + * @kctx: Kerberos context + * @offset: starting offset of the ciphertext, in bytes + * @len: size of ciphertext to unwrap + * @buf: ciphertext to unwrap + * @headskip: OUT: the enctype's confounder length, in octets + * @tailskip: OUT: the enctype's HMAC length, in octets + * + * RFC 8009 Section 5: + * + * decryption function: as follows, where D() is AES decryption in + * CBC-CS3 mode, and h is the size of truncated HMAC. + * + * (C, H) = ciphertext + * (Note: H is the last h bits of the ciphertext.) + * IV = cipher state + * if H != HMAC(Ki, IV | C)[1..h] + * stop, report error + * (N, P) = D(Ke, C, IV) + * + * Return values: + * %GSS_S_COMPLETE: Decryption successful + * %GSS_S_BAD_SIG: computed HMAC != received HMAC + * %GSS_S_FAILURE: Decryption failed + */ +u32 +krb5_etm_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, + struct xdr_buf *buf, u32 *headskip, u32 *tailskip) +{ + struct crypto_sync_skcipher *cipher, *aux_cipher; + u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; + u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj our_hmac_obj; + struct crypto_ahash *ahash; + struct xdr_buf subbuf; + u32 ret = 0; - ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0, - cksum_key, usage, &our_hmac_obj); + if (kctx->initiate) { + cipher = kctx->acceptor_enc; + aux_cipher = kctx->acceptor_enc_aux; + ahash = kctx->acceptor_integ; + } else { + cipher = kctx->initiator_enc; + aux_cipher = kctx->initiator_enc_aux; + ahash = kctx->initiator_integ; + } + + /* Extract the ciphertext into @subbuf. 
+ */ + xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, + (len - offset - GSS_KRB5_TOK_HDR_LEN - + kctx->gk5e->cksumlength)); + + our_hmac_obj.data = our_hmac; + our_hmac_obj.len = kctx->gk5e->cksumlength; + ret = krb5_etm_checksum(cipher, ahash, &subbuf, 0, &our_hmac_obj); if (ret) goto out_err; - - /* Get the packet's hmac value */ ret = read_bytes_from_xdr_buf(buf, len - kctx->gk5e->cksumlength, pkt_hmac, kctx->gk5e->cksumlength); if (ret) goto out_err; - if (crypto_memneq(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { ret = GSS_S_BAD_SIG; goto out_err; } - *headskip = kctx->gk5e->conflen; + + ret = krb5_cbc_cts_decrypt(cipher, aux_cipher, 0, &subbuf); + if (ret) { + ret = GSS_S_FAILURE; + goto out_err; + } + + *headskip = crypto_sync_skcipher_blocksize(cipher); *tailskip = kctx->gk5e->cksumlength; + return GSS_S_COMPLETE; + out_err: - if (ret && ret != GSS_S_BAD_SIG) + if (ret != GSS_S_BAD_SIG) ret = GSS_S_FAILURE; return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h new file mode 100644 index 000000000000..8769e9e705bf --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * SunRPC GSS Kerberos 5 mechanism internal definitions + * + * Copyright (c) 2022 Oracle and/or its affiliates. + */ + +#ifndef _NET_SUNRPC_AUTH_GSS_KRB5_INTERNAL_H +#define _NET_SUNRPC_AUTH_GSS_KRB5_INTERNAL_H + +/* + * The RFCs often specify payload lengths in bits. This helper + * converts a specified bit-length to the number of octets/bytes. 
+ */ +#define BITS2OCTETS(x) ((x) / 8) + +struct krb5_ctx; + +struct gss_krb5_enctype { + const u32 etype; /* encryption (key) type */ + const u32 ctype; /* checksum type */ + const char *name; /* "friendly" name */ + const char *encrypt_name; /* crypto encrypt name */ + const char *aux_cipher; /* aux encrypt cipher name */ + const char *cksum_name; /* crypto checksum name */ + const u16 signalg; /* signing algorithm */ + const u16 sealalg; /* sealing algorithm */ + const u32 cksumlength; /* checksum length */ + const u32 keyed_cksum; /* is it a keyed cksum? */ + const u32 keybytes; /* raw key len, in bytes */ + const u32 keylength; /* protocol key length, in octets */ + const u32 Kc_length; /* checksum subkey length, in octets */ + const u32 Ke_length; /* encryption subkey length, in octets */ + const u32 Ki_length; /* integrity subkey length, in octets */ + + int (*derive_key)(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *in, + struct xdr_netobj *out, + const struct xdr_netobj *label, + gfp_t gfp_mask); + u32 (*encrypt)(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, struct page **pages); + u32 (*decrypt)(struct krb5_ctx *kctx, u32 offset, u32 len, + struct xdr_buf *buf, u32 *headskip, u32 *tailskip); + u32 (*get_mic)(struct krb5_ctx *kctx, struct xdr_buf *text, + struct xdr_netobj *token); + u32 (*verify_mic)(struct krb5_ctx *kctx, struct xdr_buf *message_buffer, + struct xdr_netobj *read_token); + u32 (*wrap)(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages); + u32 (*unwrap)(struct krb5_ctx *kctx, int offset, int len, + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align); +}; + +/* krb5_ctx flags definitions */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ + u32 enctype; + u32 flags; + const struct gss_krb5_enctype *gk5e; /* enctype-specific info */ + struct 
crypto_sync_skcipher *enc; + struct crypto_sync_skcipher *seq; + struct crypto_sync_skcipher *acceptor_enc; + struct crypto_sync_skcipher *initiator_enc; + struct crypto_sync_skcipher *acceptor_enc_aux; + struct crypto_sync_skcipher *initiator_enc_aux; + struct crypto_ahash *acceptor_sign; + struct crypto_ahash *initiator_sign; + struct crypto_ahash *initiator_integ; + struct crypto_ahash *acceptor_integ; + u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ + u8 cksum[GSS_KRB5_MAX_KEYLEN]; + atomic_t seq_send; + atomic64_t seq_send64; + time64_t endtime; + struct xdr_netobj mech_used; +}; + +/* + * GSS Kerberos 5 mechanism Per-Message calls. + */ + +u32 gss_krb5_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token); + +u32 gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, + struct xdr_netobj *read_token); + +u32 gss_krb5_wrap_v2(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages); + +u32 gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align); + +/* + * Implementation internal functions + */ + +/* Key Derivation Functions */ + +int krb5_derive_key_v2(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *label, + gfp_t gfp_mask); + +int krb5_kdf_hmac_sha2(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *in_constant, + gfp_t gfp_mask); + +int krb5_kdf_feedback_cmac(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *in_constant, + gfp_t gfp_mask); + +/** + * krb5_derive_key - Derive a subkey from a protocol key + * @kctx: Kerberos 5 context + * @inkey: base protocol key + * @outkey: OUT: derived key + * @usage: key usage value + * @seed: key usage seed (one octet) + * @gfp_mask: memory 
allocation control flags + * + * Caller sets @outkey->len to the desired length of the derived key. + * + * On success, returns 0 and fills in @outkey. A negative errno value + * is returned on failure. + */ +static inline int krb5_derive_key(struct krb5_ctx *kctx, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + u32 usage, u8 seed, gfp_t gfp_mask) +{ + const struct gss_krb5_enctype *gk5e = kctx->gk5e; + u8 label_data[GSS_KRB5_K5CLENGTH]; + struct xdr_netobj label = { + .len = sizeof(label_data), + .data = label_data, + }; + __be32 *p = (__be32 *)label_data; + + *p = cpu_to_be32(usage); + label_data[4] = seed; + return gk5e->derive_key(gk5e, inkey, outkey, &label, gfp_mask); +} + +void krb5_make_confounder(u8 *p, int conflen); + +u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, + const struct xdr_buf *body, int body_offset, + struct xdr_netobj *cksumout); + +u32 krb5_encrypt(struct crypto_sync_skcipher *key, void *iv, void *in, + void *out, int length); + +int xdr_extend_head(struct xdr_buf *buf, unsigned int base, + unsigned int shiftlen); + +u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, struct page **pages); + +u32 gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, + struct xdr_buf *buf, u32 *plainoffset, u32 *plainlen); + +u32 krb5_etm_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, + struct page **pages); + +u32 krb5_etm_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, + struct xdr_buf *buf, u32 *headskip, u32 *tailskip); + +#if IS_ENABLED(CONFIG_KUNIT) +void krb5_nfold(u32 inbits, const u8 *in, u32 outbits, u8 *out); +const struct gss_krb5_enctype *gss_krb5_lookup_enctype(u32 etype); +int krb5_cbc_cts_encrypt(struct crypto_sync_skcipher *cts_tfm, + struct crypto_sync_skcipher *cbc_tfm, u32 offset, + struct xdr_buf *buf, struct page **pages, + u8 *iv, unsigned int ivsize); +int krb5_cbc_cts_decrypt(struct crypto_sync_skcipher *cts_tfm, + struct 
crypto_sync_skcipher *cbc_tfm, + u32 offset, struct xdr_buf *buf); +u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, + struct crypto_ahash *tfm, const struct xdr_buf *body, + int body_offset, struct xdr_netobj *cksumout); +#endif + +#endif /* _NET_SUNRPC_AUTH_GSS_KRB5_INTERNAL_H */ diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 726c076950c0..4eb19c3a54c7 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -60,18 +60,27 @@ #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> #include <linux/lcm.h> +#include <crypto/hash.h> +#include <kunit/visibility.h> + +#include "gss_krb5_internal.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -/* +/** + * krb5_nfold - n-fold function + * @inbits: number of bits in @in + * @in: buffer containing input to fold + * @outbits: number of bits in the output buffer + * @out: buffer to hold the result + * * This is the n-fold function as described in rfc3961, sec 5.1 * Taken from MIT Kerberos and modified. */ - -static void krb5_nfold(u32 inbits, const u8 *in, - u32 outbits, u8 *out) +VISIBLE_IF_KUNIT +void krb5_nfold(u32 inbits, const u8 *in, u32 outbits, u8 *out) { unsigned long ulcm; int byte, i, msbit; @@ -132,40 +141,36 @@ static void krb5_nfold(u32 inbits, const u8 *in, } } } +EXPORT_SYMBOL_IF_KUNIT(krb5_nfold); /* * This is the DK (derive_key) function as described in rfc3961, sec 5.1 * Taken from MIT Kerberos and modified. 
*/ - -u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, - const struct xdr_netobj *inkey, - struct xdr_netobj *outkey, - const struct xdr_netobj *in_constant, - gfp_t gfp_mask) +static int krb5_DK(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, u8 *rawkey, + const struct xdr_netobj *in_constant, gfp_t gfp_mask) { size_t blocksize, keybytes, keylength, n; - unsigned char *inblockdata, *outblockdata, *rawkey; + unsigned char *inblockdata, *outblockdata; struct xdr_netobj inblock, outblock; struct crypto_sync_skcipher *cipher; - u32 ret = EINVAL; + int ret = -EINVAL; - blocksize = gk5e->blocksize; keybytes = gk5e->keybytes; keylength = gk5e->keylength; - if ((inkey->len != keylength) || (outkey->len != keylength)) + if (inkey->len != keylength) goto err_return; cipher = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) goto err_return; + blocksize = crypto_sync_skcipher_blocksize(cipher); if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len)) - goto err_return; - - /* allocate and set up buffers */ + goto err_free_cipher; - ret = ENOMEM; + ret = -ENOMEM; inblockdata = kmalloc(blocksize, gfp_mask); if (inblockdata == NULL) goto err_free_cipher; @@ -174,10 +179,6 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, if (outblockdata == NULL) goto err_free_in; - rawkey = kmalloc(keybytes, gfp_mask); - if (rawkey == NULL) - goto err_free_out; - inblock.data = (char *) inblockdata; inblock.len = blocksize; @@ -197,8 +198,8 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, n = 0; while (n < keybytes) { - (*(gk5e->encrypt))(cipher, NULL, inblock.data, - outblock.data, inblock.len); + krb5_encrypt(cipher, NULL, inblock.data, outblock.data, + inblock.len); if ((keybytes - n) <= outblock.len) { memcpy(rawkey + n, outblock.data, (keybytes - n)); @@ -210,26 +211,8 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, n += outblock.len; } - /* postprocess the key */ - - inblock.data = (char *) 
rawkey; - inblock.len = keybytes; - - BUG_ON(gk5e->mk_key == NULL); - ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey); - if (ret) { - dprintk("%s: got %d from mk_key function for '%s'\n", - __func__, ret, gk5e->encrypt_name); - goto err_free_raw; - } - - /* clean memory, free resources and exit */ - ret = 0; -err_free_raw: - kfree_sensitive(rawkey); -err_free_out: kfree_sensitive(outblockdata); err_free_in: kfree_sensitive(inblockdata); @@ -239,84 +222,325 @@ err_return: return ret; } -#define smask(step) ((1<<step)-1) -#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step))) -#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) - -static void mit_des_fixup_key_parity(u8 key[8]) -{ - int i; - for (i = 0; i < 8; i++) { - key[i] &= 0xfe; - key[i] |= 1^parity_char(key[i]); - } -} - /* - * This is the des3 key derivation postprocess function + * This is the identity function, with some sanity checking. */ -u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, - struct xdr_netobj *randombits, - struct xdr_netobj *key) +static int krb5_random_to_key_v2(const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *randombits, + struct xdr_netobj *key) { - int i; - u32 ret = EINVAL; + int ret = -EINVAL; - if (key->len != 24) { + if (key->len != 16 && key->len != 32) { dprintk("%s: key->len is %d\n", __func__, key->len); goto err_out; } - if (randombits->len != 21) { + if (randombits->len != 16 && randombits->len != 32) { dprintk("%s: randombits->len is %d\n", __func__, randombits->len); goto err_out; } + if (randombits->len != key->len) { + dprintk("%s: randombits->len is %d, key->len is %d\n", + __func__, randombits->len, key->len); + goto err_out; + } + memcpy(key->data, randombits->data, key->len); + ret = 0; +err_out: + return ret; +} + +/** + * krb5_derive_key_v2 - Derive a subkey for an RFC 3962 enctype + * @gk5e: Kerberos 5 enctype profile + * @inkey: base protocol key + * @outkey: OUT: derived key + * @label: subkey usage label + * @gfp_mask: 
memory allocation control flags + * + * Caller sets @outkey->len to the desired length of the derived key. + * + * On success, returns 0 and fills in @outkey. A negative errno value + * is returned on failure. + */ +int krb5_derive_key_v2(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *label, + gfp_t gfp_mask) +{ + struct xdr_netobj inblock; + int ret; + + inblock.len = gk5e->keybytes; + inblock.data = kmalloc(inblock.len, gfp_mask); + if (!inblock.data) + return -ENOMEM; - /* take the seven bytes, move them around into the top 7 bits of the - 8 key bytes, then compute the parity bits. Do this three times. */ + ret = krb5_DK(gk5e, inkey, inblock.data, label, gfp_mask); + if (!ret) + ret = krb5_random_to_key_v2(gk5e, &inblock, outkey); + + kfree_sensitive(inblock.data); + return ret; +} - for (i = 0; i < 3; i++) { - memcpy(key->data + i*8, randombits->data + i*7, 7); - key->data[i*8+7] = (((key->data[i*8]&1)<<1) | - ((key->data[i*8+1]&1)<<2) | - ((key->data[i*8+2]&1)<<3) | - ((key->data[i*8+3]&1)<<4) | - ((key->data[i*8+4]&1)<<5) | - ((key->data[i*8+5]&1)<<6) | - ((key->data[i*8+6]&1)<<7)); +/* + * K(i) = CMAC(key, K(i-1) | i | constant | 0x00 | k) + * + * i: A block counter is used with a length of 4 bytes, represented + * in big-endian order. + * + * constant: The label input to the KDF is the usage constant supplied + * to the key derivation function + * + * k: The length of the output key in bits, represented as a 4-byte + * string in big-endian order. + * + * Caller fills in K(i-1) in @step, and receives the result K(i) + * in the same buffer. 
+ */ +static int +krb5_cmac_Ki(struct crypto_shash *tfm, const struct xdr_netobj *constant, + u32 outlen, u32 count, struct xdr_netobj *step) +{ + __be32 k = cpu_to_be32(outlen * 8); + SHASH_DESC_ON_STACK(desc, tfm); + __be32 i = cpu_to_be32(count); + u8 zero = 0; + int ret; + + desc->tfm = tfm; + ret = crypto_shash_init(desc); + if (ret) + goto out_err; + + ret = crypto_shash_update(desc, step->data, step->len); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, (u8 *)&i, sizeof(i)); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, constant->data, constant->len); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, &zero, sizeof(zero)); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, (u8 *)&k, sizeof(k)); + if (ret) + goto out_err; + ret = crypto_shash_final(desc, step->data); + if (ret) + goto out_err; + +out_err: + shash_desc_zero(desc); + return ret; +} - mit_des_fixup_key_parity(key->data + i*8); +/** + * krb5_kdf_feedback_cmac - Derive a subkey for a Camellia/CMAC-based enctype + * @gk5e: Kerberos 5 enctype parameters + * @inkey: base protocol key + * @outkey: OUT: derived key + * @constant: subkey usage label + * @gfp_mask: memory allocation control flags + * + * RFC 6803 Section 3: + * + * "We use a key derivation function from the family specified in + * [SP800-108], Section 5.2, 'KDF in Feedback Mode'." + * + * n = ceiling(k / 128) + * K(0) = zeros + * K(i) = CMAC(key, K(i-1) | i | constant | 0x00 | k) + * DR(key, constant) = k-truncate(K(1) | K(2) | ... | K(n)) + * KDF-FEEDBACK-CMAC(key, constant) = random-to-key(DR(key, constant)) + * + * Caller sets @outkey->len to the desired length of the derived key (k). + * + * On success, returns 0 and fills in @outkey. A negative errno value + * is returned on failure. 
+ */ +int +krb5_kdf_feedback_cmac(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *constant, + gfp_t gfp_mask) +{ + struct xdr_netobj step = { .data = NULL }; + struct xdr_netobj DR = { .data = NULL }; + unsigned int blocksize, offset; + struct crypto_shash *tfm; + int n, count, ret; + + /* + * This implementation assumes the CMAC used for an enctype's + * key derivation is the same as the CMAC used for its + * checksumming. This happens to be true for enctypes that + * are currently supported by this implementation. + */ + tfm = crypto_alloc_shash(gk5e->cksum_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + ret = crypto_shash_setkey(tfm, inkey->data, inkey->len); + if (ret) + goto out_free_tfm; + + blocksize = crypto_shash_digestsize(tfm); + n = (outkey->len + blocksize - 1) / blocksize; + + /* K(0) is all zeroes */ + ret = -ENOMEM; + step.len = blocksize; + step.data = kzalloc(step.len, gfp_mask); + if (!step.data) + goto out_free_tfm; + + DR.len = blocksize * n; + DR.data = kmalloc(DR.len, gfp_mask); + if (!DR.data) + goto out_free_tfm; + + /* XXX: Does not handle partial-block key sizes */ + for (offset = 0, count = 1; count <= n; count++) { + ret = krb5_cmac_Ki(tfm, constant, outkey->len, count, &step); + if (ret) + goto out_free_tfm; + + memcpy(DR.data + offset, step.data, blocksize); + offset += blocksize; } + + /* k-truncate and random-to-key */ + memcpy(outkey->data, DR.data, outkey->len); ret = 0; -err_out: + +out_free_tfm: + crypto_free_shash(tfm); +out: + kfree_sensitive(step.data); + kfree_sensitive(DR.data); return ret; } /* - * This is the aes key derivation postprocess function + * K1 = HMAC-SHA(key, 0x00000001 | label | 0x00 | k) + * + * key: The source of entropy from which subsequent keys are derived. + * + * label: An octet string describing the intended usage of the + * derived key. 
+ * + * k: Length in bits of the key to be outputted, expressed in + * big-endian binary representation in 4 bytes. */ -u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, - struct xdr_netobj *randombits, - struct xdr_netobj *key) +static int +krb5_hmac_K1(struct crypto_shash *tfm, const struct xdr_netobj *label, + u32 outlen, struct xdr_netobj *K1) { - u32 ret = EINVAL; + __be32 k = cpu_to_be32(outlen * 8); + SHASH_DESC_ON_STACK(desc, tfm); + __be32 one = cpu_to_be32(1); + u8 zero = 0; + int ret; + + desc->tfm = tfm; + ret = crypto_shash_init(desc); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, (u8 *)&one, sizeof(one)); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, label->data, label->len); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, &zero, sizeof(zero)); + if (ret) + goto out_err; + ret = crypto_shash_update(desc, (u8 *)&k, sizeof(k)); + if (ret) + goto out_err; + ret = crypto_shash_final(desc, K1->data); + if (ret) + goto out_err; + +out_err: + shash_desc_zero(desc); + return ret; +} - if (key->len != 16 && key->len != 32) { - dprintk("%s: key->len is %d\n", __func__, key->len); - goto err_out; - } - if (randombits->len != 16 && randombits->len != 32) { - dprintk("%s: randombits->len is %d\n", - __func__, randombits->len); - goto err_out; +/** + * krb5_kdf_hmac_sha2 - Derive a subkey for an AES/SHA2-based enctype + * @gk5e: Kerberos 5 enctype policy parameters + * @inkey: base protocol key + * @outkey: OUT: derived key + * @label: subkey usage label + * @gfp_mask: memory allocation control flags + * + * RFC 8009 Section 3: + * + * "We use a key derivation function from Section 5.1 of [SP800-108], + * which uses the HMAC algorithm as the PRF." + * + * function KDF-HMAC-SHA2(key, label, [context,] k): + * k-truncate(K1) + * + * Caller sets @outkey->len to the desired length of the derived key. + * + * On success, returns 0 and fills in @outkey. A negative errno value + * is returned on failure. 
+ */ +int +krb5_kdf_hmac_sha2(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *label, + gfp_t gfp_mask) +{ + struct crypto_shash *tfm; + struct xdr_netobj K1 = { + .data = NULL, + }; + int ret; + + /* + * This implementation assumes the HMAC used for an enctype's + * key derivation is the same as the HMAC used for its + * checksumming. This happens to be true for enctypes that + * are currently supported by this implementation. + */ + tfm = crypto_alloc_shash(gk5e->cksum_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; } - if (randombits->len != key->len) { - dprintk("%s: randombits->len is %d, key->len is %d\n", - __func__, randombits->len, key->len); - goto err_out; + ret = crypto_shash_setkey(tfm, inkey->data, inkey->len); + if (ret) + goto out_free_tfm; + + K1.len = crypto_shash_digestsize(tfm); + K1.data = kmalloc(K1.len, gfp_mask); + if (!K1.data) { + ret = -ENOMEM; + goto out_free_tfm; } - memcpy(key->data, randombits->data, key->len); - ret = 0; -err_out: + + ret = krb5_hmac_K1(tfm, label, outkey->len, &K1); + if (ret) + goto out_free_tfm; + + /* k-truncate and random-to-key */ + memcpy(outkey->data, K1.data, outkey->len); + +out_free_tfm: + kfree_sensitive(K1.data); + crypto_free_shash(tfm); +out: return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 1c092b05c2bb..3366505bc669 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -19,449 +19,376 @@ #include <linux/sunrpc/auth.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/gss_krb5_enctypes.h> +#include <kunit/visibility.h> #include "auth_gss_internal.h" +#include "gss_krb5_internal.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static struct gss_api_mech gss_kerberos_mech; /* forward declaration */ +static struct gss_api_mech 
gss_kerberos_mech; static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { -#ifndef CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1) /* - * DES (All DES enctypes are mapped to the same gss functionality) - */ - { - .etype = ENCTYPE_DES_CBC_RAW, - .ctype = CKSUMTYPE_RSA_MD5, - .name = "des-cbc-crc", - .encrypt_name = "cbc(des)", - .cksum_name = "md5", - .encrypt = krb5_encrypt, - .decrypt = krb5_decrypt, - .mk_key = NULL, - .signalg = SGN_ALG_DES_MAC_MD5, - .sealalg = SEAL_ALG_DES, - .keybytes = 7, - .keylength = 8, - .blocksize = 8, - .conflen = 8, - .cksumlength = 8, - .keyed_cksum = 0, - }, -#endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */ - /* - * 3DES - */ - { - .etype = ENCTYPE_DES3_CBC_RAW, - .ctype = CKSUMTYPE_HMAC_SHA1_DES3, - .name = "des3-hmac-sha1", - .encrypt_name = "cbc(des3_ede)", - .cksum_name = "hmac(sha1)", - .encrypt = krb5_encrypt, - .decrypt = krb5_decrypt, - .mk_key = gss_krb5_des3_make_key, - .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, - .sealalg = SEAL_ALG_DES3KD, - .keybytes = 21, - .keylength = 24, - .blocksize = 8, - .conflen = 8, - .cksumlength = 20, - .keyed_cksum = 1, - }, - /* - * AES128 + * AES-128 with SHA-1 (RFC 3962) */ { .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128, .name = "aes128-cts", .encrypt_name = "cts(cbc(aes))", + .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .encrypt = krb5_encrypt, - .decrypt = krb5_decrypt, - .mk_key = gss_krb5_aes_make_key, - .encrypt_v2 = gss_krb5_aes_encrypt, - .decrypt_v2 = gss_krb5_aes_decrypt, + .derive_key = krb5_derive_key_v2, + .encrypt = gss_krb5_aes_encrypt, + .decrypt = gss_krb5_aes_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + .signalg = -1, .sealalg = -1, .keybytes = 16, - .keylength = 16, - .blocksize = 16, - .conflen = 16, - .cksumlength = 12, + .keylength = BITS2OCTETS(128), + 
.Kc_length = BITS2OCTETS(128), + .Ke_length = BITS2OCTETS(128), + .Ki_length = BITS2OCTETS(128), + .cksumlength = BITS2OCTETS(96), .keyed_cksum = 1, }, /* - * AES256 + * AES-256 with SHA-1 (RFC 3962) */ { .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256, .name = "aes256-cts", .encrypt_name = "cts(cbc(aes))", + .aux_cipher = "cbc(aes)", .cksum_name = "hmac(sha1)", - .encrypt = krb5_encrypt, - .decrypt = krb5_decrypt, - .mk_key = gss_krb5_aes_make_key, - .encrypt_v2 = gss_krb5_aes_encrypt, - .decrypt_v2 = gss_krb5_aes_decrypt, + .derive_key = krb5_derive_key_v2, + .encrypt = gss_krb5_aes_encrypt, + .decrypt = gss_krb5_aes_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + .signalg = -1, .sealalg = -1, .keybytes = 32, - .keylength = 32, - .blocksize = 16, - .conflen = 16, - .cksumlength = 12, + .keylength = BITS2OCTETS(256), + .Kc_length = BITS2OCTETS(256), + .Ke_length = BITS2OCTETS(256), + .Ki_length = BITS2OCTETS(256), + .cksumlength = BITS2OCTETS(96), .keyed_cksum = 1, }, -}; +#endif -static const int num_supported_enctypes = - ARRAY_SIZE(supported_gss_krb5_enctypes); +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA) + /* + * Camellia-128 with CMAC (RFC 6803) + */ + { + .etype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .ctype = CKSUMTYPE_CMAC_CAMELLIA128, + .name = "camellia128-cts-cmac", + .encrypt_name = "cts(cbc(camellia))", + .aux_cipher = "cbc(camellia)", + .cksum_name = "cmac(camellia)", + .cksumlength = BITS2OCTETS(128), + .keyed_cksum = 1, + .keylength = BITS2OCTETS(128), + .Kc_length = BITS2OCTETS(128), + .Ke_length = BITS2OCTETS(128), + .Ki_length = BITS2OCTETS(128), + + .derive_key = krb5_kdf_feedback_cmac, + .encrypt = gss_krb5_aes_encrypt, + .decrypt = gss_krb5_aes_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + }, + /* + * 
Camellia-256 with CMAC (RFC 6803) + */ + { + .etype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .ctype = CKSUMTYPE_CMAC_CAMELLIA256, + .name = "camellia256-cts-cmac", + .encrypt_name = "cts(cbc(camellia))", + .aux_cipher = "cbc(camellia)", + .cksum_name = "cmac(camellia)", + .cksumlength = BITS2OCTETS(128), + .keyed_cksum = 1, + .keylength = BITS2OCTETS(256), + .Kc_length = BITS2OCTETS(256), + .Ke_length = BITS2OCTETS(256), + .Ki_length = BITS2OCTETS(256), + + .derive_key = krb5_kdf_feedback_cmac, + .encrypt = gss_krb5_aes_encrypt, + .decrypt = gss_krb5_aes_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + }, +#endif -static int -supported_gss_krb5_enctype(int etype) -{ - int i; - for (i = 0; i < num_supported_enctypes; i++) - if (supported_gss_krb5_enctypes[i].etype == etype) - return 1; - return 0; -} +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2) + /* + * AES-128 with SHA-256 (RFC 8009) + */ + { + .etype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .ctype = CKSUMTYPE_HMAC_SHA256_128_AES128, + .name = "aes128-cts-hmac-sha256-128", + .encrypt_name = "cts(cbc(aes))", + .aux_cipher = "cbc(aes)", + .cksum_name = "hmac(sha256)", + .cksumlength = BITS2OCTETS(128), + .keyed_cksum = 1, + .keylength = BITS2OCTETS(128), + .Kc_length = BITS2OCTETS(128), + .Ke_length = BITS2OCTETS(128), + .Ki_length = BITS2OCTETS(128), + + .derive_key = krb5_kdf_hmac_sha2, + .encrypt = krb5_etm_encrypt, + .decrypt = krb5_etm_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + }, + /* + * AES-256 with SHA-384 (RFC 8009) + */ + { + .etype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .ctype = CKSUMTYPE_HMAC_SHA384_192_AES256, + .name = "aes256-cts-hmac-sha384-192", + .encrypt_name = "cts(cbc(aes))", + .aux_cipher = "cbc(aes)", + .cksum_name = "hmac(sha384)", + .cksumlength = BITS2OCTETS(192), + .keyed_cksum = 1, + 
.keylength = BITS2OCTETS(256), + .Kc_length = BITS2OCTETS(192), + .Ke_length = BITS2OCTETS(256), + .Ki_length = BITS2OCTETS(192), + + .derive_key = krb5_kdf_hmac_sha2, + .encrypt = krb5_etm_encrypt, + .decrypt = krb5_etm_decrypt, + + .get_mic = gss_krb5_get_mic_v2, + .verify_mic = gss_krb5_verify_mic_v2, + .wrap = gss_krb5_wrap_v2, + .unwrap = gss_krb5_unwrap_v2, + }, +#endif +}; -static const struct gss_krb5_enctype * -get_gss_krb5_enctype(int etype) -{ - int i; - for (i = 0; i < num_supported_enctypes; i++) - if (supported_gss_krb5_enctypes[i].etype == etype) - return &supported_gss_krb5_enctypes[i]; - return NULL; -} +/* + * The list of advertised enctypes is specified in order of most + * preferred to least. + */ +static char gss_krb5_enctype_priority_list[64]; -static inline const void * -get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) +static void gss_krb5_prepare_enctype_priority_list(void) { - struct xdr_netobj key; - int alg; - - p = simple_get_bytes(p, end, &alg, sizeof(alg)); - if (IS_ERR(p)) - goto out_err; - - switch (alg) { - case ENCTYPE_DES_CBC_CRC: - case ENCTYPE_DES_CBC_MD4: - case ENCTYPE_DES_CBC_MD5: - /* Map all these key types to ENCTYPE_DES_CBC_RAW */ - alg = ENCTYPE_DES_CBC_RAW; - break; - } - - if (!supported_gss_krb5_enctype(alg)) { - printk(KERN_WARNING "gss_kerberos_mech: unsupported " - "encryption key algorithm %d\n", alg); - p = ERR_PTR(-EINVAL); - goto out_err; - } - p = simple_get_netobj(p, end, &key); - if (IS_ERR(p)) - goto out_err; - - *res = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); - if (IS_ERR(*res)) { - printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " - "crypto algorithm %s\n", ctx->gk5e->encrypt_name); - *res = NULL; - goto out_err_free_key; - } - if (crypto_sync_skcipher_setkey(*res, key.data, key.len)) { - printk(KERN_WARNING "gss_kerberos_mech: error setting key for " - "crypto algorithm %s\n", ctx->gk5e->encrypt_name); - goto 
out_err_free_tfm; + static const u32 gss_krb5_enctypes[] = { +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA2) + ENCTYPE_AES256_CTS_HMAC_SHA384_192, + ENCTYPE_AES128_CTS_HMAC_SHA256_128, +#endif +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_CAMELLIA) + ENCTYPE_CAMELLIA256_CTS_CMAC, + ENCTYPE_CAMELLIA128_CTS_CMAC, +#endif +#if defined(CONFIG_RPCSEC_GSS_KRB5_ENCTYPES_AES_SHA1) + ENCTYPE_AES256_CTS_HMAC_SHA1_96, + ENCTYPE_AES128_CTS_HMAC_SHA1_96, +#endif + }; + size_t total, i; + char buf[16]; + char *sep; + int n; + + sep = ""; + gss_krb5_enctype_priority_list[0] = '\0'; + for (total = 0, i = 0; i < ARRAY_SIZE(gss_krb5_enctypes); i++) { + n = sprintf(buf, "%s%u", sep, gss_krb5_enctypes[i]); + if (n < 0) + break; + if (total + n >= sizeof(gss_krb5_enctype_priority_list)) + break; + strcat(gss_krb5_enctype_priority_list, buf); + sep = ","; + total += n; } - - kfree(key.data); - return p; - -out_err_free_tfm: - crypto_free_sync_skcipher(*res); -out_err_free_key: - kfree(key.data); - p = ERR_PTR(-EINVAL); -out_err: - return p; } -static int -gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) +/** + * gss_krb5_lookup_enctype - Retrieve profile information for a given enctype + * @etype: ENCTYPE value + * + * Returns a pointer to a gss_krb5_enctype structure, or NULL if no + * matching etype is found. + */ +VISIBLE_IF_KUNIT +const struct gss_krb5_enctype *gss_krb5_lookup_enctype(u32 etype) { - u32 seq_send; - int tmp; - u32 time32; - - p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); - if (IS_ERR(p)) - goto out_err; - - /* Old format supports only DES! 
Any other enctype uses new format */ - ctx->enctype = ENCTYPE_DES_CBC_RAW; - - ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); - if (ctx->gk5e == NULL) { - p = ERR_PTR(-EINVAL); - goto out_err; - } - - /* The downcall format was designed before we completely understood - * the uses of the context fields; so it includes some stuff we - * just give some minimal sanity-checking, and some we ignore - * completely (like the next twenty bytes): */ - if (unlikely(p + 20 > end || p + 20 < p)) { - p = ERR_PTR(-EFAULT); - goto out_err; - } - p += 20; - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SGN_ALG_DES_MAC_MD5) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); - if (IS_ERR(p)) - goto out_err; - if (tmp != SEAL_ALG_DES) { - p = ERR_PTR(-ENOSYS); - goto out_err; - } - p = simple_get_bytes(p, end, &time32, sizeof(time32)); - if (IS_ERR(p)) - goto out_err; - /* unsigned 32-bit time overflows in year 2106 */ - ctx->endtime = (time64_t)time32; - p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); - if (IS_ERR(p)) - goto out_err; - atomic_set(&ctx->seq_send, seq_send); - p = simple_get_netobj(p, end, &ctx->mech_used); - if (IS_ERR(p)) - goto out_err; - p = get_key(p, end, ctx, &ctx->enc); - if (IS_ERR(p)) - goto out_err_free_mech; - p = get_key(p, end, ctx, &ctx->seq); - if (IS_ERR(p)) - goto out_err_free_key1; - if (p != end) { - p = ERR_PTR(-EFAULT); - goto out_err_free_key2; - } + size_t i; - return 0; - -out_err_free_key2: - crypto_free_sync_skcipher(ctx->seq); -out_err_free_key1: - crypto_free_sync_skcipher(ctx->enc); -out_err_free_mech: - kfree(ctx->mech_used.data); -out_err: - return PTR_ERR(p); + for (i = 0; i < ARRAY_SIZE(supported_gss_krb5_enctypes); i++) + if (supported_gss_krb5_enctypes[i].etype == etype) + return &supported_gss_krb5_enctypes[i]; + return NULL; } +EXPORT_SYMBOL_IF_KUNIT(gss_krb5_lookup_enctype); static struct crypto_sync_skcipher * 
-context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) +gss_krb5_alloc_cipher_v2(const char *cname, const struct xdr_netobj *key) { - struct crypto_sync_skcipher *cp; + struct crypto_sync_skcipher *tfm; - cp = crypto_alloc_sync_skcipher(cname, 0, 0); - if (IS_ERR(cp)) { - dprintk("gss_kerberos_mech: unable to initialize " - "crypto algorithm %s\n", cname); + tfm = crypto_alloc_sync_skcipher(cname, 0, 0); + if (IS_ERR(tfm)) return NULL; - } - if (crypto_sync_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { - dprintk("gss_kerberos_mech: error setting key for " - "crypto algorithm %s\n", cname); - crypto_free_sync_skcipher(cp); + if (crypto_sync_skcipher_setkey(tfm, key->data, key->len)) { + crypto_free_sync_skcipher(tfm); return NULL; } - return cp; -} - -static inline void -set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed) -{ - cdata[0] = (usage>>24)&0xff; - cdata[1] = (usage>>16)&0xff; - cdata[2] = (usage>>8)&0xff; - cdata[3] = usage&0xff; - cdata[4] = seed; + return tfm; } -static int -context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) +static struct crypto_ahash * +gss_krb5_alloc_hash_v2(struct krb5_ctx *kctx, const struct xdr_netobj *key) { - struct xdr_netobj c, keyin, keyout; - u8 cdata[GSS_KRB5_K5CLENGTH]; - u32 err; - - c.len = GSS_KRB5_K5CLENGTH; - c.data = cdata; - - keyin.data = ctx->Ksess; - keyin.len = ctx->gk5e->keylength; - keyout.len = ctx->gk5e->keylength; + struct crypto_ahash *tfm; - /* seq uses the raw key */ - ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, - ctx->Ksess); - if (ctx->seq == NULL) - goto out_err; - - ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, - ctx->Ksess); - if (ctx->enc == NULL) - goto out_free_seq; - - /* derive cksum */ - set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM); - keyout.data = ctx->cksum; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving cksum key\n", - __func__, 
err); - goto out_free_enc; + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return NULL; + if (crypto_ahash_setkey(tfm, key->data, key->len)) { + crypto_free_ahash(tfm); + return NULL; } - - return 0; - -out_free_enc: - crypto_free_sync_skcipher(ctx->enc); -out_free_seq: - crypto_free_sync_skcipher(ctx->seq); -out_err: - return -EINVAL; + return tfm; } static int -context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) +gss_krb5_import_ctx_v2(struct krb5_ctx *ctx, gfp_t gfp_mask) { - struct xdr_netobj c, keyin, keyout; - u8 cdata[GSS_KRB5_K5CLENGTH]; - u32 err; - - c.len = GSS_KRB5_K5CLENGTH; - c.data = cdata; - - keyin.data = ctx->Ksess; - keyin.len = ctx->gk5e->keylength; - keyout.len = ctx->gk5e->keylength; + struct xdr_netobj keyin = { + .len = ctx->gk5e->keylength, + .data = ctx->Ksess, + }; + struct xdr_netobj keyout; + int ret = -EINVAL; + + keyout.data = kmalloc(GSS_KRB5_MAX_KEYLEN, gfp_mask); + if (!keyout.data) + return -ENOMEM; /* initiator seal encryption */ - set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); - keyout.data = ctx->initiator_seal; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving initiator_seal key\n", - __func__, err); - goto out_err; - } - ctx->initiator_enc = context_v2_alloc_cipher(ctx, - ctx->gk5e->encrypt_name, - ctx->initiator_seal); + keyout.len = ctx->gk5e->Ke_length; + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_INITIATOR_SEAL, + KEY_USAGE_SEED_ENCRYPTION, gfp_mask)) + goto out; + ctx->initiator_enc = gss_krb5_alloc_cipher_v2(ctx->gk5e->encrypt_name, + &keyout); if (ctx->initiator_enc == NULL) - goto out_err; + goto out; + if (ctx->gk5e->aux_cipher) { + ctx->initiator_enc_aux = + gss_krb5_alloc_cipher_v2(ctx->gk5e->aux_cipher, + &keyout); + if (ctx->initiator_enc_aux == NULL) + goto out_free; + } /* acceptor seal encryption */ - set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, 
KEY_USAGE_SEED_ENCRYPTION); - keyout.data = ctx->acceptor_seal; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving acceptor_seal key\n", - __func__, err); - goto out_free_initiator_enc; - } - ctx->acceptor_enc = context_v2_alloc_cipher(ctx, - ctx->gk5e->encrypt_name, - ctx->acceptor_seal); + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_ACCEPTOR_SEAL, + KEY_USAGE_SEED_ENCRYPTION, gfp_mask)) + goto out_free; + ctx->acceptor_enc = gss_krb5_alloc_cipher_v2(ctx->gk5e->encrypt_name, + &keyout); if (ctx->acceptor_enc == NULL) - goto out_free_initiator_enc; + goto out_free; + if (ctx->gk5e->aux_cipher) { + ctx->acceptor_enc_aux = + gss_krb5_alloc_cipher_v2(ctx->gk5e->aux_cipher, + &keyout); + if (ctx->acceptor_enc_aux == NULL) + goto out_free; + } /* initiator sign checksum */ - set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM); - keyout.data = ctx->initiator_sign; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving initiator_sign key\n", - __func__, err); - goto out_free_acceptor_enc; - } + keyout.len = ctx->gk5e->Kc_length; + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_INITIATOR_SIGN, + KEY_USAGE_SEED_CHECKSUM, gfp_mask)) + goto out_free; + ctx->initiator_sign = gss_krb5_alloc_hash_v2(ctx, &keyout); + if (ctx->initiator_sign == NULL) + goto out_free; /* acceptor sign checksum */ - set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM); - keyout.data = ctx->acceptor_sign; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving acceptor_sign key\n", - __func__, err); - goto out_free_acceptor_enc; - } + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_ACCEPTOR_SIGN, + KEY_USAGE_SEED_CHECKSUM, gfp_mask)) + goto out_free; + ctx->acceptor_sign = gss_krb5_alloc_hash_v2(ctx, &keyout); + if (ctx->acceptor_sign == NULL) + goto out_free; /* initiator seal 
integrity */ - set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY); - keyout.data = ctx->initiator_integ; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving initiator_integ key\n", - __func__, err); - goto out_free_acceptor_enc; - } + keyout.len = ctx->gk5e->Ki_length; + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_INITIATOR_SEAL, + KEY_USAGE_SEED_INTEGRITY, gfp_mask)) + goto out_free; + ctx->initiator_integ = gss_krb5_alloc_hash_v2(ctx, &keyout); + if (ctx->initiator_integ == NULL) + goto out_free; /* acceptor seal integrity */ - set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY); - keyout.data = ctx->acceptor_integ; - err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); - if (err) { - dprintk("%s: Error %d deriving acceptor_integ key\n", - __func__, err); - goto out_free_acceptor_enc; - } - - switch (ctx->enctype) { - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - ctx->initiator_enc_aux = - context_v2_alloc_cipher(ctx, "cbc(aes)", - ctx->initiator_seal); - if (ctx->initiator_enc_aux == NULL) - goto out_free_acceptor_enc; - ctx->acceptor_enc_aux = - context_v2_alloc_cipher(ctx, "cbc(aes)", - ctx->acceptor_seal); - if (ctx->acceptor_enc_aux == NULL) { - crypto_free_sync_skcipher(ctx->initiator_enc_aux); - goto out_free_acceptor_enc; - } - } - - return 0; + if (krb5_derive_key(ctx, &keyin, &keyout, KG_USAGE_ACCEPTOR_SEAL, + KEY_USAGE_SEED_INTEGRITY, gfp_mask)) + goto out_free; + ctx->acceptor_integ = gss_krb5_alloc_hash_v2(ctx, &keyout); + if (ctx->acceptor_integ == NULL) + goto out_free; + + ret = 0; +out: + kfree_sensitive(keyout.data); + return ret; -out_free_acceptor_enc: +out_free: + crypto_free_ahash(ctx->acceptor_integ); + crypto_free_ahash(ctx->initiator_integ); + crypto_free_ahash(ctx->acceptor_sign); + crypto_free_ahash(ctx->initiator_sign); + crypto_free_sync_skcipher(ctx->acceptor_enc_aux); 
crypto_free_sync_skcipher(ctx->acceptor_enc); -out_free_initiator_enc: + crypto_free_sync_skcipher(ctx->initiator_enc_aux); crypto_free_sync_skcipher(ctx->initiator_enc); -out_err: - return -EINVAL; + goto out; } static int @@ -471,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, u64 seq_send64; int keylen; u32 time32; + int ret; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); if (IS_ERR(p)) @@ -497,10 +425,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); if (IS_ERR(p)) goto out_err; - /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ - if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) - ctx->enctype = ENCTYPE_DES3_CBC_RAW; - ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); + ctx->gk5e = gss_krb5_lookup_enctype(ctx->enctype); if (ctx->gk5e == NULL) { dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", ctx->enctype); @@ -526,25 +451,23 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, } ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; - switch (ctx->enctype) { - case ENCTYPE_DES3_CBC_RAW: - return context_derive_keys_des3(ctx, gfp_mask); - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return context_derive_keys_new(ctx, gfp_mask); - default: - return -EINVAL; + ret = gss_krb5_import_ctx_v2(ctx, gfp_mask); + if (ret) { + p = ERR_PTR(ret); + goto out_free; } + return 0; + +out_free: + kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } static int -gss_import_sec_context_kerberos(const void *p, size_t len, - struct gss_ctx *ctx_id, - time64_t *endtime, - gfp_t gfp_mask) +gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id, + time64_t *endtime, gfp_t gfp_mask) { const void *end = (const void *)((const char *)p + len); struct krb5_ctx *ctx; @@ -554,24 +477,22 @@ gss_import_sec_context_kerberos(const void *p, size_t len, 
if (ctx == NULL) return -ENOMEM; - if (len == 85) - ret = gss_import_v1_context(p, end, ctx); - else - ret = gss_import_v2_context(p, end, ctx, gfp_mask); - - if (ret == 0) { - ctx_id->internal_ctx_id = ctx; - if (endtime) - *endtime = ctx->endtime; - } else + ret = gss_import_v2_context(p, end, ctx, gfp_mask); + memzero_explicit(&ctx->Ksess, sizeof(ctx->Ksess)); + if (ret) { kfree(ctx); + return ret; + } - dprintk("RPC: %s: returning %d\n", __func__, ret); - return ret; + ctx_id->internal_ctx_id = ctx; + if (endtime) + *endtime = ctx->endtime; + return 0; } static void -gss_delete_sec_context_kerberos(void *internal_ctx) { +gss_krb5_delete_sec_context(void *internal_ctx) +{ struct krb5_ctx *kctx = internal_ctx; crypto_free_sync_skcipher(kctx->seq); @@ -580,17 +501,105 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { crypto_free_sync_skcipher(kctx->initiator_enc); crypto_free_sync_skcipher(kctx->acceptor_enc_aux); crypto_free_sync_skcipher(kctx->initiator_enc_aux); + crypto_free_ahash(kctx->acceptor_sign); + crypto_free_ahash(kctx->initiator_sign); + crypto_free_ahash(kctx->acceptor_integ); + crypto_free_ahash(kctx->initiator_integ); kfree(kctx->mech_used.data); kfree(kctx); } +/** + * gss_krb5_get_mic - get_mic for the Kerberos GSS mechanism + * @gctx: GSS context + * @text: plaintext to checksum + * @token: buffer into which to write the computed checksum + * + * Return values: + * %GSS_S_COMPLETE - success, and @token is filled in + * %GSS_S_FAILURE - checksum could not be generated + * %GSS_S_CONTEXT_EXPIRED - Kerberos context is no longer valid + */ +static u32 gss_krb5_get_mic(struct gss_ctx *gctx, struct xdr_buf *text, + struct xdr_netobj *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + return kctx->gk5e->get_mic(kctx, text, token); +} + +/** + * gss_krb5_verify_mic - verify_mic for the Kerberos GSS mechanism + * @gctx: GSS context + * @message_buffer: plaintext to check + * @read_token: received checksum to check + * + * Return 
values: + * %GSS_S_COMPLETE - computed and received checksums match + * %GSS_S_DEFECTIVE_TOKEN - received checksum is not valid + * %GSS_S_BAD_SIG - computed and received checksums do not match + * %GSS_S_FAILURE - received checksum could not be checked + * %GSS_S_CONTEXT_EXPIRED - Kerberos context is no longer valid + */ +static u32 gss_krb5_verify_mic(struct gss_ctx *gctx, + struct xdr_buf *message_buffer, + struct xdr_netobj *read_token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + return kctx->gk5e->verify_mic(kctx, message_buffer, read_token); +} + +/** + * gss_krb5_wrap - gss_wrap for the Kerberos GSS mechanism + * @gctx: initialized GSS context + * @offset: byte offset in @buf to start writing the cipher text + * @buf: OUT: send buffer + * @pages: plaintext to wrap + * + * Return values: + * %GSS_S_COMPLETE - success, @buf has been updated + * %GSS_S_FAILURE - @buf could not be wrapped + * %GSS_S_CONTEXT_EXPIRED - Kerberos context is no longer valid + */ +static u32 gss_krb5_wrap(struct gss_ctx *gctx, int offset, + struct xdr_buf *buf, struct page **pages) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + return kctx->gk5e->wrap(kctx, offset, buf, pages); +} + +/** + * gss_krb5_unwrap - gss_unwrap for the Kerberos GSS mechanism + * @gctx: initialized GSS context + * @offset: starting byte offset into @buf + * @len: size of ciphertext to unwrap + * @buf: ciphertext to unwrap + * + * Return values: + * %GSS_S_COMPLETE - success, @buf has been updated + * %GSS_S_DEFECTIVE_TOKEN - received blob is not valid + * %GSS_S_BAD_SIG - computed and received checksums do not match + * %GSS_S_FAILURE - @buf could not be unwrapped + * %GSS_S_CONTEXT_EXPIRED - Kerberos context is no longer valid + */ +static u32 gss_krb5_unwrap(struct gss_ctx *gctx, int offset, + int len, struct xdr_buf *buf) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + return kctx->gk5e->unwrap(kctx, offset, len, buf, + &gctx->slack, &gctx->align); +} + static const struct 
gss_api_ops gss_kerberos_ops = { - .gss_import_sec_context = gss_import_sec_context_kerberos, - .gss_get_mic = gss_get_mic_kerberos, - .gss_verify_mic = gss_verify_mic_kerberos, - .gss_wrap = gss_wrap_kerberos, - .gss_unwrap = gss_unwrap_kerberos, - .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_import_sec_context = gss_krb5_import_sec_context, + .gss_get_mic = gss_krb5_get_mic, + .gss_verify_mic = gss_krb5_verify_mic, + .gss_wrap = gss_krb5_wrap, + .gss_unwrap = gss_krb5_unwrap, + .gss_delete_sec_context = gss_krb5_delete_sec_context, }; static struct pf_desc gss_kerberos_pfs[] = { @@ -631,13 +640,14 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES, + .gm_upcall_enctypes = gss_krb5_enctype_priority_list, }; static int __init init_kerberos_module(void) { int status; + gss_krb5_prepare_enctype_priority_list(); status = gss_mech_register(&gss_kerberos_mech); if (status) printk("Failed to register kerberos gss mechanism!\n"); @@ -649,6 +659,7 @@ static void __exit cleanup_kerberos_module(void) gss_mech_unregister(&gss_kerberos_mech); } +MODULE_DESCRIPTION("Sun RPC Kerberos 5 module"); MODULE_LICENSE("GPL"); module_init(init_kerberos_module); module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 33061417ec97..ce540df9bce4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -65,37 +65,13 @@ #include <linux/crypto.h> #include <linux/atomic.h> +#include "gss_krb5_internal.h" + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif static void * -setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) -{ - u16 *ptr; - void *krb5_hdr; - int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; - - token->len = g_token_size(&ctx->mech_used, body_size); - - 
ptr = (u16 *)token->data; - g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); - - /* ptr now at start of header described in rfc 1964, section 1.2.1: */ - krb5_hdr = ptr; - *ptr++ = KG_TOK_MIC_MSG; - /* - * signalg is stored as if it were converted from LE to host endian, even - * though it's an opaque pair of bytes according to the RFC. - */ - *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg); - *ptr++ = SEAL_ALG_NONE; - *ptr = 0xffff; - - return krb5_hdr; -} - -static void * setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) { u16 *ptr; @@ -108,8 +84,10 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) flags |= 0x04; /* Per rfc 4121, sec 4.2.6.1, there is no header, - * just start the token */ - krb5_hdr = ptr = (u16 *)token->data; + * just start the token. + */ + krb5_hdr = (u16 *)token->data; + ptr = krb5_hdr; *ptr++ = KG2_TOK_MIC; p = (u8 *)ptr; @@ -123,57 +101,18 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } -static u32 -gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - void *ptr; - time64_t now; - u32 seq_send; - u8 *cksumkey; - - dprintk("RPC: %s\n", __func__); - BUG_ON(ctx == NULL); - - now = ktime_get_real_seconds(); - - ptr = setup_token(ctx, token); - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, text, 0, cksumkey, - KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&ctx->seq_send); - - if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) - return GSS_S_FAILURE; - - return (ctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -static u32 -gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, - struct xdr_netobj *token) +u32 +gss_krb5_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) { - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), - .data = cksumdata}; + struct crypto_ahash *tfm = ctx->initiate ? + ctx->initiator_sign : ctx->acceptor_sign; + struct xdr_netobj cksumobj = { + .len = ctx->gk5e->cksumlength, + }; + __be64 seq_send_be64; void *krb5_hdr; time64_t now; - u8 *cksumkey; - unsigned int cksum_usage; - __be64 seq_send_be64; dprintk("RPC: %s\n", __func__); @@ -184,39 +123,11 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, seq_send_be64 = cpu_to_be64(atomic64_fetch_inc(&ctx->seq_send64)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); - if (ctx->initiate) { - cksumkey = ctx->initiator_sign; - cksum_usage = KG_USAGE_INITIATOR_SIGN; - } else { - cksumkey = ctx->acceptor_sign; - cksum_usage = KG_USAGE_ACCEPTOR_SIGN; - } - - if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, - text, 0, cksumkey, cksum_usage, &cksumobj)) + cksumobj.data = krb5_hdr + GSS_KRB5_TOK_HDR_LEN; + if (gss_krb5_checksum(tfm, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, + text, 0, &cksumobj)) return GSS_S_FAILURE; - memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); - now = ktime_get_real_seconds(); - return (ctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; } - -u32 -gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, - struct xdr_netobj *token) -{ - struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; - - switch (ctx->enctype) { - default: - BUG(); - case ENCTYPE_DES_CBC_RAW: - case ENCTYPE_DES3_CBC_RAW: - return gss_get_mic_v1(ctx, text, token); - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_get_mic_v2(ctx, text, token); - } -} diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c deleted file mode 100644 index 3200b971a814..000000000000 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * linux/net/sunrpc/gss_krb5_seqnum.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. 
- * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include <crypto/skcipher.h> -#include <linux/types.h> -#include <linux/sunrpc/gss_krb5.h> - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -s32 -krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_sync_skcipher *key, - int direction, - u32 seqnum, - unsigned char *cksum, unsigned char *buf) -{ - unsigned char *plain; - s32 code; - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - plain[0] = (unsigned char) (seqnum & 0xff); - plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); - plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); - plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); - - plain[4] = direction; - plain[5] = direction; - plain[6] = direction; - plain[7] = direction; - - code = krb5_encrypt(key, cksum, plain, buf, 8); - kfree(plain); - return code; -} - -s32 -krb5_get_seq_num(struct krb5_ctx *kctx, - unsigned char *cksum, - unsigned char *buf, - int *direction, u32 *seqnum) -{ - s32 code; - unsigned char *plain; - struct crypto_sync_skcipher *key = kctx->seq; - - dprintk("RPC: krb5_get_seq_num:\n"); - - plain = kmalloc(8, GFP_KERNEL); - if (!plain) - return -ENOMEM; - - if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) - goto out; - - if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || - (plain[4] != plain[7])) { - code = (s32)KG_BAD_SEQ; - goto out; - } - - *direction = plain[4]; - - *seqnum = ((plain[0]) | - (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); - -out: - kfree(plain); - return code; -} diff 
--git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c new file mode 100644 index 000000000000..a5bff02cd7ba --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -0,0 +1,1859 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * KUnit test of SunRPC's GSS Kerberos mechanism. Subsystem + * name is "rpcsec_gss_krb5". + */ + +#include <kunit/test.h> +#include <kunit/visibility.h> + +#include <linux/kernel.h> +#include <crypto/hash.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/gss_krb5.h> + +#include "gss_krb5_internal.h" + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +struct gss_krb5_test_param { + const char *desc; + u32 enctype; + u32 nfold; + u32 constant; + const struct xdr_netobj *base_key; + const struct xdr_netobj *Ke; + const struct xdr_netobj *usage; + const struct xdr_netobj *plaintext; + const struct xdr_netobj *confounder; + const struct xdr_netobj *expected_result; + const struct xdr_netobj *expected_hmac; + const struct xdr_netobj *next_iv; +}; + +static inline void gss_krb5_get_desc(const struct gss_krb5_test_param *param, + char *desc) +{ + strscpy(desc, param->desc, KUNIT_PARAM_DESC_SIZE); +} + +static void kdf_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + const struct gss_krb5_enctype *gk5e; + struct xdr_netobj derivedkey; + int err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + derivedkey.data = kunit_kzalloc(test, param->expected_result->len, + GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, derivedkey.data); + derivedkey.len = param->expected_result->len; + + /* Act */ + err = gk5e->derive_key(gk5e, param->base_key, &derivedkey, + param->usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data, + derivedkey.data, 
derivedkey.len), 0, + "key mismatch"); +} + +static void checksum_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + struct xdr_buf buf = { + .head[0].iov_len = param->plaintext->len, + .len = param->plaintext->len, + }; + const struct gss_krb5_enctype *gk5e; + struct xdr_netobj Kc, checksum; + struct crypto_ahash *tfm; + int err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + Kc.len = gk5e->Kc_length; + Kc.data = kunit_kzalloc(test, Kc.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, Kc.data); + err = gk5e->derive_key(gk5e, param->base_key, &Kc, + param->usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + + tfm = crypto_alloc_ahash(gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tfm); + err = crypto_ahash_setkey(tfm, Kc.data, Kc.len); + KUNIT_ASSERT_EQ(test, err, 0); + + buf.head[0].iov_base = kunit_kzalloc(test, buf.head[0].iov_len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf.head[0].iov_base); + memcpy(buf.head[0].iov_base, param->plaintext->data, buf.head[0].iov_len); + + checksum.len = gk5e->cksumlength; + checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data); + + /* Act */ + err = gss_krb5_checksum(tfm, NULL, 0, &buf, 0, &checksum); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data, + checksum.data, checksum.len), 0, + "checksum mismatch"); + + crypto_free_ahash(tfm); +} + +#define DEFINE_HEX_XDR_NETOBJ(name, hex_array...) 
\ + static const u8 name ## _data[] = { hex_array }; \ + static const struct xdr_netobj name = { \ + .data = (u8 *)name##_data, \ + .len = sizeof(name##_data), \ + } + +#define DEFINE_STR_XDR_NETOBJ(name, string) \ + static const u8 name ## _str[] = string; \ + static const struct xdr_netobj name = { \ + .data = (u8 *)name##_str, \ + .len = sizeof(name##_str) - 1, \ + } + +/* + * RFC 3961 Appendix A.1. n-fold + * + * The n-fold function is defined in section 5.1 of RFC 3961. + * + * This test material is copyright (C) The Internet Society (2005). + */ + +DEFINE_HEX_XDR_NETOBJ(nfold_test1_plaintext, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test1_expected_result, + 0xbe, 0x07, 0x26, 0x31, 0x27, 0x6b, 0x19, 0x55 +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test2_plaintext, + 0x70, 0x61, 0x73, 0x73, 0x77, 0x6f, 0x72, 0x64 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test2_expected_result, + 0x78, 0xa0, 0x7b, 0x6c, 0xaf, 0x85, 0xfa +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test3_plaintext, + 0x52, 0x6f, 0x75, 0x67, 0x68, 0x20, 0x43, 0x6f, + 0x6e, 0x73, 0x65, 0x6e, 0x73, 0x75, 0x73, 0x2c, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x52, 0x75, 0x6e, + 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x43, 0x6f, 0x64, + 0x65 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test3_expected_result, + 0xbb, 0x6e, 0xd3, 0x08, 0x70, 0xb7, 0xf0, 0xe0 +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test4_plaintext, + 0x70, 0x61, 0x73, 0x73, 0x77, 0x6f, 0x72, 0x64 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test4_expected_result, + 0x59, 0xe4, 0xa8, 0xca, 0x7c, 0x03, 0x85, 0xc3, + 0xc3, 0x7b, 0x3f, 0x6d, 0x20, 0x00, 0x24, 0x7c, + 0xb6, 0xe6, 0xbd, 0x5b, 0x3e +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test5_plaintext, + 0x4d, 0x41, 0x53, 0x53, 0x41, 0x43, 0x48, 0x56, + 0x53, 0x45, 0x54, 0x54, 0x53, 0x20, 0x49, 0x4e, + 0x53, 0x54, 0x49, 0x54, 0x56, 0x54, 0x45, 0x20, + 0x4f, 0x46, 0x20, 0x54, 0x45, 0x43, 0x48, 0x4e, + 0x4f, 0x4c, 0x4f, 0x47, 0x59 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test5_expected_result, + 0xdb, 0x3b, 0x0d, 0x8f, 0x0b, 0x06, 0x1e, 0x60, + 0x32, 
0x82, 0xb3, 0x08, 0xa5, 0x08, 0x41, 0x22, + 0x9a, 0xd7, 0x98, 0xfa, 0xb9, 0x54, 0x0c, 0x1b +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test6_plaintext, + 0x51 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test6_expected_result, + 0x51, 0x8a, 0x54, 0xa2, 0x15, 0xa8, 0x45, 0x2a, + 0x51, 0x8a, 0x54, 0xa2, 0x15, 0xa8, 0x45, 0x2a, + 0x51, 0x8a, 0x54, 0xa2, 0x15 +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test7_plaintext, + 0x62, 0x61 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test7_expected_result, + 0xfb, 0x25, 0xd5, 0x31, 0xae, 0x89, 0x74, 0x49, + 0x9f, 0x52, 0xfd, 0x92, 0xea, 0x98, 0x57, 0xc4, + 0xba, 0x24, 0xcf, 0x29, 0x7e +); + +DEFINE_HEX_XDR_NETOBJ(nfold_test_kerberos, + 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test8_expected_result, + 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test9_expected_result, + 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73, + 0x7b, 0x9b, 0x5b, 0x2b, 0x93, 0x13, 0x2b, 0x93 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test10_expected_result, + 0x83, 0x72, 0xc2, 0x36, 0x34, 0x4e, 0x5f, 0x15, + 0x50, 0xcd, 0x07, 0x47, 0xe1, 0x5d, 0x62, 0xca, + 0x7a, 0x5a, 0x3b, 0xce, 0xa4 +); +DEFINE_HEX_XDR_NETOBJ(nfold_test11_expected_result, + 0x6b, 0x65, 0x72, 0x62, 0x65, 0x72, 0x6f, 0x73, + 0x7b, 0x9b, 0x5b, 0x2b, 0x93, 0x13, 0x2b, 0x93, + 0x5c, 0x9b, 0xdc, 0xda, 0xd9, 0x5c, 0x98, 0x99, + 0xc4, 0xca, 0xe4, 0xde, 0xe6, 0xd6, 0xca, 0xe4 +); + +static const struct gss_krb5_test_param rfc3961_nfold_test_params[] = { + { + .desc = "64-fold(\"012345\")", + .nfold = 64, + .plaintext = &nfold_test1_plaintext, + .expected_result = &nfold_test1_expected_result, + }, + { + .desc = "56-fold(\"password\")", + .nfold = 56, + .plaintext = &nfold_test2_plaintext, + .expected_result = &nfold_test2_expected_result, + }, + { + .desc = "64-fold(\"Rough Consensus, and Running Code\")", + .nfold = 64, + .plaintext = &nfold_test3_plaintext, + .expected_result = &nfold_test3_expected_result, + }, + { + .desc = "168-fold(\"password\")", + .nfold = 168, + 
.plaintext = &nfold_test4_plaintext, + .expected_result = &nfold_test4_expected_result, + }, + { + .desc = "192-fold(\"MASSACHVSETTS INSTITVTE OF TECHNOLOGY\")", + .nfold = 192, + .plaintext = &nfold_test5_plaintext, + .expected_result = &nfold_test5_expected_result, + }, + { + .desc = "168-fold(\"Q\")", + .nfold = 168, + .plaintext = &nfold_test6_plaintext, + .expected_result = &nfold_test6_expected_result, + }, + { + .desc = "168-fold(\"ba\")", + .nfold = 168, + .plaintext = &nfold_test7_plaintext, + .expected_result = &nfold_test7_expected_result, + }, + { + .desc = "64-fold(\"kerberos\")", + .nfold = 64, + .plaintext = &nfold_test_kerberos, + .expected_result = &nfold_test8_expected_result, + }, + { + .desc = "128-fold(\"kerberos\")", + .nfold = 128, + .plaintext = &nfold_test_kerberos, + .expected_result = &nfold_test9_expected_result, + }, + { + .desc = "168-fold(\"kerberos\")", + .nfold = 168, + .plaintext = &nfold_test_kerberos, + .expected_result = &nfold_test10_expected_result, + }, + { + .desc = "256-fold(\"kerberos\")", + .nfold = 256, + .plaintext = &nfold_test_kerberos, + .expected_result = &nfold_test11_expected_result, + }, +}; + +/* Creates the function rfc3961_nfold_gen_params */ +KUNIT_ARRAY_PARAM(rfc3961_nfold, rfc3961_nfold_test_params, gss_krb5_get_desc); + +static void rfc3961_nfold_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + u8 *result; + + /* Arrange */ + result = kunit_kzalloc(test, 4096, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, result); + + /* Act */ + krb5_nfold(param->plaintext->len * 8, param->plaintext->data, + param->expected_result->len * 8, result); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data, + result, param->expected_result->len), 0, + "result mismatch"); +} + +static struct kunit_case rfc3961_test_cases[] = { + { + .name = "RFC 3961 n-fold", + .run_case = rfc3961_nfold_case, + .generate_params = rfc3961_nfold_gen_params, + }, + {} 
+}; + +static struct kunit_suite rfc3961_suite = { + .name = "RFC 3961 tests", + .test_cases = rfc3961_test_cases, +}; + +/* + * From RFC 3962 Appendix B: Sample Test Vectors + * + * Some test vectors for CBC with ciphertext stealing, using an + * initial vector of all-zero. + * + * This test material is copyright (C) The Internet Society (2005). + */ + +DEFINE_HEX_XDR_NETOBJ(rfc3962_encryption_key, + 0x63, 0x68, 0x69, 0x63, 0x6b, 0x65, 0x6e, 0x20, + 0x74, 0x65, 0x72, 0x69, 0x79, 0x61, 0x6b, 0x69 +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test1_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test1_expected_result, + 0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4, + 0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f, + 0x97 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test1_next_iv, + 0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4, + 0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test2_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x20, 0x47, 0x61, 0x75, 0x27, 0x73, 0x20 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test2_expected_result, + 0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1, + 0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test2_next_iv, + 0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1, + 0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22 +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test3_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x20, 0x47, 0x61, 0x75, 0x27, 0x73, 0x20, 0x43 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test3_expected_result, + 0x39, 0x31, 0x25, 0x23, 0xa7, 
0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test3_next_iv, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8 +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test4_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x20, 0x47, 0x61, 0x75, 0x27, 0x73, 0x20, 0x43, + 0x68, 0x69, 0x63, 0x6b, 0x65, 0x6e, 0x2c, 0x20, + 0x70, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x2c +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test4_expected_result, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c, + 0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test4_next_iv, + 0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c, + 0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test5_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x20, 0x47, 0x61, 0x75, 0x27, 0x73, 0x20, 0x43, + 0x68, 0x69, 0x63, 0x6b, 0x65, 0x6e, 0x2c, 0x20, + 0x70, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x2c, 0x20 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test5_expected_result, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, + 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test5_next_iv, + 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 
0xc0, + 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8 +); + +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test6_plaintext, + 0x49, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, + 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, + 0x20, 0x47, 0x61, 0x75, 0x27, 0x73, 0x20, 0x43, + 0x68, 0x69, 0x63, 0x6b, 0x65, 0x6e, 0x2c, 0x20, + 0x70, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x2c, 0x20, + 0x61, 0x6e, 0x64, 0x20, 0x77, 0x6f, 0x6e, 0x74, + 0x6f, 0x6e, 0x20, 0x73, 0x6f, 0x75, 0x70, 0x2e +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test6_expected_result, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, + 0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5, + 0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40, + 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, + 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8 +); +DEFINE_HEX_XDR_NETOBJ(rfc3962_enc_test6_next_iv, + 0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5, + 0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40 +); + +static const struct gss_krb5_test_param rfc3962_encrypt_test_params[] = { + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 1", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test1_plaintext, + .expected_result = &rfc3962_enc_test1_expected_result, + .next_iv = &rfc3962_enc_test1_next_iv, + }, + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 2", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test2_plaintext, + .expected_result = &rfc3962_enc_test2_expected_result, + .next_iv = &rfc3962_enc_test2_next_iv, + }, + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 3", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test3_plaintext, + .expected_result = 
&rfc3962_enc_test3_expected_result, + .next_iv = &rfc3962_enc_test3_next_iv, + }, + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 4", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test4_plaintext, + .expected_result = &rfc3962_enc_test4_expected_result, + .next_iv = &rfc3962_enc_test4_next_iv, + }, + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 5", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test5_plaintext, + .expected_result = &rfc3962_enc_test5_expected_result, + .next_iv = &rfc3962_enc_test5_next_iv, + }, + { + .desc = "Encrypt with aes128-cts-hmac-sha1-96 case 6", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &rfc3962_enc_test6_plaintext, + .expected_result = &rfc3962_enc_test6_expected_result, + .next_iv = &rfc3962_enc_test6_next_iv, + }, +}; + +/* Creates the function rfc3962_encrypt_gen_params */ +KUNIT_ARRAY_PARAM(rfc3962_encrypt, rfc3962_encrypt_test_params, + gss_krb5_get_desc); + +/* + * This tests the implementation of the encryption part of the mechanism. + * It does not apply a confounder or test the result of HMAC over the + * plaintext. 
+ */ +static void rfc3962_encrypt_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + struct crypto_sync_skcipher *cts_tfm, *cbc_tfm; + const struct gss_krb5_enctype *gk5e; + struct xdr_buf buf; + void *iv, *text; + u32 err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + cbc_tfm = crypto_alloc_sync_skcipher(gk5e->aux_cipher, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cbc_tfm); + err = crypto_sync_skcipher_setkey(cbc_tfm, param->Ke->data, param->Ke->len); + KUNIT_ASSERT_EQ(test, err, 0); + + cts_tfm = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cts_tfm); + err = crypto_sync_skcipher_setkey(cts_tfm, param->Ke->data, param->Ke->len); + KUNIT_ASSERT_EQ(test, err, 0); + + iv = kunit_kzalloc(test, crypto_sync_skcipher_ivsize(cts_tfm), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, iv); + + text = kunit_kzalloc(test, param->plaintext->len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, text); + + memcpy(text, param->plaintext->data, param->plaintext->len); + memset(&buf, 0, sizeof(buf)); + buf.head[0].iov_base = text; + buf.head[0].iov_len = param->plaintext->len; + buf.len = buf.head[0].iov_len; + + /* Act */ + err = krb5_cbc_cts_encrypt(cts_tfm, cbc_tfm, 0, &buf, NULL, + iv, crypto_sync_skcipher_ivsize(cts_tfm)); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + param->expected_result->len, buf.len, + "ciphertext length mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data, + text, param->expected_result->len), 0, + "ciphertext mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->next_iv->data, iv, + param->next_iv->len), 0, + "IV mismatch"); + + crypto_free_sync_skcipher(cts_tfm); + crypto_free_sync_skcipher(cbc_tfm); +} + +static struct kunit_case rfc3962_test_cases[] = { + { + .name = "RFC 3962 encryption", + .run_case = 
rfc3962_encrypt_case, + .generate_params = rfc3962_encrypt_gen_params, + }, + {} +}; + +static struct kunit_suite rfc3962_suite = { + .name = "RFC 3962 suite", + .test_cases = rfc3962_test_cases, +}; + +/* + * From RFC 6803 Section 10. Test vectors + * + * Sample results for key derivation + * + * Copyright (c) 2012 IETF Trust and the persons identified as the + * document authors. All rights reserved. + */ + +DEFINE_HEX_XDR_NETOBJ(camellia128_cts_cmac_basekey, + 0x57, 0xd0, 0x29, 0x72, 0x98, 0xff, 0xd9, 0xd3, + 0x5d, 0xe5, 0xa4, 0x7f, 0xb4, 0xbd, 0xe2, 0x4b +); +DEFINE_HEX_XDR_NETOBJ(camellia128_cts_cmac_Kc, + 0xd1, 0x55, 0x77, 0x5a, 0x20, 0x9d, 0x05, 0xf0, + 0x2b, 0x38, 0xd4, 0x2a, 0x38, 0x9e, 0x5a, 0x56 +); +DEFINE_HEX_XDR_NETOBJ(camellia128_cts_cmac_Ke, + 0x64, 0xdf, 0x83, 0xf8, 0x5a, 0x53, 0x2f, 0x17, + 0x57, 0x7d, 0x8c, 0x37, 0x03, 0x57, 0x96, 0xab +); +DEFINE_HEX_XDR_NETOBJ(camellia128_cts_cmac_Ki, + 0x3e, 0x4f, 0xbd, 0xf3, 0x0f, 0xb8, 0x25, 0x9c, + 0x42, 0x5c, 0xb6, 0xc9, 0x6f, 0x1f, 0x46, 0x35 +); + +DEFINE_HEX_XDR_NETOBJ(camellia256_cts_cmac_basekey, + 0xb9, 0xd6, 0x82, 0x8b, 0x20, 0x56, 0xb7, 0xbe, + 0x65, 0x6d, 0x88, 0xa1, 0x23, 0xb1, 0xfa, 0xc6, + 0x82, 0x14, 0xac, 0x2b, 0x72, 0x7e, 0xcf, 0x5f, + 0x69, 0xaf, 0xe0, 0xc4, 0xdf, 0x2a, 0x6d, 0x2c +); +DEFINE_HEX_XDR_NETOBJ(camellia256_cts_cmac_Kc, + 0xe4, 0x67, 0xf9, 0xa9, 0x55, 0x2b, 0xc7, 0xd3, + 0x15, 0x5a, 0x62, 0x20, 0xaf, 0x9c, 0x19, 0x22, + 0x0e, 0xee, 0xd4, 0xff, 0x78, 0xb0, 0xd1, 0xe6, + 0xa1, 0x54, 0x49, 0x91, 0x46, 0x1a, 0x9e, 0x50 +); +DEFINE_HEX_XDR_NETOBJ(camellia256_cts_cmac_Ke, + 0x41, 0x2a, 0xef, 0xc3, 0x62, 0xa7, 0x28, 0x5f, + 0xc3, 0x96, 0x6c, 0x6a, 0x51, 0x81, 0xe7, 0x60, + 0x5a, 0xe6, 0x75, 0x23, 0x5b, 0x6d, 0x54, 0x9f, + 0xbf, 0xc9, 0xab, 0x66, 0x30, 0xa4, 0xc6, 0x04 +); +DEFINE_HEX_XDR_NETOBJ(camellia256_cts_cmac_Ki, + 0xfa, 0x62, 0x4f, 0xa0, 0xe5, 0x23, 0x99, 0x3f, + 0xa3, 0x88, 0xae, 0xfd, 0xc6, 0x7e, 0x67, 0xeb, + 0xcd, 0x8c, 0x08, 0xe8, 0xa0, 0x24, 0x6b, 0x1d, + 0x73, 0xb0, 0xd1, 
0xdd, 0x9f, 0xc5, 0x82, 0xb0 +); + +DEFINE_HEX_XDR_NETOBJ(usage_checksum, + 0x00, 0x00, 0x00, 0x02, KEY_USAGE_SEED_CHECKSUM +); +DEFINE_HEX_XDR_NETOBJ(usage_encryption, + 0x00, 0x00, 0x00, 0x02, KEY_USAGE_SEED_ENCRYPTION +); +DEFINE_HEX_XDR_NETOBJ(usage_integrity, + 0x00, 0x00, 0x00, 0x02, KEY_USAGE_SEED_INTEGRITY +); + +static const struct gss_krb5_test_param rfc6803_kdf_test_params[] = { + { + .desc = "Derive Kc subkey for camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .base_key = &camellia128_cts_cmac_basekey, + .usage = &usage_checksum, + .expected_result = &camellia128_cts_cmac_Kc, + }, + { + .desc = "Derive Ke subkey for camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .base_key = &camellia128_cts_cmac_basekey, + .usage = &usage_encryption, + .expected_result = &camellia128_cts_cmac_Ke, + }, + { + .desc = "Derive Ki subkey for camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .base_key = &camellia128_cts_cmac_basekey, + .usage = &usage_integrity, + .expected_result = &camellia128_cts_cmac_Ki, + }, + { + .desc = "Derive Kc subkey for camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .base_key = &camellia256_cts_cmac_basekey, + .usage = &usage_checksum, + .expected_result = &camellia256_cts_cmac_Kc, + }, + { + .desc = "Derive Ke subkey for camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .base_key = &camellia256_cts_cmac_basekey, + .usage = &usage_encryption, + .expected_result = &camellia256_cts_cmac_Ke, + }, + { + .desc = "Derive Ki subkey for camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .base_key = &camellia256_cts_cmac_basekey, + .usage = &usage_integrity, + .expected_result = &camellia256_cts_cmac_Ki, + }, +}; + +/* Creates the function rfc6803_kdf_gen_params */ +KUNIT_ARRAY_PARAM(rfc6803_kdf, rfc6803_kdf_test_params, gss_krb5_get_desc); + +/* + * From RFC 6803 Section 10. Test vectors + * + * Sample checksums. 
+ * + * Copyright (c) 2012 IETF Trust and the persons identified as the + * document authors. All rights reserved. + * + * XXX: These tests are likely to fail on EBCDIC or Unicode platforms. + */ +DEFINE_STR_XDR_NETOBJ(rfc6803_checksum_test1_plaintext, + "abcdefghijk"); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test1_basekey, + 0x1d, 0xc4, 0x6a, 0x8d, 0x76, 0x3f, 0x4f, 0x93, + 0x74, 0x2b, 0xcb, 0xa3, 0x38, 0x75, 0x76, 0xc3 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test1_usage, + 0x00, 0x00, 0x00, 0x07, KEY_USAGE_SEED_CHECKSUM +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test1_expected_result, + 0x11, 0x78, 0xe6, 0xc5, 0xc4, 0x7a, 0x8c, 0x1a, + 0xe0, 0xc4, 0xb9, 0xc7, 0xd4, 0xeb, 0x7b, 0x6b +); + +DEFINE_STR_XDR_NETOBJ(rfc6803_checksum_test2_plaintext, + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test2_basekey, + 0x50, 0x27, 0xbc, 0x23, 0x1d, 0x0f, 0x3a, 0x9d, + 0x23, 0x33, 0x3f, 0x1c, 0xa6, 0xfd, 0xbe, 0x7c +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test2_usage, + 0x00, 0x00, 0x00, 0x08, KEY_USAGE_SEED_CHECKSUM +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test2_expected_result, + 0xd1, 0xb3, 0x4f, 0x70, 0x04, 0xa7, 0x31, 0xf2, + 0x3a, 0x0c, 0x00, 0xbf, 0x6c, 0x3f, 0x75, 0x3a +); + +DEFINE_STR_XDR_NETOBJ(rfc6803_checksum_test3_plaintext, + "123456789"); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test3_basekey, + 0xb6, 0x1c, 0x86, 0xcc, 0x4e, 0x5d, 0x27, 0x57, + 0x54, 0x5a, 0xd4, 0x23, 0x39, 0x9f, 0xb7, 0x03, + 0x1e, 0xca, 0xb9, 0x13, 0xcb, 0xb9, 0x00, 0xbd, + 0x7a, 0x3c, 0x6d, 0xd8, 0xbf, 0x92, 0x01, 0x5b +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test3_usage, + 0x00, 0x00, 0x00, 0x09, KEY_USAGE_SEED_CHECKSUM +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test3_expected_result, + 0x87, 0xa1, 0x2c, 0xfd, 0x2b, 0x96, 0x21, 0x48, + 0x10, 0xf0, 0x1c, 0x82, 0x6e, 0x77, 0x44, 0xb1 +); + +DEFINE_STR_XDR_NETOBJ(rfc6803_checksum_test4_plaintext, + "!@#$%^&*()!@#$%^&*()!@#$%^&*()"); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test4_basekey, + 0x32, 0x16, 
0x4c, 0x5b, 0x43, 0x4d, 0x1d, 0x15, + 0x38, 0xe4, 0xcf, 0xd9, 0xbe, 0x80, 0x40, 0xfe, + 0x8c, 0x4a, 0xc7, 0xac, 0xc4, 0xb9, 0x3d, 0x33, + 0x14, 0xd2, 0x13, 0x36, 0x68, 0x14, 0x7a, 0x05 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test4_usage, + 0x00, 0x00, 0x00, 0x0a, KEY_USAGE_SEED_CHECKSUM +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_checksum_test4_expected_result, + 0x3f, 0xa0, 0xb4, 0x23, 0x55, 0xe5, 0x2b, 0x18, + 0x91, 0x87, 0x29, 0x4a, 0xa2, 0x52, 0xab, 0x64 +); + +static const struct gss_krb5_test_param rfc6803_checksum_test_params[] = { + { + .desc = "camellia128-cts-cmac checksum test 1", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .base_key = &rfc6803_checksum_test1_basekey, + .usage = &rfc6803_checksum_test1_usage, + .plaintext = &rfc6803_checksum_test1_plaintext, + .expected_result = &rfc6803_checksum_test1_expected_result, + }, + { + .desc = "camellia128-cts-cmac checksum test 2", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .base_key = &rfc6803_checksum_test2_basekey, + .usage = &rfc6803_checksum_test2_usage, + .plaintext = &rfc6803_checksum_test2_plaintext, + .expected_result = &rfc6803_checksum_test2_expected_result, + }, + { + .desc = "camellia256-cts-cmac checksum test 3", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .base_key = &rfc6803_checksum_test3_basekey, + .usage = &rfc6803_checksum_test3_usage, + .plaintext = &rfc6803_checksum_test3_plaintext, + .expected_result = &rfc6803_checksum_test3_expected_result, + }, + { + .desc = "camellia256-cts-cmac checksum test 4", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .base_key = &rfc6803_checksum_test4_basekey, + .usage = &rfc6803_checksum_test4_usage, + .plaintext = &rfc6803_checksum_test4_plaintext, + .expected_result = &rfc6803_checksum_test4_expected_result, + }, +}; + +/* Creates the function rfc6803_checksum_gen_params */ +KUNIT_ARRAY_PARAM(rfc6803_checksum, rfc6803_checksum_test_params, + gss_krb5_get_desc); + +/* + * From RFC 6803 Section 10. 
Test vectors + * + * Sample encryptions (all using the default cipher state) + * + * Copyright (c) 2012 IETF Trust and the persons identified as the + * document authors. All rights reserved. + * + * Key usage values are from errata 4326 against RFC 6803. + */ + +static const struct xdr_netobj rfc6803_enc_empty_plaintext = { + .len = 0, +}; + +DEFINE_STR_XDR_NETOBJ(rfc6803_enc_1byte_plaintext, "1"); +DEFINE_STR_XDR_NETOBJ(rfc6803_enc_9byte_plaintext, "9 bytesss"); +DEFINE_STR_XDR_NETOBJ(rfc6803_enc_13byte_plaintext, "13 bytes byte"); +DEFINE_STR_XDR_NETOBJ(rfc6803_enc_30byte_plaintext, + "30 bytes bytes bytes bytes byt" +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test1_confounder, + 0xb6, 0x98, 0x22, 0xa1, 0x9a, 0x6b, 0x09, 0xc0, + 0xeb, 0xc8, 0x55, 0x7d, 0x1f, 0x1b, 0x6c, 0x0a +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test1_basekey, + 0x1d, 0xc4, 0x6a, 0x8d, 0x76, 0x3f, 0x4f, 0x93, + 0x74, 0x2b, 0xcb, 0xa3, 0x38, 0x75, 0x76, 0xc3 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test1_expected_result, + 0xc4, 0x66, 0xf1, 0x87, 0x10, 0x69, 0x92, 0x1e, + 0xdb, 0x7c, 0x6f, 0xde, 0x24, 0x4a, 0x52, 0xdb, + 0x0b, 0xa1, 0x0e, 0xdc, 0x19, 0x7b, 0xdb, 0x80, + 0x06, 0x65, 0x8c, 0xa3, 0xcc, 0xce, 0x6e, 0xb8 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test2_confounder, + 0x6f, 0x2f, 0xc3, 0xc2, 0xa1, 0x66, 0xfd, 0x88, + 0x98, 0x96, 0x7a, 0x83, 0xde, 0x95, 0x96, 0xd9 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test2_basekey, + 0x50, 0x27, 0xbc, 0x23, 0x1d, 0x0f, 0x3a, 0x9d, + 0x23, 0x33, 0x3f, 0x1c, 0xa6, 0xfd, 0xbe, 0x7c +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test2_expected_result, + 0x84, 0x2d, 0x21, 0xfd, 0x95, 0x03, 0x11, 0xc0, + 0xdd, 0x46, 0x4a, 0x3f, 0x4b, 0xe8, 0xd6, 0xda, + 0x88, 0xa5, 0x6d, 0x55, 0x9c, 0x9b, 0x47, 0xd3, + 0xf9, 0xa8, 0x50, 0x67, 0xaf, 0x66, 0x15, 0x59, + 0xb8 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test3_confounder, + 0xa5, 0xb4, 0xa7, 0x1e, 0x07, 0x7a, 0xee, 0xf9, + 0x3c, 0x87, 0x63, 0xc1, 0x8f, 0xdb, 0x1f, 0x10 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test3_basekey, + 0xa1, 
0xbb, 0x61, 0xe8, 0x05, 0xf9, 0xba, 0x6d, + 0xde, 0x8f, 0xdb, 0xdd, 0xc0, 0x5c, 0xde, 0xa0 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test3_expected_result, + 0x61, 0x9f, 0xf0, 0x72, 0xe3, 0x62, 0x86, 0xff, + 0x0a, 0x28, 0xde, 0xb3, 0xa3, 0x52, 0xec, 0x0d, + 0x0e, 0xdf, 0x5c, 0x51, 0x60, 0xd6, 0x63, 0xc9, + 0x01, 0x75, 0x8c, 0xcf, 0x9d, 0x1e, 0xd3, 0x3d, + 0x71, 0xdb, 0x8f, 0x23, 0xaa, 0xbf, 0x83, 0x48, + 0xa0 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test4_confounder, + 0x19, 0xfe, 0xe4, 0x0d, 0x81, 0x0c, 0x52, 0x4b, + 0x5b, 0x22, 0xf0, 0x18, 0x74, 0xc6, 0x93, 0xda +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test4_basekey, + 0x2c, 0xa2, 0x7a, 0x5f, 0xaf, 0x55, 0x32, 0x24, + 0x45, 0x06, 0x43, 0x4e, 0x1c, 0xef, 0x66, 0x76 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test4_expected_result, + 0xb8, 0xec, 0xa3, 0x16, 0x7a, 0xe6, 0x31, 0x55, + 0x12, 0xe5, 0x9f, 0x98, 0xa7, 0xc5, 0x00, 0x20, + 0x5e, 0x5f, 0x63, 0xff, 0x3b, 0xb3, 0x89, 0xaf, + 0x1c, 0x41, 0xa2, 0x1d, 0x64, 0x0d, 0x86, 0x15, + 0xc9, 0xed, 0x3f, 0xbe, 0xb0, 0x5a, 0xb6, 0xac, + 0xb6, 0x76, 0x89, 0xb5, 0xea +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test5_confounder, + 0xca, 0x7a, 0x7a, 0xb4, 0xbe, 0x19, 0x2d, 0xab, + 0xd6, 0x03, 0x50, 0x6d, 0xb1, 0x9c, 0x39, 0xe2 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test5_basekey, + 0x78, 0x24, 0xf8, 0xc1, 0x6f, 0x83, 0xff, 0x35, + 0x4c, 0x6b, 0xf7, 0x51, 0x5b, 0x97, 0x3f, 0x43 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test5_expected_result, + 0xa2, 0x6a, 0x39, 0x05, 0xa4, 0xff, 0xd5, 0x81, + 0x6b, 0x7b, 0x1e, 0x27, 0x38, 0x0d, 0x08, 0x09, + 0x0c, 0x8e, 0xc1, 0xf3, 0x04, 0x49, 0x6e, 0x1a, + 0xbd, 0xcd, 0x2b, 0xdc, 0xd1, 0xdf, 0xfc, 0x66, + 0x09, 0x89, 0xe1, 0x17, 0xa7, 0x13, 0xdd, 0xbb, + 0x57, 0xa4, 0x14, 0x6c, 0x15, 0x87, 0xcb, 0xa4, + 0x35, 0x66, 0x65, 0x59, 0x1d, 0x22, 0x40, 0x28, + 0x2f, 0x58, 0x42, 0xb1, 0x05, 0xa5 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test6_confounder, + 0x3c, 0xbb, 0xd2, 0xb4, 0x59, 0x17, 0x94, 0x10, + 0x67, 0xf9, 0x65, 0x99, 0xbb, 0x98, 0x92, 0x6c +); 
+DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test6_basekey, + 0xb6, 0x1c, 0x86, 0xcc, 0x4e, 0x5d, 0x27, 0x57, + 0x54, 0x5a, 0xd4, 0x23, 0x39, 0x9f, 0xb7, 0x03, + 0x1e, 0xca, 0xb9, 0x13, 0xcb, 0xb9, 0x00, 0xbd, + 0x7a, 0x3c, 0x6d, 0xd8, 0xbf, 0x92, 0x01, 0x5b +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test6_expected_result, + 0x03, 0x88, 0x6d, 0x03, 0x31, 0x0b, 0x47, 0xa6, + 0xd8, 0xf0, 0x6d, 0x7b, 0x94, 0xd1, 0xdd, 0x83, + 0x7e, 0xcc, 0xe3, 0x15, 0xef, 0x65, 0x2a, 0xff, + 0x62, 0x08, 0x59, 0xd9, 0x4a, 0x25, 0x92, 0x66 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test7_confounder, + 0xde, 0xf4, 0x87, 0xfc, 0xeb, 0xe6, 0xde, 0x63, + 0x46, 0xd4, 0xda, 0x45, 0x21, 0xbb, 0xa2, 0xd2 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test7_basekey, + 0x1b, 0x97, 0xfe, 0x0a, 0x19, 0x0e, 0x20, 0x21, + 0xeb, 0x30, 0x75, 0x3e, 0x1b, 0x6e, 0x1e, 0x77, + 0xb0, 0x75, 0x4b, 0x1d, 0x68, 0x46, 0x10, 0x35, + 0x58, 0x64, 0x10, 0x49, 0x63, 0x46, 0x38, 0x33 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test7_expected_result, + 0x2c, 0x9c, 0x15, 0x70, 0x13, 0x3c, 0x99, 0xbf, + 0x6a, 0x34, 0xbc, 0x1b, 0x02, 0x12, 0x00, 0x2f, + 0xd1, 0x94, 0x33, 0x87, 0x49, 0xdb, 0x41, 0x35, + 0x49, 0x7a, 0x34, 0x7c, 0xfc, 0xd9, 0xd1, 0x8a, + 0x12 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test8_confounder, + 0xad, 0x4f, 0xf9, 0x04, 0xd3, 0x4e, 0x55, 0x53, + 0x84, 0xb1, 0x41, 0x00, 0xfc, 0x46, 0x5f, 0x88 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test8_basekey, + 0x32, 0x16, 0x4c, 0x5b, 0x43, 0x4d, 0x1d, 0x15, + 0x38, 0xe4, 0xcf, 0xd9, 0xbe, 0x80, 0x40, 0xfe, + 0x8c, 0x4a, 0xc7, 0xac, 0xc4, 0xb9, 0x3d, 0x33, + 0x14, 0xd2, 0x13, 0x36, 0x68, 0x14, 0x7a, 0x05 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test8_expected_result, + 0x9c, 0x6d, 0xe7, 0x5f, 0x81, 0x2d, 0xe7, 0xed, + 0x0d, 0x28, 0xb2, 0x96, 0x35, 0x57, 0xa1, 0x15, + 0x64, 0x09, 0x98, 0x27, 0x5b, 0x0a, 0xf5, 0x15, + 0x27, 0x09, 0x91, 0x3f, 0xf5, 0x2a, 0x2a, 0x9c, + 0x8e, 0x63, 0xb8, 0x72, 0xf9, 0x2e, 0x64, 0xc8, + 0x39 +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test9_confounder, + 0xcf, 0x9b, 0xca, 
0x6d, 0xf1, 0x14, 0x4e, 0x0c, + 0x0a, 0xf9, 0xb8, 0xf3, 0x4c, 0x90, 0xd5, 0x14 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test9_basekey, + 0xb0, 0x38, 0xb1, 0x32, 0xcd, 0x8e, 0x06, 0x61, + 0x22, 0x67, 0xfa, 0xb7, 0x17, 0x00, 0x66, 0xd8, + 0x8a, 0xec, 0xcb, 0xa0, 0xb7, 0x44, 0xbf, 0xc6, + 0x0d, 0xc8, 0x9b, 0xca, 0x18, 0x2d, 0x07, 0x15 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test9_expected_result, + 0xee, 0xec, 0x85, 0xa9, 0x81, 0x3c, 0xdc, 0x53, + 0x67, 0x72, 0xab, 0x9b, 0x42, 0xde, 0xfc, 0x57, + 0x06, 0xf7, 0x26, 0xe9, 0x75, 0xdd, 0xe0, 0x5a, + 0x87, 0xeb, 0x54, 0x06, 0xea, 0x32, 0x4c, 0xa1, + 0x85, 0xc9, 0x98, 0x6b, 0x42, 0xaa, 0xbe, 0x79, + 0x4b, 0x84, 0x82, 0x1b, 0xee +); + +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test10_confounder, + 0x64, 0x4d, 0xef, 0x38, 0xda, 0x35, 0x00, 0x72, + 0x75, 0x87, 0x8d, 0x21, 0x68, 0x55, 0xe2, 0x28 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test10_basekey, + 0xcc, 0xfc, 0xd3, 0x49, 0xbf, 0x4c, 0x66, 0x77, + 0xe8, 0x6e, 0x4b, 0x02, 0xb8, 0xea, 0xb9, 0x24, + 0xa5, 0x46, 0xac, 0x73, 0x1c, 0xf9, 0xbf, 0x69, + 0x89, 0xb9, 0x96, 0xe7, 0xd6, 0xbf, 0xbb, 0xa7 +); +DEFINE_HEX_XDR_NETOBJ(rfc6803_enc_test10_expected_result, + 0x0e, 0x44, 0x68, 0x09, 0x85, 0x85, 0x5f, 0x2d, + 0x1f, 0x18, 0x12, 0x52, 0x9c, 0xa8, 0x3b, 0xfd, + 0x8e, 0x34, 0x9d, 0xe6, 0xfd, 0x9a, 0xda, 0x0b, + 0xaa, 0xa0, 0x48, 0xd6, 0x8e, 0x26, 0x5f, 0xeb, + 0xf3, 0x4a, 0xd1, 0x25, 0x5a, 0x34, 0x49, 0x99, + 0xad, 0x37, 0x14, 0x68, 0x87, 0xa6, 0xc6, 0x84, + 0x57, 0x31, 0xac, 0x7f, 0x46, 0x37, 0x6a, 0x05, + 0x04, 0xcd, 0x06, 0x57, 0x14, 0x74 +); + +static const struct gss_krb5_test_param rfc6803_encrypt_test_params[] = { + { + .desc = "Encrypt empty plaintext with camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .constant = 0, + .base_key = &rfc6803_enc_test1_basekey, + .plaintext = &rfc6803_enc_empty_plaintext, + .confounder = &rfc6803_enc_test1_confounder, + .expected_result = &rfc6803_enc_test1_expected_result, + }, + { + .desc = "Encrypt 1 byte with camellia128-cts-cmac", 
+ .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .constant = 1, + .base_key = &rfc6803_enc_test2_basekey, + .plaintext = &rfc6803_enc_1byte_plaintext, + .confounder = &rfc6803_enc_test2_confounder, + .expected_result = &rfc6803_enc_test2_expected_result, + }, + { + .desc = "Encrypt 9 bytes with camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .constant = 2, + .base_key = &rfc6803_enc_test3_basekey, + .plaintext = &rfc6803_enc_9byte_plaintext, + .confounder = &rfc6803_enc_test3_confounder, + .expected_result = &rfc6803_enc_test3_expected_result, + }, + { + .desc = "Encrypt 13 bytes with camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .constant = 3, + .base_key = &rfc6803_enc_test4_basekey, + .plaintext = &rfc6803_enc_13byte_plaintext, + .confounder = &rfc6803_enc_test4_confounder, + .expected_result = &rfc6803_enc_test4_expected_result, + }, + { + .desc = "Encrypt 30 bytes with camellia128-cts-cmac", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .constant = 4, + .base_key = &rfc6803_enc_test5_basekey, + .plaintext = &rfc6803_enc_30byte_plaintext, + .confounder = &rfc6803_enc_test5_confounder, + .expected_result = &rfc6803_enc_test5_expected_result, + }, + { + .desc = "Encrypt empty plaintext with camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .constant = 0, + .base_key = &rfc6803_enc_test6_basekey, + .plaintext = &rfc6803_enc_empty_plaintext, + .confounder = &rfc6803_enc_test6_confounder, + .expected_result = &rfc6803_enc_test6_expected_result, + }, + { + .desc = "Encrypt 1 byte with camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .constant = 1, + .base_key = &rfc6803_enc_test7_basekey, + .plaintext = &rfc6803_enc_1byte_plaintext, + .confounder = &rfc6803_enc_test7_confounder, + .expected_result = &rfc6803_enc_test7_expected_result, + }, + { + .desc = "Encrypt 9 bytes with camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .constant = 2, + .base_key = 
&rfc6803_enc_test8_basekey, + .plaintext = &rfc6803_enc_9byte_plaintext, + .confounder = &rfc6803_enc_test8_confounder, + .expected_result = &rfc6803_enc_test8_expected_result, + }, + { + .desc = "Encrypt 13 bytes with camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .constant = 3, + .base_key = &rfc6803_enc_test9_basekey, + .plaintext = &rfc6803_enc_13byte_plaintext, + .confounder = &rfc6803_enc_test9_confounder, + .expected_result = &rfc6803_enc_test9_expected_result, + }, + { + .desc = "Encrypt 30 bytes with camellia256-cts-cmac", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .constant = 4, + .base_key = &rfc6803_enc_test10_basekey, + .plaintext = &rfc6803_enc_30byte_plaintext, + .confounder = &rfc6803_enc_test10_confounder, + .expected_result = &rfc6803_enc_test10_expected_result, + }, +}; + +/* Creates the function rfc6803_encrypt_gen_params */ +KUNIT_ARRAY_PARAM(rfc6803_encrypt, rfc6803_encrypt_test_params, + gss_krb5_get_desc); + +static void rfc6803_encrypt_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + struct crypto_sync_skcipher *cts_tfm, *cbc_tfm; + const struct gss_krb5_enctype *gk5e; + struct xdr_netobj Ke, Ki, checksum; + u8 usage_data[GSS_KRB5_K5CLENGTH]; + struct xdr_netobj usage = { + .data = usage_data, + .len = sizeof(usage_data), + }; + struct crypto_ahash *ahash_tfm; + unsigned int blocksize; + struct xdr_buf buf; + void *text; + size_t len; + u32 err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + memset(usage_data, 0, sizeof(usage_data)); + usage.data[3] = param->constant; + + Ke.len = gk5e->Ke_length; + Ke.data = kunit_kzalloc(test, Ke.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, Ke.data); + usage.data[4] = KEY_USAGE_SEED_ENCRYPTION; + err = gk5e->derive_key(gk5e, param->base_key, &Ke, &usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + + cbc_tfm = 
crypto_alloc_sync_skcipher(gk5e->aux_cipher, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cbc_tfm); + err = crypto_sync_skcipher_setkey(cbc_tfm, Ke.data, Ke.len); + KUNIT_ASSERT_EQ(test, err, 0); + + cts_tfm = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cts_tfm); + err = crypto_sync_skcipher_setkey(cts_tfm, Ke.data, Ke.len); + KUNIT_ASSERT_EQ(test, err, 0); + blocksize = crypto_sync_skcipher_blocksize(cts_tfm); + + len = param->confounder->len + param->plaintext->len + blocksize; + text = kunit_kzalloc(test, len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, text); + memcpy(text, param->confounder->data, param->confounder->len); + memcpy(text + param->confounder->len, param->plaintext->data, + param->plaintext->len); + + memset(&buf, 0, sizeof(buf)); + buf.head[0].iov_base = text; + buf.head[0].iov_len = param->confounder->len + param->plaintext->len; + buf.len = buf.head[0].iov_len; + + checksum.len = gk5e->cksumlength; + checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data); + + Ki.len = gk5e->Ki_length; + Ki.data = kunit_kzalloc(test, Ki.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, Ki.data); + usage.data[4] = KEY_USAGE_SEED_INTEGRITY; + err = gk5e->derive_key(gk5e, param->base_key, &Ki, + &usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + ahash_tfm = crypto_alloc_ahash(gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ahash_tfm); + err = crypto_ahash_setkey(ahash_tfm, Ki.data, Ki.len); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Act */ + err = gss_krb5_checksum(ahash_tfm, NULL, 0, &buf, 0, &checksum); + KUNIT_ASSERT_EQ(test, err, 0); + + err = krb5_cbc_cts_encrypt(cts_tfm, cbc_tfm, 0, &buf, NULL, NULL, 0); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, + buf.len + checksum.len, + "ciphertext length mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + 
memcmp(param->expected_result->data, + buf.head[0].iov_base, buf.len), 0, + "encrypted result mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data + + (param->expected_result->len - checksum.len), + checksum.data, checksum.len), 0, + "HMAC mismatch"); + + crypto_free_ahash(ahash_tfm); + crypto_free_sync_skcipher(cts_tfm); + crypto_free_sync_skcipher(cbc_tfm); +} + +static struct kunit_case rfc6803_test_cases[] = { + { + .name = "RFC 6803 key derivation", + .run_case = kdf_case, + .generate_params = rfc6803_kdf_gen_params, + }, + { + .name = "RFC 6803 checksum", + .run_case = checksum_case, + .generate_params = rfc6803_checksum_gen_params, + }, + { + .name = "RFC 6803 encryption", + .run_case = rfc6803_encrypt_case, + .generate_params = rfc6803_encrypt_gen_params, + }, + {} +}; + +static struct kunit_suite rfc6803_suite = { + .name = "RFC 6803 suite", + .test_cases = rfc6803_test_cases, +}; + +/* + * From RFC 8009 Appendix A. Test Vectors + * + * Sample results for SHA-2 enctype key derivation + * + * This test material is copyright (c) 2016 IETF Trust and the + * persons identified as the document authors. All rights reserved. 
+ */ + +DEFINE_HEX_XDR_NETOBJ(aes128_cts_hmac_sha256_128_basekey, + 0x37, 0x05, 0xd9, 0x60, 0x80, 0xc1, 0x77, 0x28, + 0xa0, 0xe8, 0x00, 0xea, 0xb6, 0xe0, 0xd2, 0x3c +); +DEFINE_HEX_XDR_NETOBJ(aes128_cts_hmac_sha256_128_Kc, + 0xb3, 0x1a, 0x01, 0x8a, 0x48, 0xf5, 0x47, 0x76, + 0xf4, 0x03, 0xe9, 0xa3, 0x96, 0x32, 0x5d, 0xc3 +); +DEFINE_HEX_XDR_NETOBJ(aes128_cts_hmac_sha256_128_Ke, + 0x9b, 0x19, 0x7d, 0xd1, 0xe8, 0xc5, 0x60, 0x9d, + 0x6e, 0x67, 0xc3, 0xe3, 0x7c, 0x62, 0xc7, 0x2e +); +DEFINE_HEX_XDR_NETOBJ(aes128_cts_hmac_sha256_128_Ki, + 0x9f, 0xda, 0x0e, 0x56, 0xab, 0x2d, 0x85, 0xe1, + 0x56, 0x9a, 0x68, 0x86, 0x96, 0xc2, 0x6a, 0x6c +); + +DEFINE_HEX_XDR_NETOBJ(aes256_cts_hmac_sha384_192_basekey, + 0x6d, 0x40, 0x4d, 0x37, 0xfa, 0xf7, 0x9f, 0x9d, + 0xf0, 0xd3, 0x35, 0x68, 0xd3, 0x20, 0x66, 0x98, + 0x00, 0xeb, 0x48, 0x36, 0x47, 0x2e, 0xa8, 0xa0, + 0x26, 0xd1, 0x6b, 0x71, 0x82, 0x46, 0x0c, 0x52 +); +DEFINE_HEX_XDR_NETOBJ(aes256_cts_hmac_sha384_192_Kc, + 0xef, 0x57, 0x18, 0xbe, 0x86, 0xcc, 0x84, 0x96, + 0x3d, 0x8b, 0xbb, 0x50, 0x31, 0xe9, 0xf5, 0xc4, + 0xba, 0x41, 0xf2, 0x8f, 0xaf, 0x69, 0xe7, 0x3d +); +DEFINE_HEX_XDR_NETOBJ(aes256_cts_hmac_sha384_192_Ke, + 0x56, 0xab, 0x22, 0xbe, 0xe6, 0x3d, 0x82, 0xd7, + 0xbc, 0x52, 0x27, 0xf6, 0x77, 0x3f, 0x8e, 0xa7, + 0xa5, 0xeb, 0x1c, 0x82, 0x51, 0x60, 0xc3, 0x83, + 0x12, 0x98, 0x0c, 0x44, 0x2e, 0x5c, 0x7e, 0x49 +); +DEFINE_HEX_XDR_NETOBJ(aes256_cts_hmac_sha384_192_Ki, + 0x69, 0xb1, 0x65, 0x14, 0xe3, 0xcd, 0x8e, 0x56, + 0xb8, 0x20, 0x10, 0xd5, 0xc7, 0x30, 0x12, 0xb6, + 0x22, 0xc4, 0xd0, 0x0f, 0xfc, 0x23, 0xed, 0x1f +); + +static const struct gss_krb5_test_param rfc8009_kdf_test_params[] = { + { + .desc = "Derive Kc subkey for aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .usage = &usage_checksum, + .expected_result = &aes128_cts_hmac_sha256_128_Kc, + }, + { + .desc = "Derive Ke subkey for aes128-cts-hmac-sha256-128", + .enctype = 
ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .usage = &usage_encryption, + .expected_result = &aes128_cts_hmac_sha256_128_Ke, + }, + { + .desc = "Derive Ki subkey for aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .usage = &usage_integrity, + .expected_result = &aes128_cts_hmac_sha256_128_Ki, + }, + { + .desc = "Derive Kc subkey for aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .usage = &usage_checksum, + .expected_result = &aes256_cts_hmac_sha384_192_Kc, + }, + { + .desc = "Derive Ke subkey for aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .usage = &usage_encryption, + .expected_result = &aes256_cts_hmac_sha384_192_Ke, + }, + { + .desc = "Derive Ki subkey for aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .usage = &usage_integrity, + .expected_result = &aes256_cts_hmac_sha384_192_Ki, + }, +}; + +/* Creates the function rfc8009_kdf_gen_params */ +KUNIT_ARRAY_PARAM(rfc8009_kdf, rfc8009_kdf_test_params, gss_krb5_get_desc); + +/* + * From RFC 8009 Appendix A. Test Vectors + * + * These sample checksums use the above sample key derivation results, + * including use of the same base-key and key usage values. + * + * This test material is copyright (c) 2016 IETF Trust and the + * persons identified as the document authors. All rights reserved. 
+ */ + +DEFINE_HEX_XDR_NETOBJ(rfc8009_checksum_plaintext, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_checksum_test1_expected_result, + 0xd7, 0x83, 0x67, 0x18, 0x66, 0x43, 0xd6, 0x7b, + 0x41, 0x1c, 0xba, 0x91, 0x39, 0xfc, 0x1d, 0xee +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_checksum_test2_expected_result, + 0x45, 0xee, 0x79, 0x15, 0x67, 0xee, 0xfc, 0xa3, + 0x7f, 0x4a, 0xc1, 0xe0, 0x22, 0x2d, 0xe8, 0x0d, + 0x43, 0xc3, 0xbf, 0xa0, 0x66, 0x99, 0x67, 0x2a +); + +static const struct gss_krb5_test_param rfc8009_checksum_test_params[] = { + { + .desc = "Checksum with aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .usage = &usage_checksum, + .plaintext = &rfc8009_checksum_plaintext, + .expected_result = &rfc8009_checksum_test1_expected_result, + }, + { + .desc = "Checksum with aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .usage = &usage_checksum, + .plaintext = &rfc8009_checksum_plaintext, + .expected_result = &rfc8009_checksum_test2_expected_result, + }, +}; + +/* Creates the function rfc8009_checksum_gen_params */ +KUNIT_ARRAY_PARAM(rfc8009_checksum, rfc8009_checksum_test_params, + gss_krb5_get_desc); + +/* + * From RFC 8009 Appendix A. Test Vectors + * + * Sample encryptions (all using the default cipher state): + * -------------------------------------------------------- + * + * These sample encryptions use the above sample key derivation results, + * including use of the same base-key and key usage values. + * + * This test material is copyright (c) 2016 IETF Trust and the + * persons identified as the document authors. All rights reserved. 
+ */ + +static const struct xdr_netobj rfc8009_enc_empty_plaintext = { + .len = 0, +}; +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_short_plaintext, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_block_plaintext, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_long_plaintext, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14 +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test1_confounder, + 0x7e, 0x58, 0x95, 0xea, 0xf2, 0x67, 0x24, 0x35, + 0xba, 0xd8, 0x17, 0xf5, 0x45, 0xa3, 0x71, 0x48 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test1_expected_result, + 0xef, 0x85, 0xfb, 0x89, 0x0b, 0xb8, 0x47, 0x2f, + 0x4d, 0xab, 0x20, 0x39, 0x4d, 0xca, 0x78, 0x1d +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test1_expected_hmac, + 0xad, 0x87, 0x7e, 0xda, 0x39, 0xd5, 0x0c, 0x87, + 0x0c, 0x0d, 0x5a, 0x0a, 0x8e, 0x48, 0xc7, 0x18 +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test2_confounder, + 0x7b, 0xca, 0x28, 0x5e, 0x2f, 0xd4, 0x13, 0x0f, + 0xb5, 0x5b, 0x1a, 0x5c, 0x83, 0xbc, 0x5b, 0x24 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test2_expected_result, + 0x84, 0xd7, 0xf3, 0x07, 0x54, 0xed, 0x98, 0x7b, + 0xab, 0x0b, 0xf3, 0x50, 0x6b, 0xeb, 0x09, 0xcf, + 0xb5, 0x54, 0x02, 0xce, 0xf7, 0xe6 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test2_expected_hmac, + 0x87, 0x7c, 0xe9, 0x9e, 0x24, 0x7e, 0x52, 0xd1, + 0x6e, 0xd4, 0x42, 0x1d, 0xfd, 0xf8, 0x97, 0x6c +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test3_confounder, + 0x56, 0xab, 0x21, 0x71, 0x3f, 0xf6, 0x2c, 0x0a, + 0x14, 0x57, 0x20, 0x0f, 0x6f, 0xa9, 0x94, 0x8f +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test3_expected_result, + 0x35, 0x17, 0xd6, 0x40, 0xf5, 0x0d, 0xdc, 0x8a, + 0xd3, 0x62, 0x87, 0x22, 0xb3, 0x56, 0x9d, 0x2a, + 0xe0, 0x74, 0x93, 0xfa, 0x82, 0x63, 0x25, 0x40, + 0x80, 0xea, 0x65, 0xc1, 0x00, 0x8e, 0x8f, 0xc2 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test3_expected_hmac, + 0x95, 
0xfb, 0x48, 0x52, 0xe7, 0xd8, 0x3e, 0x1e, + 0x7c, 0x48, 0xc3, 0x7e, 0xeb, 0xe6, 0xb0, 0xd3 +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test4_confounder, + 0xa7, 0xa4, 0xe2, 0x9a, 0x47, 0x28, 0xce, 0x10, + 0x66, 0x4f, 0xb6, 0x4e, 0x49, 0xad, 0x3f, 0xac +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test4_expected_result, + 0x72, 0x0f, 0x73, 0xb1, 0x8d, 0x98, 0x59, 0xcd, + 0x6c, 0xcb, 0x43, 0x46, 0x11, 0x5c, 0xd3, 0x36, + 0xc7, 0x0f, 0x58, 0xed, 0xc0, 0xc4, 0x43, 0x7c, + 0x55, 0x73, 0x54, 0x4c, 0x31, 0xc8, 0x13, 0xbc, + 0xe1, 0xe6, 0xd0, 0x72, 0xc1 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test4_expected_hmac, + 0x86, 0xb3, 0x9a, 0x41, 0x3c, 0x2f, 0x92, 0xca, + 0x9b, 0x83, 0x34, 0xa2, 0x87, 0xff, 0xcb, 0xfc +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test5_confounder, + 0xf7, 0x64, 0xe9, 0xfa, 0x15, 0xc2, 0x76, 0x47, + 0x8b, 0x2c, 0x7d, 0x0c, 0x4e, 0x5f, 0x58, 0xe4 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test5_expected_result, + 0x41, 0xf5, 0x3f, 0xa5, 0xbf, 0xe7, 0x02, 0x6d, + 0x91, 0xfa, 0xf9, 0xbe, 0x95, 0x91, 0x95, 0xa0 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test5_expected_hmac, + 0x58, 0x70, 0x72, 0x73, 0xa9, 0x6a, 0x40, 0xf0, + 0xa0, 0x19, 0x60, 0x62, 0x1a, 0xc6, 0x12, 0x74, + 0x8b, 0x9b, 0xbf, 0xbe, 0x7e, 0xb4, 0xce, 0x3c +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test6_confounder, + 0xb8, 0x0d, 0x32, 0x51, 0xc1, 0xf6, 0x47, 0x14, + 0x94, 0x25, 0x6f, 0xfe, 0x71, 0x2d, 0x0b, 0x9a +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test6_expected_result, + 0x4e, 0xd7, 0xb3, 0x7c, 0x2b, 0xca, 0xc8, 0xf7, + 0x4f, 0x23, 0xc1, 0xcf, 0x07, 0xe6, 0x2b, 0xc7, + 0xb7, 0x5f, 0xb3, 0xf6, 0x37, 0xb9 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test6_expected_hmac, + 0xf5, 0x59, 0xc7, 0xf6, 0x64, 0xf6, 0x9e, 0xab, + 0x7b, 0x60, 0x92, 0x23, 0x75, 0x26, 0xea, 0x0d, + 0x1f, 0x61, 0xcb, 0x20, 0xd6, 0x9d, 0x10, 0xf2 +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test7_confounder, + 0x53, 0xbf, 0x8a, 0x0d, 0x10, 0x52, 0x65, 0xd4, + 0xe2, 0x76, 0x42, 0x86, 0x24, 0xce, 0x5e, 0x63 +); 
+DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test7_expected_result, + 0xbc, 0x47, 0xff, 0xec, 0x79, 0x98, 0xeb, 0x91, + 0xe8, 0x11, 0x5c, 0xf8, 0xd1, 0x9d, 0xac, 0x4b, + 0xbb, 0xe2, 0xe1, 0x63, 0xe8, 0x7d, 0xd3, 0x7f, + 0x49, 0xbe, 0xca, 0x92, 0x02, 0x77, 0x64, 0xf6 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test7_expected_hmac, + 0x8c, 0xf5, 0x1f, 0x14, 0xd7, 0x98, 0xc2, 0x27, + 0x3f, 0x35, 0xdf, 0x57, 0x4d, 0x1f, 0x93, 0x2e, + 0x40, 0xc4, 0xff, 0x25, 0x5b, 0x36, 0xa2, 0x66 +); + +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test8_confounder, + 0x76, 0x3e, 0x65, 0x36, 0x7e, 0x86, 0x4f, 0x02, + 0xf5, 0x51, 0x53, 0xc7, 0xe3, 0xb5, 0x8a, 0xf1 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test8_expected_result, + 0x40, 0x01, 0x3e, 0x2d, 0xf5, 0x8e, 0x87, 0x51, + 0x95, 0x7d, 0x28, 0x78, 0xbc, 0xd2, 0xd6, 0xfe, + 0x10, 0x1c, 0xcf, 0xd5, 0x56, 0xcb, 0x1e, 0xae, + 0x79, 0xdb, 0x3c, 0x3e, 0xe8, 0x64, 0x29, 0xf2, + 0xb2, 0xa6, 0x02, 0xac, 0x86 +); +DEFINE_HEX_XDR_NETOBJ(rfc8009_enc_test8_expected_hmac, + 0xfe, 0xf6, 0xec, 0xb6, 0x47, 0xd6, 0x29, 0x5f, + 0xae, 0x07, 0x7a, 0x1f, 0xeb, 0x51, 0x75, 0x08, + 0xd2, 0xc1, 0x6b, 0x41, 0x92, 0xe0, 0x1f, 0x62 +); + +static const struct gss_krb5_test_param rfc8009_encrypt_test_params[] = { + { + .desc = "Encrypt empty plaintext with aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .plaintext = &rfc8009_enc_empty_plaintext, + .confounder = &rfc8009_enc_test1_confounder, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .expected_result = &rfc8009_enc_test1_expected_result, + .expected_hmac = &rfc8009_enc_test1_expected_hmac, + }, + { + .desc = "Encrypt short plaintext with aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .plaintext = &rfc8009_enc_short_plaintext, + .confounder = &rfc8009_enc_test2_confounder, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .expected_result = &rfc8009_enc_test2_expected_result, + .expected_hmac = &rfc8009_enc_test2_expected_hmac, + }, + { + .desc = "Encrypt block 
plaintext with aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .plaintext = &rfc8009_enc_block_plaintext, + .confounder = &rfc8009_enc_test3_confounder, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .expected_result = &rfc8009_enc_test3_expected_result, + .expected_hmac = &rfc8009_enc_test3_expected_hmac, + }, + { + .desc = "Encrypt long plaintext with aes128-cts-hmac-sha256-128", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .plaintext = &rfc8009_enc_long_plaintext, + .confounder = &rfc8009_enc_test4_confounder, + .base_key = &aes128_cts_hmac_sha256_128_basekey, + .expected_result = &rfc8009_enc_test4_expected_result, + .expected_hmac = &rfc8009_enc_test4_expected_hmac, + }, + { + .desc = "Encrypt empty plaintext with aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .plaintext = &rfc8009_enc_empty_plaintext, + .confounder = &rfc8009_enc_test5_confounder, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .expected_result = &rfc8009_enc_test5_expected_result, + .expected_hmac = &rfc8009_enc_test5_expected_hmac, + }, + { + .desc = "Encrypt short plaintext with aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .plaintext = &rfc8009_enc_short_plaintext, + .confounder = &rfc8009_enc_test6_confounder, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .expected_result = &rfc8009_enc_test6_expected_result, + .expected_hmac = &rfc8009_enc_test6_expected_hmac, + }, + { + .desc = "Encrypt block plaintext with aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .plaintext = &rfc8009_enc_block_plaintext, + .confounder = &rfc8009_enc_test7_confounder, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .expected_result = &rfc8009_enc_test7_expected_result, + .expected_hmac = &rfc8009_enc_test7_expected_hmac, + }, + { + .desc = "Encrypt long plaintext with aes256-cts-hmac-sha384-192", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + 
.plaintext = &rfc8009_enc_long_plaintext, + .confounder = &rfc8009_enc_test8_confounder, + .base_key = &aes256_cts_hmac_sha384_192_basekey, + .expected_result = &rfc8009_enc_test8_expected_result, + .expected_hmac = &rfc8009_enc_test8_expected_hmac, + }, +}; + +/* Creates the function rfc8009_encrypt_gen_params */ +KUNIT_ARRAY_PARAM(rfc8009_encrypt, rfc8009_encrypt_test_params, + gss_krb5_get_desc); + +static void rfc8009_encrypt_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + struct crypto_sync_skcipher *cts_tfm, *cbc_tfm; + const struct gss_krb5_enctype *gk5e; + struct xdr_netobj Ke, Ki, checksum; + u8 usage_data[GSS_KRB5_K5CLENGTH]; + struct xdr_netobj usage = { + .data = usage_data, + .len = sizeof(usage_data), + }; + struct crypto_ahash *ahash_tfm; + struct xdr_buf buf; + void *text; + size_t len; + u32 err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + *(__be32 *)usage.data = cpu_to_be32(2); + + Ke.len = gk5e->Ke_length; + Ke.data = kunit_kzalloc(test, Ke.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, Ke.data); + usage.data[4] = KEY_USAGE_SEED_ENCRYPTION; + err = gk5e->derive_key(gk5e, param->base_key, &Ke, + &usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + + cbc_tfm = crypto_alloc_sync_skcipher(gk5e->aux_cipher, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cbc_tfm); + err = crypto_sync_skcipher_setkey(cbc_tfm, Ke.data, Ke.len); + KUNIT_ASSERT_EQ(test, err, 0); + + cts_tfm = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cts_tfm); + err = crypto_sync_skcipher_setkey(cts_tfm, Ke.data, Ke.len); + KUNIT_ASSERT_EQ(test, err, 0); + + len = param->confounder->len + param->plaintext->len; + text = kunit_kzalloc(test, len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, text); + memcpy(text, param->confounder->data, param->confounder->len); + memcpy(text + 
param->confounder->len, param->plaintext->data, + param->plaintext->len); + + memset(&buf, 0, sizeof(buf)); + buf.head[0].iov_base = text; + buf.head[0].iov_len = param->confounder->len + param->plaintext->len; + buf.len = buf.head[0].iov_len; + + checksum.len = gk5e->cksumlength; + checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data); + + Ki.len = gk5e->Ki_length; + Ki.data = kunit_kzalloc(test, Ki.len, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, Ki.data); + usage.data[4] = KEY_USAGE_SEED_INTEGRITY; + err = gk5e->derive_key(gk5e, param->base_key, &Ki, + &usage, GFP_KERNEL); + KUNIT_ASSERT_EQ(test, err, 0); + + ahash_tfm = crypto_alloc_ahash(gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ahash_tfm); + err = crypto_ahash_setkey(ahash_tfm, Ki.data, Ki.len); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Act */ + err = krb5_cbc_cts_encrypt(cts_tfm, cbc_tfm, 0, &buf, NULL, NULL, 0); + KUNIT_ASSERT_EQ(test, err, 0); + err = krb5_etm_checksum(cts_tfm, ahash_tfm, &buf, 0, &checksum); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + param->expected_result->len, buf.len, + "ciphertext length mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->expected_result->data, + buf.head[0].iov_base, + param->expected_result->len), 0, + "ciphertext mismatch"); + KUNIT_EXPECT_EQ_MSG(test, memcmp(param->expected_hmac->data, + checksum.data, + checksum.len), 0, + "HMAC mismatch"); + + crypto_free_ahash(ahash_tfm); + crypto_free_sync_skcipher(cts_tfm); + crypto_free_sync_skcipher(cbc_tfm); +} + +static struct kunit_case rfc8009_test_cases[] = { + { + .name = "RFC 8009 key derivation", + .run_case = kdf_case, + .generate_params = rfc8009_kdf_gen_params, + }, + { + .name = "RFC 8009 checksum", + .run_case = checksum_case, + .generate_params = rfc8009_checksum_gen_params, + }, + { + .name = "RFC 8009 encryption", + .run_case = rfc8009_encrypt_case, + .generate_params = 
rfc8009_encrypt_gen_params, + }, + {} +}; + +static struct kunit_suite rfc8009_suite = { + .name = "RFC 8009 suite", + .test_cases = rfc8009_test_cases, +}; + +/* + * Encryption self-tests + */ + +DEFINE_STR_XDR_NETOBJ(encrypt_selftest_plaintext, + "This is the plaintext for the encryption self-test."); + +static const struct gss_krb5_test_param encrypt_selftest_params[] = { + { + .desc = "aes128-cts-hmac-sha1-96 encryption self-test", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &encrypt_selftest_plaintext, + }, + { + .desc = "aes256-cts-hmac-sha1-96 encryption self-test", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, + .Ke = &rfc3962_encryption_key, + .plaintext = &encrypt_selftest_plaintext, + }, + { + .desc = "camellia128-cts-cmac encryption self-test", + .enctype = ENCTYPE_CAMELLIA128_CTS_CMAC, + .Ke = &camellia128_cts_cmac_Ke, + .plaintext = &encrypt_selftest_plaintext, + }, + { + .desc = "camellia256-cts-cmac encryption self-test", + .enctype = ENCTYPE_CAMELLIA256_CTS_CMAC, + .Ke = &camellia256_cts_cmac_Ke, + .plaintext = &encrypt_selftest_plaintext, + }, + { + .desc = "aes128-cts-hmac-sha256-128 encryption self-test", + .enctype = ENCTYPE_AES128_CTS_HMAC_SHA256_128, + .Ke = &aes128_cts_hmac_sha256_128_Ke, + .plaintext = &encrypt_selftest_plaintext, + }, + { + .desc = "aes256-cts-hmac-sha384-192 encryption self-test", + .enctype = ENCTYPE_AES256_CTS_HMAC_SHA384_192, + .Ke = &aes256_cts_hmac_sha384_192_Ke, + .plaintext = &encrypt_selftest_plaintext, + }, +}; + +/* Creates the function encrypt_selftest_gen_params */ +KUNIT_ARRAY_PARAM(encrypt_selftest, encrypt_selftest_params, + gss_krb5_get_desc); + +/* + * Encrypt and decrypt plaintext, and ensure the input plaintext + * matches the output plaintext. A confounder is not added in this + * case. 
+ */ +static void encrypt_selftest_case(struct kunit *test) +{ + const struct gss_krb5_test_param *param = test->param_value; + struct crypto_sync_skcipher *cts_tfm, *cbc_tfm; + const struct gss_krb5_enctype *gk5e; + struct xdr_buf buf; + void *text; + int err; + + /* Arrange */ + gk5e = gss_krb5_lookup_enctype(param->enctype); + if (!gk5e) + kunit_skip(test, "Encryption type is not available"); + + cbc_tfm = crypto_alloc_sync_skcipher(gk5e->aux_cipher, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cbc_tfm); + err = crypto_sync_skcipher_setkey(cbc_tfm, param->Ke->data, param->Ke->len); + KUNIT_ASSERT_EQ(test, err, 0); + + cts_tfm = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cts_tfm); + err = crypto_sync_skcipher_setkey(cts_tfm, param->Ke->data, param->Ke->len); + KUNIT_ASSERT_EQ(test, err, 0); + + text = kunit_kzalloc(test, roundup(param->plaintext->len, + crypto_sync_skcipher_blocksize(cbc_tfm)), + GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, text); + + memcpy(text, param->plaintext->data, param->plaintext->len); + memset(&buf, 0, sizeof(buf)); + buf.head[0].iov_base = text; + buf.head[0].iov_len = param->plaintext->len; + buf.len = buf.head[0].iov_len; + + /* Act */ + err = krb5_cbc_cts_encrypt(cts_tfm, cbc_tfm, 0, &buf, NULL, NULL, 0); + KUNIT_ASSERT_EQ(test, err, 0); + err = krb5_cbc_cts_decrypt(cts_tfm, cbc_tfm, 0, &buf); + KUNIT_ASSERT_EQ(test, err, 0); + + /* Assert */ + KUNIT_EXPECT_EQ_MSG(test, + param->plaintext->len, buf.len, + "length mismatch"); + KUNIT_EXPECT_EQ_MSG(test, + memcmp(param->plaintext->data, + buf.head[0].iov_base, buf.len), 0, + "plaintext mismatch"); + + crypto_free_sync_skcipher(cts_tfm); + crypto_free_sync_skcipher(cbc_tfm); +} + +static struct kunit_case encryption_test_cases[] = { + { + .name = "Encryption self-tests", + .run_case = encrypt_selftest_case, + .generate_params = encrypt_selftest_gen_params, + }, + {} +}; + +static struct kunit_suite encryption_test_suite = { + .name = 
"Encryption test suite", + .test_cases = encryption_test_cases, +}; + +kunit_test_suites(&rfc3961_suite, + &rfc3962_suite, + &rfc6803_suite, + &rfc8009_suite, + &encryption_test_suite); + +MODULE_DESCRIPTION("Test RPCSEC GSS Kerberos 5 functions"); +MODULE_LICENSE("GPL"); diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index ba04e3ec970a..ef0e6af9fc95 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -60,102 +60,29 @@ #include <linux/types.h> #include <linux/jiffies.h> #include <linux/sunrpc/gss_krb5.h> -#include <linux/crypto.h> + +#include "gss_krb5_internal.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif - -/* read_token is a mic token, and message_buffer is the data that the mic was - * supposedly taken over. */ - -static u32 -gss_verify_mic_v1(struct krb5_ctx *ctx, - struct xdr_buf *message_buffer, struct xdr_netobj *read_token) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - s32 now; - int direction; - u32 seqnum; - unsigned char *ptr = (unsigned char *)read_token->data; - int bodysize; - u8 *cksumkey; - - dprintk("RPC: krb5_read_token\n"); - - if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, - read_token->len)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? 
*/ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != ctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != SEAL_ALG_NONE) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - if (ctx->gk5e->keyed_cksum) - cksumkey = ctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(ctx, ptr, 8, message_buffer, 0, - cksumkey, KG_USAGE_SIGN, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - ctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > ctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, - &direction, &seqnum)) - return GSS_S_FAILURE; - - if ((ctx->initiate && direction != 0xff) || - (!ctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - return GSS_S_COMPLETE; -} - -static u32 -gss_verify_mic_v2(struct krb5_ctx *ctx, - struct xdr_buf *message_buffer, struct xdr_netobj *read_token) +u32 +gss_krb5_verify_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *message_buffer, + struct xdr_netobj *read_token) { + struct crypto_ahash *tfm = ctx->initiate ? 
+ ctx->acceptor_sign : ctx->initiator_sign; char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), - .data = cksumdata}; - time64_t now; + struct xdr_netobj cksumobj = { + .len = ctx->gk5e->cksumlength, + .data = cksumdata, + }; u8 *ptr = read_token->data; - u8 *cksumkey; + __be16 be16_ptr; + time64_t now; u8 flags; int i; - unsigned int cksum_usage; - __be16 be16_ptr; dprintk("RPC: %s\n", __func__); @@ -177,16 +104,8 @@ gss_verify_mic_v2(struct krb5_ctx *ctx, if (ptr[i] != 0xff) return GSS_S_DEFECTIVE_TOKEN; - if (ctx->initiate) { - cksumkey = ctx->acceptor_sign; - cksum_usage = KG_USAGE_ACCEPTOR_SIGN; - } else { - cksumkey = ctx->initiator_sign; - cksum_usage = KG_USAGE_INITIATOR_SIGN; - } - - if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0, - cksumkey, cksum_usage, &cksumobj)) + if (gss_krb5_checksum(tfm, ptr, GSS_KRB5_TOK_HDR_LEN, + message_buffer, 0, &cksumobj)) return GSS_S_FAILURE; if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN, @@ -205,22 +124,3 @@ gss_verify_mic_v2(struct krb5_ctx *ctx, return GSS_S_COMPLETE; } - -u32 -gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, - struct xdr_buf *message_buffer, - struct xdr_netobj *read_token) -{ - struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; - - switch (ctx->enctype) { - default: - BUG(); - case ENCTYPE_DES_CBC_RAW: - case ENCTYPE_DES3_CBC_RAW: - return gss_verify_mic_v1(ctx, message_buffer, read_token); - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_verify_mic_v2(ctx, message_buffer, read_token); - } -} diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 48337687848c..b3e1738ff6bf 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -32,329 +32,14 @@ #include <linux/types.h> #include <linux/jiffies.h> #include <linux/sunrpc/gss_krb5.h> -#include <linux/random.h> #include <linux/pagemap.h> +#include 
"gss_krb5_internal.h" + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static inline int -gss_krb5_padding(int blocksize, int length) -{ - return blocksize - (length % blocksize); -} - -static inline void -gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) -{ - int padding = gss_krb5_padding(blocksize, buf->len - offset); - char *p; - struct kvec *iov; - - if (buf->page_len || buf->tail[0].iov_len) - iov = &buf->tail[0]; - else - iov = &buf->head[0]; - p = iov->iov_base + iov->iov_len; - iov->iov_len += padding; - buf->len += padding; - memset(p, padding, padding); -} - -static inline int -gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) -{ - u8 *ptr; - u8 pad; - size_t len = buf->len; - - if (len <= buf->head[0].iov_len) { - pad = *(u8 *)(buf->head[0].iov_base + len - 1); - if (pad > buf->head[0].iov_len) - return -EINVAL; - buf->head[0].iov_len -= pad; - goto out; - } else - len -= buf->head[0].iov_len; - if (len <= buf->page_len) { - unsigned int last = (buf->page_base + len - 1) - >>PAGE_SHIFT; - unsigned int offset = (buf->page_base + len - 1) - & (PAGE_SIZE - 1); - ptr = kmap_atomic(buf->pages[last]); - pad = *(ptr + offset); - kunmap_atomic(ptr); - goto out; - } else - len -= buf->page_len; - BUG_ON(len > buf->tail[0].iov_len); - pad = *(u8 *)(buf->tail[0].iov_base + len - 1); -out: - /* XXX: NOTE: we do not adjust the page lengths--they represent - * a range of data in the real filesystem page cache, and we need - * to know that range so the xdr code can properly place read data. - * However adjusting the head length, as we do above, is harmless. - * In the case of a request that fits into a single page, the server - * also uses length and head length together to determine the original - * start of the request to copy the request for deferal; so it's - * easier on the server if we adjust head and tail length in tandem. 
- * It's not really a problem that we don't fool with the page and - * tail lengths, though--at worst badly formed xdr might lead the - * server to attempt to parse the padding. - * XXX: Document all these weird requirements for gss mechanism - * wrap/unwrap functions. */ - if (pad > blocksize) - return -EINVAL; - if (buf->len > pad) - buf->len -= pad; - else - return -EINVAL; - return 0; -} - -void -gss_krb5_make_confounder(char *p, u32 conflen) -{ - static u64 i = 0; - u64 *q = (u64 *)p; - - /* rfc1964 claims this should be "random". But all that's really - * necessary is that it be unique. And not even that is necessary in - * our case since our "gssapi" implementation exists only to support - * rpcsec_gss, so we know that the only buffers we will ever encrypt - * already begin with a unique sequence number. Just to hedge my bets - * I'll make a half-hearted attempt at something unique, but ensuring - * uniqueness would mean worrying about atomicity and rollover, and I - * don't care enough. */ - - /* initialize to random value */ - if (i == 0) { - i = get_random_u32(); - i = (i << 32) | get_random_u32(); - } - - switch (conflen) { - case 16: - *q++ = i++; - fallthrough; - case 8: - *q++ = i++; - break; - default: - BUG(); - } -} - -/* Assumptions: the head and tail of inbuf are ours to play with. - * The pages, however, may be real pages in the page cache and we replace - * them with scratch pages from **pages before writing to them. */ -/* XXX: obviously the above should be documentation of wrap interface, - * and shouldn't be in this kerberos-specific file. */ - -/* XXX factor out common code with seal/unseal. 
*/ - -static u32 -gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, - struct xdr_buf *buf, struct page **pages) -{ - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - int blocksize = 0, plainlen; - unsigned char *ptr, *msg_start; - time64_t now; - int headlen; - struct page **tmp_pages; - u32 seq_send; - u8 *cksumkey; - u32 conflen = kctx->gk5e->conflen; - - dprintk("RPC: %s\n", __func__); - - now = ktime_get_real_seconds(); - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - gss_krb5_add_padding(buf, offset, blocksize); - BUG_ON((buf->len - offset) % blocksize); - plainlen = conflen + buf->len - offset; - - headlen = g_token_size(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - - (buf->len - offset); - - ptr = buf->head[0].iov_base + offset; - /* shift data to make room for header. */ - xdr_extend_head(buf, offset, headlen); - - /* XXX Would be cleverer to encrypt while copying. */ - BUG_ON((buf->len - offset - headlen) % blocksize); - - g_make_token_header(&kctx->mech_used, - GSS_KRB5_TOK_HDR_LEN + - kctx->gk5e->cksumlength + plainlen, &ptr); - - - /* ptr now at header described in rfc 1964, section 1.2.1: */ - ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); - ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); - - msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; - - /* - * signalg and sealalg are stored as if they were converted from LE - * to host endian, even though they're opaque pairs of bytes according - * to the RFC. 
- */ - *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); - *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); - ptr[6] = 0xff; - ptr[7] = 0xff; - - gss_krb5_make_confounder(msg_start, conflen); - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - /* XXXJBF: UGH!: */ - tmp_pages = buf->pages; - buf->pages = pages; - if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - buf->pages = tmp_pages; - - memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - - seq_send = atomic_fetch_inc(&kctx->seq_send); - - /* XXX would probably be more efficient to compute checksum - * and encrypt at the same time: */ - if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff, - seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) - return GSS_S_FAILURE; - - if (gss_encrypt_xdr_buf(kctx->enc, buf, - offset + headlen - conflen, pages)) - return GSS_S_FAILURE; - - return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; -} - -static u32 -gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align) -{ - int signalg; - int sealalg; - char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; - struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), - .data = cksumdata}; - time64_t now; - int direction; - s32 seqnum; - unsigned char *ptr; - int bodysize; - void *data_start, *orig_start; - int data_len; - int blocksize; - u32 conflen = kctx->gk5e->conflen; - int crypt_offset; - u8 *cksumkey; - unsigned int saved_len = buf->len; - - dprintk("RPC: gss_unwrap_kerberos\n"); - - ptr = (u8 *)buf->head[0].iov_base + offset; - if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, - len - offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || - (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) - return GSS_S_DEFECTIVE_TOKEN; - - /* XXX sanity-check bodysize?? 
*/ - - /* get the sign and seal algorithms */ - - signalg = ptr[2] + (ptr[3] << 8); - if (signalg != kctx->gk5e->signalg) - return GSS_S_DEFECTIVE_TOKEN; - - sealalg = ptr[4] + (ptr[5] << 8); - if (sealalg != kctx->gk5e->sealalg) - return GSS_S_DEFECTIVE_TOKEN; - - if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) - return GSS_S_DEFECTIVE_TOKEN; - - /* - * Data starts after token header and checksum. ptr points - * to the beginning of the token header - */ - crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - - (unsigned char *)buf->head[0].iov_base; - - buf->len = len; - if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) - return GSS_S_DEFECTIVE_TOKEN; - - if (kctx->gk5e->keyed_cksum) - cksumkey = kctx->cksum; - else - cksumkey = NULL; - - if (make_checksum(kctx, ptr, 8, buf, crypt_offset, - cksumkey, KG_USAGE_SEAL, &md5cksum)) - return GSS_S_FAILURE; - - if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, - kctx->gk5e->cksumlength)) - return GSS_S_BAD_SIG; - - /* it got through unscathed. Make sure the context is unexpired */ - - now = ktime_get_real_seconds(); - - if (now > kctx->endtime) - return GSS_S_CONTEXT_EXPIRED; - - /* do sequencing checks */ - - if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, - ptr + 8, &direction, &seqnum)) - return GSS_S_BAD_SIG; - - if ((kctx->initiate && direction != 0xff) || - (!kctx->initiate && direction != 0)) - return GSS_S_BAD_SIG; - - /* Copy the data back to the right position. XXX: Would probably be - * better to copy and encrypt at the same time. 
*/ - - blocksize = crypto_sync_skcipher_blocksize(kctx->enc); - data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + - conflen; - orig_start = buf->head[0].iov_base + offset; - data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; - memmove(orig_start, data_start, data_len); - buf->head[0].iov_len -= (data_start - orig_start); - buf->len = len - (data_start - orig_start); - - if (gss_krb5_remove_padding(buf, blocksize)) - return GSS_S_DEFECTIVE_TOKEN; - - /* slack must include room for krb5 padding */ - *slack = XDR_QUADLEN(saved_len - buf->len); - /* The GSS blob always precedes the RPC message payload */ - *align = *slack; - return GSS_S_COMPLETE; -} - /* * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need * to do more than that, we shift repeatedly. Kevin Coffman reports @@ -405,9 +90,9 @@ static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift) _rotate_left(&subbuf, shift); } -static u32 -gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, - struct xdr_buf *buf, struct page **pages) +u32 +gss_krb5_wrap_v2(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages) { u8 *ptr; time64_t now; @@ -418,9 +103,6 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, dprintk("RPC: %s\n", __func__); - if (kctx->gk5e->encrypt_v2 == NULL) - return GSS_S_FAILURE; - /* make room for gss token header */ if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN)) return GSS_S_FAILURE; @@ -448,7 +130,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, be64ptr = (__be64 *)be16ptr; *be64ptr = cpu_to_be64(atomic64_fetch_inc(&kctx->seq_send64)); - err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); + err = (*kctx->gk5e->encrypt)(kctx, offset, buf, pages); if (err) return err; @@ -456,10 +138,10 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; } -static u32 -gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, - struct xdr_buf *buf, unsigned int *slack, - unsigned int *align) +u32 +gss_krb5_unwrap_v2(struct krb5_ctx *kctx, int offset, int len, + struct xdr_buf *buf, unsigned int *slack, + unsigned int *align) { time64_t now; u8 *ptr; @@ -473,9 +155,6 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, dprintk("RPC: %s\n", __func__); - if (kctx->gk5e->decrypt_v2 == NULL) - return GSS_S_FAILURE; - ptr = buf->head[0].iov_base + offset; if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP) @@ -505,8 +184,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, if (rrc != 0) rotate_left(offset + 16, buf, rrc); - err = (*kctx->gk5e->decrypt_v2)(kctx, offset, len, buf, - &headskip, &tailskip); + err = (*kctx->gk5e->decrypt)(kctx, offset, len, buf, + &headskip, &tailskip); if (err) return GSS_S_FAILURE; @@ -556,41 +235,3 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, *slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip); return GSS_S_COMPLETE; } - -u32 -gss_wrap_kerberos(struct gss_ctx *gctx, int offset, - struct xdr_buf *buf, struct page **pages) -{ - struct krb5_ctx *kctx = gctx->internal_ctx_id; - - switch (kctx->enctype) { - default: - BUG(); - case ENCTYPE_DES_CBC_RAW: - case ENCTYPE_DES3_CBC_RAW: - return gss_wrap_kerberos_v1(kctx, offset, buf, pages); - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_wrap_kerberos_v2(kctx, offset, buf, pages); - } -} - -u32 -gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, - int len, struct xdr_buf *buf) -{ - struct krb5_ctx *kctx = gctx->internal_ctx_id; - - switch (kctx->enctype) { - default: - BUG(); - case ENCTYPE_DES_CBC_RAW: - case ENCTYPE_DES3_CBC_RAW: - return gss_unwrap_kerberos_v1(kctx, offset, len, buf, - &gctx->slack, &gctx->align); - case ENCTYPE_AES128_CTS_HMAC_SHA1_96: - case 
ENCTYPE_AES256_CTS_HMAC_SHA1_96: - return gss_unwrap_kerberos_v2(kctx, offset, len, buf, - &gctx->slack, &gctx->align); - } -} diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index fae632da1058..c84d0cf61980 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/oid_registry.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/sunrpc/gss_asn1.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index d79f12c2550a..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { - kfree(oa->data); - return -ENOMEM; + err = -ENOMEM; + goto free_oa; } oa->data[0].option.data = CREDS_VALUE; @@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* option buffer */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } length = be32_to_cpup(p); p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } if (length == sizeof(CREDS_VALUE) && memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) { /* We have creds here. 
parse them */ err = gssx_dec_linux_creds(xdr, creds); if (err) - return err; + goto free_creds; oa->data[0].value.len = 1; /* presence */ } else { /* consume uninteresting buffer */ err = gssx_dec_buffer(xdr, &dummy); if (err) - return err; + goto free_creds; } } return 0; + +free_creds: + kfree(creds); +free_oa: + kfree(oa->data); + oa->data = NULL; + return err; } static int gssx_dec_status(struct xdr_stream *xdr, @@ -783,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -833,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index acb822b23af1..a8ec30759a18 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -71,12 +71,11 @@ struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; - /* save a pointer to the beginning of the encoded verifier, - * for use in encryption/checksumming in svcauth_gss_release: */ - __be32 *verf_start; + u32 gsd_databody_offset; struct rsc *rsci; /* for temporary results */ + __be32 gsd_seq_num; u8 gsd_scratch[GSS_SCRATCH_SIZE]; }; @@ -258,11 +257,11 @@ static int rsi_parse(struct cache_detail *cd, rsii.h.flags = 0; /* expiry */ - expiry = get_expiry(&mesg); - status = -EINVAL; - if (expiry == 0) + status = get_expiry(&mesg, &expiry); + if (status) goto out; + status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) @@ -484,11 +483,11 @@ static 
int rsc_parse(struct cache_detail *cd, rsci.h.flags = 0; /* expiry */ - expiry = get_expiry(&mesg); - status = -EINVAL; - if (expiry == 0) + status = get_expiry(&mesg, &expiry); + if (status) goto out; + status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; @@ -692,78 +691,49 @@ alreadyseen: goto out; } -static inline u32 round_up_to_quad(u32 i) -{ - return (i + 3 ) & ~3; -} - -static inline int -svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o) -{ - int l; - - if (argv->iov_len < 4) - return -1; - o->len = svc_getnl(argv); - l = round_up_to_quad(o->len); - if (argv->iov_len < l) - return -1; - o->data = argv->iov_base; - argv->iov_base += l; - argv->iov_len -= l; - return 0; -} - -static inline int -svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) -{ - u8 *p; - - if (resv->iov_len + 4 > PAGE_SIZE) - return -1; - svc_putnl(resv, o->len); - p = resv->iov_base + resv->iov_len; - resv->iov_len += round_up_to_quad(o->len); - if (resv->iov_len > PAGE_SIZE) - return -1; - memcpy(p, o->data, o->len); - memset(p + o->len, 0, round_up_to_quad(o->len) - o->len); - return 0; -} - /* - * Verify the checksum on the header and return SVC_OK on success. - * Otherwise, return SVC_DROP (in the case of a bad sequence number) - * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat. + * Decode and verify a Call's verifier field. For RPC_AUTH_GSS Calls, + * the body of this field contains a variable length checksum. + * + * GSS-specific auth_stat values are mandated by RFC 2203 Section + * 5.3.3.3. 
*/ static int -gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, - __be32 *rpcstart, struct rpc_gss_wire_cred *gc) +svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, + __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct gss_ctx *ctx_id = rsci->mechctx; + u32 flavor, maj_stat; struct xdr_buf rpchdr; struct xdr_netobj checksum; - u32 flavor = 0; - struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec iov; - /* data to compute the checksum over: */ + /* + * Compute the checksum of the incoming Call from the + * XID field to credential field: + */ iov.iov_base = rpcstart; - iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; + iov.iov_len = (u8 *)xdr->p - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); - rqstp->rq_auth_stat = rpc_autherr_badverf; - if (argv->iov_len < 4) - return SVC_DENIED; - flavor = svc_getnl(argv); - if (flavor != RPC_AUTH_GSS) + /* Call's verf field: */ + if (xdr_stream_decode_opaque_auth(xdr, &flavor, + (void **)&checksum.data, + &checksum.len) < 0) { + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; - if (svc_safe_getnetobj(argv, &checksum)) + } + if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; + } - if (rqstp->rq_deferred) /* skip verification of revisited request */ + if (rqstp->rq_deferred) return SVC_OK; - if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { + maj_stat = gss_verify_mic(ctx_id, &rpchdr, &checksum); + if (maj_stat != GSS_S_COMPLETE) { + trace_rpcgss_svc_mic(rqstp, maj_stat); rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } @@ -778,54 +748,36 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, return SVC_OK; } -static int -gss_write_null_verf(struct svc_rqst *rqstp) -{ - __be32 *p; - - svc_putnl(rqstp->rq_res.head, RPC_AUTH_NULL); - p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; - /* don't 
really need to check if head->iov_len > PAGE_SIZE ... */ - *p++ = 0; - if (!xdr_ressize_check(rqstp, p)) - return -1; - return 0; -} - -static int -gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) +/* + * Construct and encode a Reply's verifier field. The verifier's body + * field contains a variable-length checksum of the GSS sequence + * number. + */ +static bool +svcauth_gss_encode_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { - __be32 *xdr_seq; + struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 maj_stat; struct xdr_buf verf_data; - struct xdr_netobj mic; - __be32 *p; + struct xdr_netobj checksum; struct kvec iov; - int err = -1; - svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); - xdr_seq = kmalloc(4, GFP_KERNEL); - if (!xdr_seq) - return -ENOMEM; - *xdr_seq = htonl(seq); - - iov.iov_base = xdr_seq; - iov.iov_len = 4; + gsd->gsd_seq_num = cpu_to_be32(seq); + iov.iov_base = &gsd->gsd_seq_num; + iov.iov_len = XDR_UNIT; xdr_buf_from_iov(&iov, &verf_data); - p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; - mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); + + checksum.data = gsd->gsd_scratch; + maj_stat = gss_get_mic(ctx_id, &verf_data, &checksum); if (maj_stat != GSS_S_COMPLETE) - goto out; - *p++ = htonl(mic.len); - memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); - p += XDR_QUADLEN(mic.len); - if (!xdr_ressize_check(rqstp, p)) - goto out; - err = 0; -out: - kfree(xdr_seq); - return err; + goto bad_mic; + + return xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_GSS, + checksum.data, checksum.len) > 0; + +bad_mic: + trace_rpcgss_svc_get_mic(rqstp, maj_stat); + return false; } struct gss_domain { @@ -891,142 +843,125 @@ out: } EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); -static inline int -read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) -{ - __be32 raw; - int status; - - status = read_bytes_from_xdr_buf(buf, base, &raw, 
sizeof(*obj)); - if (status) - return status; - *obj = ntohl(raw); - return 0; -} - -/* It would be nice if this bit of code could be shared with the client. - * Obstacles: - * The client shouldn't malloc(), would have to pass in own memory. - * The server uses base of head iovec as read pointer, while the - * client uses separate pointer. */ -static int -unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +/* + * RFC 2203, Section 5.3.2.2 + * + * struct rpc_gss_integ_data { + * opaque databody_integ<>; + * opaque checksum<>; + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + */ +static noinline_for_stack int +svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct gss_svc_data *gsd = rqstp->rq_auth_data; - u32 integ_len, rseqno, maj_stat; - struct xdr_netobj mic; - struct xdr_buf integ_buf; - - /* NFS READ normally uses splice to send data in-place. However - * the data in cache can change after the reply's MIC is computed - * but before the RPC reply is sent. To prevent the client from - * rejecting the server-computed MIC in this somewhat rare case, - * do not use splice with the GSS integrity service. - */ - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + u32 len, offset, seq_num, maj_stat; + struct xdr_buf *buf = xdr->buf; + struct xdr_buf databody_integ; + struct xdr_netobj checksum; /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; - integ_len = svc_getnl(&buf->head[0]); - if (integ_len & 3) + if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; - if (integ_len > buf->len) + if (len & 3) goto unwrap_failed; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + offset = xdr_stream_pos(xdr); + if (xdr_buf_subsegment(buf, &databody_integ, offset, len)) goto unwrap_failed; - /* copy out mic... 
*/ - if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + /* + * The xdr_stream now points to the @seq_num field. The next + * XDR data item is the @arg field, which contains the clear + * text RPC program payload. The checksum, which follows the + * @arg field, is located and decoded without updating the + * xdr_stream. + */ + + offset += len; + if (xdr_decode_word(buf, offset, &checksum.len)) goto unwrap_failed; - if (mic.len > sizeof(gsd->gsd_scratch)) + if (checksum.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; - mic.data = gsd->gsd_scratch; - if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + checksum.data = gsd->gsd_scratch; + if (read_bytes_from_xdr_buf(buf, offset + XDR_UNIT, checksum.data, + checksum.len)) goto unwrap_failed; - maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); + + maj_stat = gss_verify_mic(ctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; - rseqno = svc_getnl(&buf->head[0]); - if (rseqno != seq) + + /* The received seqno is protected by the checksum. */ + if (xdr_stream_decode_u32(xdr, &seq_num) < 0) + goto unwrap_failed; + if (seq_num != seq) goto bad_seqno; - /* trim off the mic and padding at the end before returning */ - xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); + + xdr_truncate_decode(xdr, XDR_UNIT + checksum.len); return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: - trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); return -EINVAL; } -static inline int -total_buf_len(struct xdr_buf *buf) -{ - return buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len; -} - -static void -fix_priv_head(struct xdr_buf *buf, int pad) -{ - if (buf->page_len == 0) { - /* We need to adjust head and buf->len in tandem in this - * case to make svc_defer() work--it finds the original - * buffer start using buf->len - buf->head[0].iov_len. 
*/ - buf->head[0].iov_len -= pad; - } -} - -static int -unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +/* + * RFC 2203, Section 5.3.2.3 + * + * struct rpc_gss_priv_data { + * opaque databody_priv<> + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + */ +static noinline_for_stack int +svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { - u32 priv_len, maj_stat; - int pad, remaining_len, offset; - u32 rseqno; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + u32 len, maj_stat, seq_num, offset; + struct xdr_buf *buf = xdr->buf; + unsigned int saved_len; - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); - - priv_len = svc_getnl(&buf->head[0]); + if (xdr_stream_decode_u32(xdr, &len) < 0) + goto unwrap_failed; if (rqstp->rq_deferred) { /* Already decrypted last time through! The sequence number * check at out_seq is unnecessary but harmless: */ goto out_seq; } - /* buf->len is the number of bytes from the original start of the - * request to the end, where head[0].iov_len is just the bytes - * not yet read from the head, so these two values are different: */ - remaining_len = total_buf_len(buf); - if (priv_len > remaining_len) + if (len > xdr_stream_remaining(xdr)) goto unwrap_failed; - pad = remaining_len - priv_len; - buf->len -= pad; - fix_priv_head(buf, pad); - - maj_stat = gss_unwrap(ctx, 0, priv_len, buf); - pad = priv_len - buf->len; - /* The upper layers assume the buffer is aligned on 4-byte boundaries. - * In the krb5p case, at least, the data ends up offset, so we need to - * move it around. */ - /* XXX: This is very inefficient. It would be better to either do - * this while we encrypt, or maybe in the receive code, if we can peak - * ahead and work out the service and mechanism there. 
*/ - offset = xdr_pad_size(buf->head[0].iov_len); - if (offset) { - buf->buflen = RPCSVC_MAXPAYLOAD; - xdr_shift_buf(buf, offset); - fix_priv_head(buf, pad); - } + offset = xdr_stream_pos(xdr); + + saved_len = buf->len; + maj_stat = gss_unwrap(ctx, offset, offset + len, buf); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; + xdr->nwords -= XDR_QUADLEN(saved_len - buf->len); + out_seq: - rseqno = svc_getnl(&buf->head[0]); - if (rseqno != seq) + /* gss_unwrap() decrypted the sequence number. */ + if (xdr_stream_decode_u32(xdr, &seq_num) < 0) + goto unwrap_failed; + if (seq_num != seq) goto bad_seqno; return 0; @@ -1034,14 +969,14 @@ unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: - trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_unwrap: trace_rpcgss_svc_unwrap(rqstp, maj_stat); return -EINVAL; } -static int +static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; @@ -1071,87 +1006,38 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) return SVC_OK; } -static inline int -gss_write_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, - struct xdr_netobj *out_handle, int *major_status) +static bool +svcauth_gss_proc_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, + struct xdr_netobj *out_handle, int *major_status, + u32 seq_num) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rsc *rsci; - int rc; + bool rc; if (*major_status != GSS_S_COMPLETE) - return gss_write_null_verf(rqstp); + goto null_verifier; rsci = gss_svc_searchbyctx(cd, out_handle); if (rsci == NULL) { *major_status = GSS_S_NO_CONTEXT; - return gss_write_null_verf(rqstp); + goto null_verifier; } - rc = gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN); + + rc = svcauth_gss_encode_verf(rqstp, rsci->mechctx, seq_num); cache_put(&rsci->h, cd); return rc; -} - -static inline int -gss_read_common_verf(struct 
rpc_gss_wire_cred *gc, - struct kvec *argv, __be32 *authp, - struct xdr_netobj *in_handle) -{ - /* Read the verifier; should be NULL: */ - *authp = rpc_autherr_badverf; - if (argv->iov_len < 2 * 4) - return SVC_DENIED; - if (svc_getnl(argv) != RPC_AUTH_NULL) - return SVC_DENIED; - if (svc_getnl(argv) != 0) - return SVC_DENIED; - /* Martial context handle and token for upcall: */ - *authp = rpc_autherr_badcred; - if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) - return SVC_DENIED; - if (dup_netobj(in_handle, &gc->gc_ctx)) - return SVC_CLOSE; - *authp = rpc_autherr_badverf; - - return 0; -} - -static inline int -gss_read_verf(struct rpc_gss_wire_cred *gc, - struct kvec *argv, __be32 *authp, - struct xdr_netobj *in_handle, - struct xdr_netobj *in_token) -{ - struct xdr_netobj tmpobj; - int res; - - res = gss_read_common_verf(gc, argv, authp, in_handle); - if (res) - return res; - - if (svc_safe_getnetobj(argv, &tmpobj)) { - kfree(in_handle->data); - return SVC_DENIED; - } - if (dup_netobj(in_token, &tmpobj)) { - kfree(in_handle->data); - return SVC_CLOSE; - } - return 0; +null_verifier: + return xdr_stream_encode_opaque_auth(xdr, RPC_AUTH_NULL, NULL, 0) > 0; } static void gss_free_in_token_pages(struct gssp_in_token *in_token) { - u32 inlen; int i; i = 0; - inlen = in_token->page_len; - while (inlen) { - if (in_token->pages[i]) - put_page(in_token->pages[i]); - inlen -= inlen > PAGE_SIZE ? 
PAGE_SIZE : inlen; - } - + while (in_token->pages[i]) + put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } @@ -1161,40 +1047,43 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { - struct kvec *argv = &rqstp->rq_arg.head[0]; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; unsigned int length, pgto_offs, pgfrom_offs; - int pages, i, res, pgto, pgfrom; - size_t inlen, to_offs, from_offs; + int pages, i, pgto, pgfrom; + size_t to_offs, from_offs; + u32 inlen; - res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle); - if (res) - return res; + if (dup_netobj(in_handle, &gc->gc_ctx)) + return SVC_CLOSE; - inlen = svc_getnl(argv); - if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) { - kfree(in_handle->data); - return SVC_DENIED; - } + /* + * RFC 2203 Section 5.2.2 + * + * struct rpc_gss_init_arg { + * opaque gss_token<>; + * }; + */ + if (xdr_stream_decode_u32(xdr, &inlen) < 0) + goto out_denied_free; + if (inlen > xdr_stream_remaining(xdr)) + goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); - in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); - if (!in_token->pages) { - kfree(in_handle->data); - return SVC_DENIED; - } + in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!in_token->pages) + goto out_denied_free; in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { - kfree(in_handle->data); gss_free_in_token_pages(in_token); - return SVC_DENIED; + goto out_denied_free; } } - length = min_t(unsigned int, inlen, argv->iov_len); - memcpy(page_address(in_token->pages[0]), argv->iov_base, length); + length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); + memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; @@ -1217,26 +1106,41 @@ static int 
gss_read_proxy_verf(struct svc_rqst *rqstp, inlen -= length; } return 0; + +out_denied_free: + kfree(in_handle->data); + return SVC_DENIED; } -static inline int -gss_write_resv(struct kvec *resv, size_t size_limit, - struct xdr_netobj *out_handle, struct xdr_netobj *out_token, - int major_status, int minor_status) -{ - if (resv->iov_len + 4 > size_limit) - return -1; - svc_putnl(resv, RPC_SUCCESS); - if (svc_safe_putnetobj(resv, out_handle)) - return -1; - if (resv->iov_len + 3 * 4 > size_limit) - return -1; - svc_putnl(resv, major_status); - svc_putnl(resv, minor_status); - svc_putnl(resv, GSS_SEQ_WIN); - if (svc_safe_putnetobj(resv, out_token)) - return -1; - return 0; +/* + * RFC 2203, Section 5.2.3.1. + * + * struct rpc_gss_init_res { + * opaque handle<>; + * unsigned int gss_major; + * unsigned int gss_minor; + * unsigned int seq_window; + * opaque gss_token<>; + * }; + */ +static bool +svcxdr_encode_gss_init_res(struct xdr_stream *xdr, + struct xdr_netobj *handle, + struct xdr_netobj *gss_token, + unsigned int major_status, + unsigned int minor_status, u32 seq_num) +{ + if (xdr_stream_encode_opaque(xdr, handle->data, handle->len) < 0) + return false; + if (xdr_stream_encode_u32(xdr, major_status) < 0) + return false; + if (xdr_stream_encode_u32(xdr, minor_status) < 0) + return false; + if (xdr_stream_encode_u32(xdr, seq_num) < 0) + return false; + if (xdr_stream_encode_opaque(xdr, gss_token->data, gss_token->len) < 0) + return false; + return true; } /* @@ -1246,20 +1150,44 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. 
*/ -static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc) +static int +svcauth_gss_legacy_init(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc) { - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct rsi *rsip, rsikey; + __be32 *p; + u32 len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); - ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat, - &rsikey.in_handle, &rsikey.in_token); - if (ret) - return ret; + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + return SVC_CLOSE; + + /* + * RFC 2203 Section 5.2.2 + * + * struct rpc_gss_init_arg { + * opaque gss_token<>; + * }; + */ + if (xdr_stream_decode_u32(xdr, &len) < 0) { + kfree(rsikey.in_handle.data); + return SVC_DENIED; + } + p = xdr_inline_decode(xdr, len); + if (!p) { + kfree(rsikey.in_handle.data); + return SVC_DENIED; + } + rsikey.in_token.data = kmalloc(len, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(rsikey.in_token.data)) { + kfree(rsikey.in_handle.data); + return SVC_CLOSE; + } + memcpy(rsikey.in_token.data, p, len); + rsikey.in_token.len = len; /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(sn->rsi_cache, &rsikey); @@ -1271,13 +1199,14 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, return SVC_CLOSE; ret = SVC_CLOSE; - /* Got an answer to the upcall; use it: */ - if (gss_write_init_verf(sn->rsc_cache, rqstp, - &rsip->out_handle, &rsip->major_status)) + if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &rsip->out_handle, + &rsip->major_status, GSS_SEQ_WIN)) + goto out; + if (!svcxdr_set_accept_stat(rqstp)) goto out; - if (gss_write_resv(resv, PAGE_SIZE, - &rsip->out_handle, &rsip->out_token, - rsip->major_status, rsip->minor_status)) + if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &rsip->out_handle, + &rsip->out_token, rsip->major_status, + rsip->minor_status, 
GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; @@ -1361,7 +1290,6 @@ out: static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { - struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj cli_handle; struct gssp_upcall_data ud; uint64_t handle; @@ -1399,13 +1327,14 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, goto out; } - /* Got an answer to the upcall; use it: */ - if (gss_write_init_verf(sn->rsc_cache, rqstp, - &cli_handle, &ud.major_status)) + if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &cli_handle, + &ud.major_status, GSS_SEQ_WIN)) goto out; - if (gss_write_resv(resv, PAGE_SIZE, - &cli_handle, &ud.out_token, - ud.major_status, ud.minor_status)) + if (!svcxdr_set_accept_stat(rqstp)) + goto out; + if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &cli_handle, + &ud.out_token, ud.major_status, + ud.minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; @@ -1442,6 +1371,31 @@ static bool use_gss_proxy(struct net *net) return sn->use_gss_proxy; } +static noinline_for_stack int +svcauth_gss_proc_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) +{ + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + u32 flavor, len; + void *body; + + /* Call's verf field: */ + if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) + return SVC_GARBAGE; + if (flavor != RPC_AUTH_NULL || len != 0) { + rqstp->rq_auth_stat = rpc_autherr_badverf; + return SVC_DENIED; + } + + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + return SVC_DENIED; + } + + if (!use_gss_proxy(SVC_NET(rqstp))) + return svcauth_gss_legacy_init(rqstp, gc); + return svcauth_gss_proxy_init(rqstp, gc); +} + #ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, @@ -1524,6 +1478,56 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) clear_gssp_clnt(sn); } } + +static ssize_t read_gss_krb5_enctypes(struct file *file, char 
__user *buf, + size_t count, loff_t *ppos) +{ + struct rpcsec_gss_oid oid = { + .len = 9, + .data = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", + }; + struct gss_api_mech *mech; + ssize_t ret; + + mech = gss_mech_get_by_OID(&oid); + if (!mech) + return 0; + if (!mech->gm_upcall_enctypes) { + gss_mech_put(mech); + return 0; + } + + ret = simple_read_from_buffer(buf, count, ppos, + mech->gm_upcall_enctypes, + strlen(mech->gm_upcall_enctypes)); + gss_mech_put(mech); + return ret; +} + +static const struct proc_ops gss_krb5_enctypes_proc_ops = { + .proc_open = nonseekable_open, + .proc_read = read_gss_krb5_enctypes, +}; + +static int create_krb5_enctypes_proc_entry(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + sn->gss_krb5_enctypes = + proc_create_data("gss_krb5_enctypes", S_IFREG | 0444, + sn->proc_net_rpc, &gss_krb5_enctypes_proc_ops, + net); + return sn->gss_krb5_enctypes ? 0 : -ENOMEM; +} + +static void destroy_krb5_enctypes_proc_entry(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (sn->gss_krb5_enctypes) + remove_proc_entry("gss_krb5_enctypes", sn->proc_net_rpc); +} + #else /* CONFIG_PROC_FS */ static int create_use_gss_proxy_proc_entry(struct net *net) @@ -1533,83 +1537,129 @@ static int create_use_gss_proxy_proc_entry(struct net *net) static void destroy_use_gss_proxy_proc_entry(struct net *net) {} +static int create_krb5_enctypes_proc_entry(struct net *net) +{ + return 0; +} + +static void destroy_krb5_enctypes_proc_entry(struct net *net) {} + #endif /* CONFIG_PROC_FS */ /* - * Accept an rpcsec packet. - * If context establishment, punt to user space - * If data exchange, verify/decrypt - * If context destruction, handle here - * In the context establishment and destruction case we encode - * response here and return SVC_COMPLETE. + * The Call's credential body should contain a struct rpc_gss_cred_t. 
+ * + * RFC 2203 Section 5 + * + * struct rpc_gss_cred_t { + * union switch (unsigned int version) { + * case RPCSEC_GSS_VERS_1: + * struct { + * rpc_gss_proc_t gss_proc; + * unsigned int seq_num; + * rpc_gss_service_t service; + * opaque handle<>; + * } rpc_gss_cred_vers_1_t; + * } + * }; */ -static int +static bool +svcauth_gss_decode_credbody(struct xdr_stream *xdr, + struct rpc_gss_wire_cred *gc, + __be32 **rpcstart) +{ + ssize_t handle_len; + u32 body_len; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT); + if (!p) + return false; + /* + * start of rpc packet is 7 u32's back from here: + * xid direction rpcversion prog vers proc flavour + */ + *rpcstart = p - 7; + body_len = be32_to_cpup(p); + if (body_len > RPC_MAX_AUTH_SIZE) + return false; + + /* struct rpc_gss_cred_t */ + if (xdr_stream_decode_u32(xdr, &gc->gc_v) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &gc->gc_proc) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &gc->gc_seq) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &gc->gc_svc) < 0) + return false; + handle_len = xdr_stream_decode_opaque_inline(xdr, + (void **)&gc->gc_ctx.data, + body_len); + if (handle_len < 0) + return false; + if (body_len != XDR_UNIT * 5 + xdr_align_size(handle_len)) + return false; + + gc->gc_ctx.len = handle_len; + return true; +} + +/** + * svcauth_gss_accept - Decode and validate incoming RPC_AUTH_GSS credential + * @rqstp: RPC transaction + * + * Return values: + * %SVC_OK: Success + * %SVC_COMPLETE: GSS context lifetime event + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or verifier + * %SVC_CLOSE: Temporary failure + * + * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). 
+ */ +static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; - u32 crlen; struct gss_svc_data *svcdata = rqstp->rq_auth_data; + __be32 *rpcstart; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; - __be32 *rpcstart; - __be32 *reject_stat = resv->iov_base + resv->iov_len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - rqstp->rq_auth_stat = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_failed; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; - svcdata->verf_start = NULL; + svcdata->gsd_databody_offset = 0; svcdata->rsci = NULL; gc = &svcdata->clcred; - /* start of rpc packet is 7 u32's back from here: - * xid direction rpcversion prog vers proc flavour - */ - rpcstart = argv->iov_base; - rpcstart -= 7; - - /* credential is: - * version(==1), proc(0,1,2,3), seq, service (1,2,3), handle - * at least 5 u32s, and is preceded by length, so that makes 6. 
- */ - - if (argv->iov_len < 5 * 4) - goto auth_err; - crlen = svc_getnl(argv); - if (svc_getnl(argv) != RPC_GSS_VERSION) - goto auth_err; - gc->gc_proc = svc_getnl(argv); - gc->gc_seq = svc_getnl(argv); - gc->gc_svc = svc_getnl(argv); - if (svc_safe_getnetobj(argv, &gc->gc_ctx)) + rqstp->rq_auth_stat = rpc_autherr_badcred; + if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; - if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4) + if (gc->gc_v != RPC_GSS_VERSION) goto auth_err; - if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) - goto auth_err; - - rqstp->rq_auth_stat = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: - if (use_gss_proxy(SVC_NET(rqstp))) - return svcauth_gss_proxy_init(rqstp, gc); - else - return svcauth_gss_legacy_init(rqstp, gc); - case RPC_GSS_PROC_DATA: + if (rqstp->rq_proc != 0) + goto auth_err; + return svcauth_gss_proc_init(rqstp, gc); case RPC_GSS_PROC_DESTROY: - /* Look up the context, and check the verifier: */ + if (rqstp->rq_proc != 0) + goto auth_err; + fallthrough; + case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; - switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) { + switch (svcauth_gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: @@ -1619,6 +1669,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp) } break; default: + if (rqstp->rq_proc != 0) + goto auth_err; rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } @@ -1626,19 +1678,20 @@ svcauth_gss_accept(struct svc_rqst *rqstp) /* now act upon the command: */ switch (gc->gc_proc) { case RPC_GSS_PROC_DESTROY: - if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; /* Delete the entry from the 
cache_list and call cache_put */ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); - if (resv->iov_len + 4 > PAGE_SIZE) - goto drop; - svc_putnl(resv, RPC_SUCCESS); goto complete; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; - svcdata->verf_start = resv->iov_base + resv->iov_len; - if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; + svcdata->gsd_databody_offset = xdr_stream_pos(&rqstp->rq_res_stream); rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); rqstp->rq_auth_stat = rpc_autherr_badcred; @@ -1646,22 +1699,20 @@ svcauth_gss_accept(struct svc_rqst *rqstp) case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: - /* placeholders for length and seq. number: */ - svc_putnl(resv, 0); - svc_putnl(resv, 0); - if (unwrap_integ_data(rqstp, &rqstp->rq_arg, - gc->gc_seq, rsci->mechctx)) + /* placeholders for body length and seq. number: */ + xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); + if (svcauth_gss_unwrap_integ(rqstp, gc->gc_seq, + rsci->mechctx)) goto garbage_args; - rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE; + svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE); break; case RPC_GSS_SVC_PRIVACY: - /* placeholders for length and seq. number: */ - svc_putnl(resv, 0); - svc_putnl(resv, 0); - if (unwrap_priv_data(rqstp, &rqstp->rq_arg, - gc->gc_seq, rsci->mechctx)) + /* placeholders for body length and seq. 
number: */ + xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); + if (svcauth_gss_unwrap_priv(rqstp, gc->gc_seq, + rsci->mechctx)) goto garbage_args; - rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2; + svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE * 2); break; default: goto auth_err; @@ -1680,8 +1731,7 @@ garbage_args: ret = SVC_GARBAGE; goto out; auth_err: - /* Restore write pointer to its original value: */ - xdr_ressize_check(rqstp, reject_stat); + xdr_truncate_encode(&rqstp->rq_res_stream, XDR_UNIT * 2); ret = SVC_DENIED; goto out; complete: @@ -1695,104 +1745,125 @@ out: return ret; } -static __be32 * -svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd) +static u32 +svcauth_gss_prepare_to_wrap(struct svc_rqst *rqstp, struct gss_svc_data *gsd) { - __be32 *p; - u32 verf_len; + u32 offset; + + /* Release can be called twice, but we only wrap once. */ + offset = gsd->gsd_databody_offset; + gsd->gsd_databody_offset = 0; - p = gsd->verf_start; - gsd->verf_start = NULL; + /* AUTH_ERROR replies are not wrapped. 
*/ + if (rqstp->rq_auth_stat != rpc_auth_ok) + return 0; - /* If the reply stat is nonzero, don't wrap: */ - if (*(p-1) != rpc_success) - return NULL; - /* Skip the verifier: */ - p += 1; - verf_len = ntohl(*p++); - p += XDR_QUADLEN(verf_len); - /* move accept_stat to right place: */ - memcpy(p, p + 2, 4); - /* Also don't wrap if the accept stat is nonzero: */ - if (*p != rpc_success) { - resbuf->head[0].iov_len -= 2 * 4; - return NULL; - } - p++; - return p; + /* Also don't wrap if the accept_stat is nonzero: */ + if (*rqstp->rq_accept_statp != rpc_success) + return 0; + + return offset; } -static inline int -svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) +/* + * RFC 2203, Section 5.3.2.2 + * + * struct rpc_gss_integ_data { + * opaque databody_integ<>; + * opaque checksum<>; + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + * + * The RPC Reply message has already been XDR-encoded. rq_res_stream + * is now positioned so that the checksum can be written just past + * the RPC Reply message. 
+ */ +static int svcauth_gss_wrap_integ(struct svc_rqst *rqstp) { - struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct gss_svc_data *gsd = rqstp->rq_auth_data; + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rpc_gss_wire_cred *gc = &gsd->clcred; - struct xdr_buf *resbuf = &rqstp->rq_res; - struct xdr_buf integ_buf; - struct xdr_netobj mic; - struct kvec *resv; - __be32 *p; - int integ_offset, integ_len; - int stat = -EINVAL; + struct xdr_buf *buf = xdr->buf; + struct xdr_buf databody_integ; + struct xdr_netobj checksum; + u32 offset, maj_stat; - p = svcauth_gss_prepare_to_wrap(resbuf, gsd); - if (p == NULL) - goto out; - integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; - integ_len = resbuf->len - integ_offset; - if (integ_len & 3) + offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); + if (!offset) goto out; - *p++ = htonl(integ_len); - *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) { - WARN_ON_ONCE(1); - goto out_err; - } - if (resbuf->tail[0].iov_base == NULL) { - if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) - goto out_err; - resbuf->tail[0].iov_base = resbuf->head[0].iov_base - + resbuf->head[0].iov_len; - resbuf->tail[0].iov_len = 0; - } - resv = &resbuf->tail[0]; - mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; - if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) - goto out_err; - svc_putnl(resv, mic.len); - memset(mic.data + mic.len, 0, - round_up_to_quad(mic.len) - mic.len); - resv->iov_len += XDR_QUADLEN(mic.len) << 2; - /* not strictly required: */ - resbuf->len += XDR_QUADLEN(mic.len) << 2; - if (resv->iov_len > PAGE_SIZE) - goto out_err; + + if (xdr_buf_subsegment(buf, &databody_integ, offset + XDR_UNIT, + buf->len - offset - XDR_UNIT)) + goto wrap_failed; + /* Buffer space for these has already been reserved in + * svcauth_gss_accept(). 
*/ + if (xdr_encode_word(buf, offset, databody_integ.len)) + goto wrap_failed; + if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) + goto wrap_failed; + + checksum.data = gsd->gsd_scratch; + maj_stat = gss_get_mic(gsd->rsci->mechctx, &databody_integ, &checksum); + if (maj_stat != GSS_S_COMPLETE) + goto bad_mic; + + if (xdr_stream_encode_opaque(xdr, checksum.data, checksum.len) < 0) + goto wrap_failed; + xdr_commit_encode(xdr); + out: - stat = 0; -out_err: - return stat; + return 0; + +bad_mic: + trace_rpcgss_svc_get_mic(rqstp, maj_stat); + return -EINVAL; +wrap_failed: + trace_rpcgss_svc_wrap_failed(rqstp); + return -EINVAL; } -static inline int -svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) +/* + * RFC 2203, Section 5.3.2.3 + * + * struct rpc_gss_priv_data { + * opaque databody_priv<> + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + * + * gss_wrap() expands the size of the RPC message payload in the + * response buffer. The main purpose of svcauth_gss_wrap_priv() + * is to ensure there is adequate space in the response buffer to + * avoid overflow during the wrap. 
+ */ +static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) { - struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc = &gsd->clcred; - struct xdr_buf *resbuf = &rqstp->rq_res; - struct page **inpages = NULL; - __be32 *p, *len; - int offset; - int pad; - - p = svcauth_gss_prepare_to_wrap(resbuf, gsd); - if (p == NULL) + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + u32 offset, pad, maj_stat; + __be32 *p; + + offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); + if (!offset) return 0; - len = p++; - offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; - *p++ = htonl(gc->gc_seq); - inpages = resbuf->pages; - /* XXX: Would be better to write some xdr helper functions for - * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ + + /* + * Buffer space for this field has already been reserved + * in svcauth_gss_accept(). Note that the GSS sequence + * number is encrypted along with the RPC reply payload. + */ + if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) + goto wrap_failed; /* * If there is currently tail data, make sure there is @@ -1801,19 +1872,17 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) * there is RPC_MAX_AUTH_SIZE slack space available in * both the head and tail. 
*/ - if (resbuf->tail[0].iov_base) { - if (resbuf->tail[0].iov_base >= - resbuf->head[0].iov_base + PAGE_SIZE) - return -EINVAL; - if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base) - return -EINVAL; - if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len + if (tail->iov_base) { + if (tail->iov_base >= head->iov_base + PAGE_SIZE) + goto wrap_failed; + if (tail->iov_base < head->iov_base) + goto wrap_failed; + if (tail->iov_len + head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) - return -ENOMEM; - memmove(resbuf->tail[0].iov_base + RPC_MAX_AUTH_SIZE, - resbuf->tail[0].iov_base, - resbuf->tail[0].iov_len); - resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE; + goto wrap_failed; + memmove(tail->iov_base + RPC_MAX_AUTH_SIZE, tail->iov_base, + tail->iov_len); + tail->iov_base += RPC_MAX_AUTH_SIZE; } /* * If there is no current tail data, make sure there is @@ -1822,55 +1891,70 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) * is RPC_MAX_AUTH_SIZE slack space available in both the * head and tail. */ - if (resbuf->tail[0].iov_base == NULL) { - if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE) - return -ENOMEM; - resbuf->tail[0].iov_base = resbuf->head[0].iov_base - + resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE; - resbuf->tail[0].iov_len = 0; + if (!tail->iov_base) { + if (head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) + goto wrap_failed; + tail->iov_base = head->iov_base + + head->iov_len + RPC_MAX_AUTH_SIZE; + tail->iov_len = 0; } - if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages)) - return -ENOMEM; - *len = htonl(resbuf->len - offset); - pad = 3 - ((resbuf->len - offset - 1)&3); - p = (__be32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len); + + maj_stat = gss_wrap(gsd->rsci->mechctx, offset + XDR_UNIT, buf, + buf->pages); + if (maj_stat != GSS_S_COMPLETE) + goto bad_wrap; + + /* Wrapping can change the size of databody_priv. 
*/ + if (xdr_encode_word(buf, offset, buf->len - offset - XDR_UNIT)) + goto wrap_failed; + pad = xdr_pad_size(buf->len - offset - XDR_UNIT); + p = (__be32 *)(tail->iov_base + tail->iov_len); memset(p, 0, pad); - resbuf->tail[0].iov_len += pad; - resbuf->len += pad; + tail->iov_len += pad; + buf->len += pad; + return 0; +wrap_failed: + trace_rpcgss_svc_wrap_failed(rqstp); + return -EINVAL; +bad_wrap: + trace_rpcgss_svc_wrap(rqstp, maj_stat); + return -ENOMEM; } +/** + * svcauth_gss_release - Wrap payload and release resources + * @rqstp: RPC transaction context + * + * Return values: + * %0: the Reply is ready to be sent + * %-ENOMEM: failed to allocate memory + * %-EINVAL: encoding error + */ static int svcauth_gss_release(struct svc_rqst *rqstp) { - struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; - struct rpc_gss_wire_cred *gc; - struct xdr_buf *resbuf = &rqstp->rq_res; - int stat = -EINVAL; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + struct gss_svc_data *gsd = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + int stat; if (!gsd) goto out; gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; - /* Release can be called twice, but we only wrap once. */ - if (gsd->verf_start == NULL) - goto out; - /* normally not set till svc_send, but we need it here: */ - /* XXX: what for? Do we mess it up the moment we call svc_putu32 - * or whatever? 
*/ - resbuf->len = total_buf_len(resbuf); + switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: - stat = svcauth_gss_wrap_resp_integ(rqstp); + stat = svcauth_gss_wrap_integ(rqstp); if (stat) goto out_err; break; case RPC_GSS_SVC_PRIVACY: - stat = svcauth_gss_wrap_resp_priv(rqstp); + stat = svcauth_gss_wrap_priv(rqstp); if (stat) goto out_err; break; @@ -1915,6 +1999,11 @@ svcauth_gss_domain_release(struct auth_domain *dom) call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } +static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) +{ + return svcauth_gss_flavor(rqstp->rq_gssclient); +} + static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, @@ -1923,6 +2012,7 @@ static struct auth_ops svcauthops_gss = { .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, + .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) @@ -1997,7 +2087,15 @@ gss_svc_init_net(struct net *net) rv = create_use_gss_proxy_proc_entry(net); if (rv) goto out2; + + rv = create_krb5_enctypes_proc_entry(net); + if (rv) + goto out3; + return 0; + +out3: + destroy_use_gss_proxy_proc_entry(net); out2: rsi_cache_destroy_net(net); out1: @@ -2008,6 +2106,7 @@ out1: void gss_svc_shutdown_net(struct net *net) { + destroy_krb5_enctypes_proc_entry(net); destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); diff --git a/net/sunrpc/auth_tls.c b/net/sunrpc/auth_tls.c new file mode 100644 index 000000000000..87f570fd3b00 --- /dev/null +++ b/net/sunrpc/auth_tls.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, 2022 Oracle. All rights reserved. + * + * The AUTH_TLS credential is used only to probe a remote peer + * for RPC-over-TLS support. 
+ */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/sunrpc/clnt.h> + +static const char *starttls_token = "STARTTLS"; +static const size_t starttls_len = 8; + +static struct rpc_auth tls_auth; +static struct rpc_cred tls_cred; + +static void tls_encode_probe(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + const void *obj) +{ +} + +static int tls_decode_probe(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *obj) +{ + return 0; +} + +static const struct rpc_procinfo rpcproc_tls_probe = { + .p_encode = tls_encode_probe, + .p_decode = tls_decode_probe, +}; + +static void rpc_tls_probe_call_prepare(struct rpc_task *task, void *data) +{ + task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; + rpc_call_start(task); +} + +static void rpc_tls_probe_call_done(struct rpc_task *task, void *data) +{ +} + +static const struct rpc_call_ops rpc_tls_probe_ops = { + .rpc_call_prepare = rpc_tls_probe_call_prepare, + .rpc_call_done = rpc_tls_probe_call_done, +}; + +static int tls_probe(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_tls_probe, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .rpc_op_cred = &tls_cred, + .callback_ops = &rpc_tls_probe_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + }; + struct rpc_task *task; + int status; + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + +static struct rpc_auth *tls_create(const struct rpc_auth_create_args *args, + struct rpc_clnt *clnt) +{ + refcount_inc(&tls_auth.au_count); + return &tls_auth; +} + +static void tls_destroy(struct rpc_auth *auth) +{ +} + +static struct rpc_cred *tls_lookup_cred(struct rpc_auth *auth, + struct auth_cred *acred, int flags) +{ + return get_rpccred(&tls_cred); +} + +static void tls_destroy_cred(struct rpc_cred *cred) +{ +} + +static int tls_match(struct auth_cred *acred, struct rpc_cred 
*cred, int taskflags) +{ + return 1; +} + +static int tls_marshal(struct rpc_task *task, struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * XDR_UNIT); + if (!p) + return -EMSGSIZE; + /* Credential */ + *p++ = rpc_auth_tls; + *p++ = xdr_zero; + /* Verifier */ + *p++ = rpc_auth_null; + *p = xdr_zero; + return 0; +} + +static int tls_refresh(struct rpc_task *task) +{ + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); + return 0; +} + +static int tls_validate(struct rpc_task *task, struct xdr_stream *xdr) +{ + __be32 *p; + void *str; + + p = xdr_inline_decode(xdr, XDR_UNIT); + if (!p) + return -EIO; + if (*p != rpc_auth_null) + return -EIO; + if (xdr_stream_decode_opaque_inline(xdr, &str, starttls_len) != starttls_len) + return -EPROTONOSUPPORT; + if (memcmp(str, starttls_token, starttls_len)) + return -EPROTONOSUPPORT; + return 0; +} + +const struct rpc_authops authtls_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_TLS, + .au_name = "NULL", + .create = tls_create, + .destroy = tls_destroy, + .lookup_cred = tls_lookup_cred, + .ping = tls_probe, +}; + +static struct rpc_auth tls_auth = { + .au_cslack = NUL_CALLSLACK, + .au_rslack = NUL_REPLYSLACK, + .au_verfsize = NUL_REPLYSLACK, + .au_ralign = NUL_REPLYSLACK, + .au_ops = &authtls_ops, + .au_flavor = RPC_AUTH_TLS, + .au_count = REFCOUNT_INIT(1), +}; + +static const struct rpc_credops tls_credops = { + .cr_name = "AUTH_TLS", + .crdestroy = tls_destroy_cred, + .crmatch = tls_match, + .crmarshal = tls_marshal, + .crwrap_req = rpcauth_wrap_req_encode, + .crrefresh = tls_refresh, + .crvalidate = tls_validate, + .crunwrap_resp = rpcauth_unwrap_resp_decode, +}; + +static struct rpc_cred tls_cred = { + .cr_lru = LIST_HEAD_INIT(tls_cred.cr_lru), + .cr_auth = &tls_auth, + .cr_ops = &tls_credops, + .cr_count = REFCOUNT_INIT(2), + .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, +}; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 
65a6c6429a53..caa94cf57123 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -83,7 +83,6 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt) return NULL; req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_bc_list); /* Preallocate one XDR receive buffer */ if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) { @@ -349,10 +348,8 @@ found: } /* - * Add callback request to callback list. The callback - * service sleeps on the sv_cb_waitq waiting for new - * requests. Wake it up after adding enqueing the - * request. + * Add callback request to callback list. Wake a thread + * on the first pool (usually the only pool) to handle it. */ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) { @@ -369,8 +366,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) dprintk("RPC: add callback request to list\n"); xprt_get(xprt); - spin_lock(&bc_serv->sv_cb_lock); - list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); - wake_up(&bc_serv->sv_cb_waitq); - spin_unlock(&bc_serv->sv_cb_lock); + lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list); + svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 95ff74706104..131090f31e6a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -135,6 +135,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, hlist_add_head_rcu(&new->cache_list, head); detail->entries++; + if (detail->nextcheck > new->expiry_time) + detail->nextcheck = new->expiry_time + 1; cache_get(new); spin_unlock(&detail->hash_lock); @@ -281,21 +283,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h return rv; } -/* - * This is the generic cache management routine for all - * the authentication caches. - * It checks the currency of a cache item and will (later) - * initiate an upcall to fill it if needed. 
- * - * - * Returns 0 if the cache_head can be used, or cache_puts it and returns - * -EAGAIN if upcall is pending and request has been queued - * -ETIMEDOUT if upcall failed or request could not be queue or - * upcall completed but item is still invalid (implying that - * the cache item has been replaced with a newer one). - * -ENOENT if cache entry was negative - */ -int cache_check(struct cache_detail *detail, +int cache_check_rcu(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { int rv; @@ -336,6 +324,31 @@ int cache_check(struct cache_detail *detail, rv = -ETIMEDOUT; } } + + return rv; +} +EXPORT_SYMBOL_GPL(cache_check_rcu); + +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. + * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending and request has been queued + * -ETIMEDOUT if upcall failed or request could not be queue or + * upcall completed but item is still invalid (implying that + * the cache item has been replaced with a newer one). 
+ * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + + rv = cache_check_rcu(detail, h, rqstp); if (rv) cache_put(h, detail); return rv; @@ -451,24 +464,21 @@ static int cache_clean(void) } } + spin_lock(&current_detail->hash_lock); + /* find a non-empty bucket in the table */ - while (current_detail && - current_index < current_detail->hash_size && + while (current_index < current_detail->hash_size && hlist_empty(&current_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ - - if (current_detail && current_index < current_detail->hash_size) { + if (current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; - spin_lock(&current_detail->hash_lock); - /* Ok, now to clean this strand */ - head = &current_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) @@ -489,8 +499,10 @@ static int cache_clean(void) spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); - } else + } else { + spin_unlock(&current_detail->hash_lock); spin_unlock(&cache_list_lock); + } return rv; } @@ -731,11 +743,10 @@ static bool cache_defer_req(struct cache_req *req, struct cache_head *item) static void cache_revisit_request(struct cache_head *item) { struct cache_deferred_req *dreq; - struct list_head pending; struct hlist_node *tmp; int hash = DFR_HASH(item); + LIST_HEAD(pending); - INIT_LIST_HEAD(&pending); spin_lock(&cache_defer_lock); hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash) @@ -756,10 +767,8 @@ static void cache_revisit_request(struct cache_head *item) void cache_clean_deferred(void *owner) { struct cache_deferred_req *dreq, *tmp; - struct list_head pending; + LIST_HEAD(pending); - - INIT_LIST_HEAD(&pending); 
spin_lock(&cache_defer_lock); list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { @@ -1085,9 +1094,8 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { struct cache_queue *cq, *tmp; struct cache_request *cr; - struct list_head dequeued; + LIST_HEAD(dequeued); - INIT_LIST_HEAD(&dequeued); spin_lock(&queue_lock); list_for_each_entry_safe(cq, tmp, &detail->queue, list) if (!cq->reader) { @@ -1431,15 +1439,11 @@ static int c_show(struct seq_file *m, void *p) seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); - cache_get(cp); - if (cache_check(cd, cp, NULL)) - /* cache_check does a cache_put on failure */ + + if (cache_check_rcu(cd, cp, NULL)) + seq_puts(m, "# "); + else if (cache_is_expired(cd, cp)) seq_puts(m, "# "); - else { - if (cache_is_expired(cd, cp)) - seq_puts(m, "# "); - cache_put(cp, cd); - } return cd->cache_show(m, cd, cp); } @@ -1596,7 +1600,6 @@ static int cache_release_procfs(struct inode *inode, struct file *filp) } static const struct proc_ops cache_channel_proc_ops = { - .proc_lseek = no_llseek, .proc_read = cache_read_procfs, .proc_write = cache_write_procfs, .proc_poll = cache_poll_procfs, @@ -1662,7 +1665,6 @@ static const struct proc_ops cache_flush_proc_ops = { .proc_read = read_flush_procfs, .proc_write = write_flush_procfs, .proc_release = release_flush_procfs, - .proc_lseek = no_llseek, }; static void remove_cache_proc_entries(struct cache_detail *cd) @@ -1673,12 +1675,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) } } -#ifdef CONFIG_PROC_FS static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; + if (!IS_ENABLED(CONFIG_PROC_FS)) + return 0; + sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) @@ -1706,12 +1710,6 @@ out_nomem: remove_cache_proc_entries(cd); return 
-ENOMEM; } -#else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) -{ - return 0; -} -#endif void __init cache_initialize(void) { @@ -1815,7 +1813,6 @@ static int cache_release_pipefs(struct inode *inode, struct file *filp) const struct file_operations cache_file_operations_pipefs = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = cache_read_pipefs, .write = cache_write_pipefs, .poll = cache_poll_pipefs, @@ -1881,7 +1878,6 @@ const struct file_operations cache_flush_operations_pipefs = { .read = read_flush_pipefs, .write = write_flush_pipefs, .release = release_flush_pipefs, - .llseek = no_llseek, }; int sunrpc_cache_register_pipefs(struct dentry *parent, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0b0b9f1eed46..58442ae1c2da 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -48,13 +48,8 @@ # define RPCDBG_FACILITY RPCDBG_CALL #endif -/* - * All RPC clients are linked into this list - */ - static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); - static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); @@ -111,50 +106,52 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { - __rpc_clnt_remove_pipedir(clnt); + if (pipefs_sb == clnt->pipefs_sb) + __rpc_clnt_remove_pipedir(clnt); rpc_put_sb_net(net); } } -static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, +static int rpc_setup_pipedir_sb(struct super_block *sb, struct rpc_clnt *clnt) { static uint32_t clntid; const char *dir_name = clnt->cl_program->pipe_dir_name; char name[15]; - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, dir_name); if (dir == NULL) { pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); - return dir; + return -ENOENT; } for (;;) { snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 
1] = '\0'; - dentry = rpc_create_client_dir(dir, name, clnt); - if (!IS_ERR(dentry)) + err = rpc_create_client_dir(dir, name, clnt); + if (!err) break; - if (dentry == ERR_PTR(-EEXIST)) + if (err == -EEXIST) continue; printk(KERN_INFO "RPC: Couldn't create pipefs entry" - " %s/%s, error %ld\n", - dir_name, name, PTR_ERR(dentry)); + " %s/%s, error %d\n", + dir_name, name, err); break; } dput(dir); - return dentry; + return err; } static int rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt) { - struct dentry *dentry; + clnt->pipefs_sb = pipefs_sb; if (clnt->cl_program->pipe_dir_name != NULL) { - dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + int err = rpc_setup_pipedir_sb(pipefs_sb, clnt); + if (err && err != -ENOENT) + return err; } return 0; } @@ -182,16 +179,9 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { - struct dentry *dentry; - switch (event) { case RPC_PIPEFS_MOUNT: - dentry = rpc_setup_pipedir_sb(sb, clnt); - if (!dentry) - return -ENOENT; - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - break; + return rpc_setup_pipedir_sb(sb, clnt); case RPC_PIPEFS_UMOUNT: __rpc_clnt_remove_pipedir(clnt); break; @@ -272,9 +262,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); @@ -284,8 +271,14 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { - clnt->cl_nodelen = strlcpy(clnt->cl_nodename, - nodename, sizeof(clnt->cl_nodename)); + ssize_t copied; + + copied = strscpy(clnt->cl_nodename, + nodename, 
sizeof(clnt->cl_nodename)); + + clnt->cl_nodelen = copied < 0 + ? sizeof(clnt->cl_nodename) - 1 + : copied; } static int rpc_client_register(struct rpc_clnt *clnt, @@ -385,6 +378,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (!clnt) goto out_err; clnt->cl_parent = parent ? : clnt; + clnt->cl_xprtsec = args->xprtsec; err = rpc_alloc_clid(clnt); if (err) @@ -395,7 +389,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; - clnt->cl_stats = program->stats; + clnt->cl_stats = args->stats ? : program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects); err = -ENOMEM; @@ -434,7 +428,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (parent) refcount_inc(&parent->cl_count); - trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); + trace_rpc_clnt_new(clnt, xprt, args); return clnt; out_no_path: @@ -507,6 +501,8 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) clnt->cl_chatty = 1; + if (args->flags & RPC_CLNT_CREATE_NETUNREACH_FATAL) + clnt->cl_netunreach_fatal = 1; return clnt; } @@ -532,8 +528,11 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) .addrlen = args->addrsize, .servername = args->servername, .bc_xprt = args->bc_xprt, + .xprtsec = args->xprtsec, + .connect_timeout = args->connect_timeout, + .reconnect_timeout = args->reconnect_timeout, }; - char servername[48]; + char servername[RPC_MAXNETNAMELEN]; struct rpc_clnt *clnt; int i; @@ -565,8 +564,12 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) servername[0] = '\0'; switch (args->address->sa_family) { case AF_LOCAL: - snprintf(servername, sizeof(servername), "%s", - sun->sun_path); + if (sun->sun_path[0]) + 
snprintf(servername, sizeof(servername), "%s", + sun->sun_path); + else + snprintf(servername, sizeof(servername), "@%s", + sun->sun_path+1); break; case AF_INET: snprintf(servername, sizeof(servername), "%pI4", @@ -650,6 +653,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; + new->cl_netunreach_fatal = clnt->cl_netunreach_fatal; new->cl_principal = clnt->cl_principal; new->cl_max_connect = clnt->cl_max_connect; return new; @@ -674,6 +678,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, .cred = clnt->cl_cred, + .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } @@ -696,6 +701,7 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) .version = clnt->cl_vers, .authflavor = flavor, .cred = clnt->cl_cred, + .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } @@ -727,6 +733,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, struct rpc_clnt *parent; int err; + args->xprtsec = clnt->cl_xprtsec; xprt = xprt_create_transport(args); if (IS_ERR(xprt)) return PTR_ERR(xprt); @@ -785,15 +792,24 @@ out_revert: } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); -static -int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, - void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) +static struct rpc_xprt_switch *rpc_clnt_xprt_switch_get(struct rpc_clnt *clnt) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); + + return xps; +} + +static +int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, + void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) +{ + struct rpc_xprt_switch *xps; + + xps = rpc_clnt_xprt_switch_get(clnt); if (xps == 
NULL) return -EAGAIN; func(xpi, xps); @@ -934,12 +950,17 @@ void rpc_shutdown_client(struct rpc_clnt *clnt) trace_rpc_clnt_shutdown(clnt); + clnt->cl_shutdown = 1; while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); wait_event_timeout(destroy_wait, list_empty(&clnt->cl_tasks), 1*HZ); } + /* wait for tasks still in workqueue or waitqueue */ + wait_event_timeout(destroy_wait, + atomic_read(&clnt->cl_task_count) == 0, 1 * HZ); + rpc_release_client(clnt); } EXPORT_SYMBOL_GPL(rpc_shutdown_client); @@ -1041,6 +1062,8 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, .version = vers, .authflavor = old->cl_auth->au_flavor, .cred = old->cl_cred, + .stats = old->cl_stats, + .timeout = old->cl_timeout, }; struct rpc_clnt *clnt; int err; @@ -1113,6 +1136,7 @@ void rpc_task_release_client(struct rpc_task *task) list_del(&task->tk_task); spin_unlock(&clnt->cl_lock); task->tk_client = NULL; + atomic_dec(&clnt->cl_task_count); rpc_release_client(clnt); } @@ -1163,10 +1187,9 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - /* Add to the client's list of all tasks */ - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); + if (clnt->cl_netunreach_fatal) + task->tk_flags |= RPC_TASK_NETUNREACH_FATAL; + atomic_inc(&clnt->cl_task_count); } static void @@ -1290,8 +1313,10 @@ static void call_bc_encode(struct rpc_task *task); * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request + * @timeout: timeout values to use for this task */ -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, + struct rpc_timeout *timeout) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { @@ -1310,7 +1335,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst 
*req) return task; } - xprt_init_bc_request(req, task); + xprt_init_bc_request(req, task, timeout); task->tk_action = call_bc_encode; atomic_inc(&task->tk_count); @@ -1432,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, - (struct sockaddr *)&rpc_inaddr_loopback, + (struct sockaddr_unsized *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, - (struct sockaddr *)&rpc_in6addr_loopback, + (struct sockaddr_unsized *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: @@ -1449,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_connect(sock, sap, salen, 0); + err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; @@ -1717,6 +1742,11 @@ call_start(struct rpc_task *task) trace_rpc_request(task); + if (task->tk_client->cl_shutdown) { + rpc_call_rpcerror(task, -EIO); + return; + } + /* Increment call count (version might not be valid for ping) */ if (clnt->cl_program->version[clnt->cl_vers]) clnt->cl_program->version[clnt->cl_vers]->counts[idx]++; @@ -1754,9 +1784,14 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; + + /* Add to the client's list of all tasks */ + spin_lock(&task->tk_client->cl_lock); + if (list_empty(&task->tk_task)) + list_add_tail(&task->tk_task, &task->tk_client->cl_tasks); + spin_unlock(&task->tk_client->cl_lock); return; } - rpc_call_rpcerror(task, -EIO); return; } @@ -1821,13 +1856,13 @@ call_refreshresult(struct rpc_task *task) fallthrough; case -EAGAIN: status = -EACCES; - fallthrough; - case -EKEYEXPIRED: if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; + case -EKEYEXPIRED: + break; 
case -ENOMEM: rpc_delay(task, HZ >> 4); return; @@ -1855,12 +1890,6 @@ call_allocate(struct rpc_task *task) if (req->rq_buffer) return; - if (proc->p_proc != 0) { - BUG_ON(proc->p_arglen == 0); - if (proc->p_decode != NULL) - BUG_ON(proc->p_replen == 0); - } - /* * Calculate the size (in quads) of the RPC call * and reply headers, and convert both values @@ -2050,9 +2079,6 @@ call_bind_status(struct rpc_task *task) status = -EOPNOTSUPP; break; } - if (task->tk_rebind_retry == 0) - break; - task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; case -ENOBUFS: @@ -2070,14 +2096,17 @@ call_bind_status(struct rpc_task *task) case -EPROTONOSUPPORT: trace_rpcb_bind_version_err(task); goto retry_timeout; + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: - case -ENETDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EPIPE: trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { @@ -2159,19 +2188,22 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: + case -ECONNRESET: /* A positive refusal suggests a rebind is needed. 
*/ - if (RPC_IS_SOFTCONN(task)) - break; if (clnt->cl_autobind) { rpc_force_rebind(clnt); + if (RPC_IS_SOFTCONN(task)) + break; goto out_retry; } fallthrough; - case -ECONNRESET: case -ECONNABORTED: - case -ENETDOWN: - case -ENETUNREACH: case -EHOSTUNREACH: case -EPIPE: case -EPROTO: @@ -2192,9 +2224,7 @@ call_connect_status(struct rpc_task *task) struct rpc_xprt *saved = task->tk_xprt; struct rpc_xprt_switch *xps; - rcu_read_lock(); - xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); - rcu_read_unlock(); + xps = rpc_clnt_xprt_switch_get(clnt); if (xps->xps_nxprts > 1) { long value; @@ -2209,7 +2239,7 @@ call_connect_status(struct rpc_task *task) } xprt_switch_put(xps); if (!task->tk_xprt) - return; + goto out; } goto out_retry; case -ENOBUFS: @@ -2224,6 +2254,7 @@ out_next: out_retry: /* Check for timeouts before looping back to call_bind */ task->tk_action = call_bind; +out: rpc_check_timeout(task); } @@ -2292,12 +2323,13 @@ call_transmit_status(struct rpc_task *task) task->tk_action = call_transmit; task->tk_status = 0; break; - case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: + break; + case -ECONNREFUSED: if (RPC_IS_SOFTCONN(task)) { if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, @@ -2423,10 +2455,13 @@ call_status(struct rpc_task *task) trace_rpc_call_status(task); task->tk_status = 0; switch(status) { - case -EHOSTDOWN: case -ENETDOWN: - case -EHOSTUNREACH: case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + goto out_exit; + fallthrough; + case -EHOSTDOWN: + case -EHOSTUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) goto out_exit; @@ -2465,8 +2500,7 @@ call_status(struct rpc_task *task) goto out_exit; } task->tk_action = call_encode; - if (status != -ECONNRESET && status != -ECONNABORTED) - rpc_check_timeout(task); + rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); @@ -2593,6 +2627,7 @@ out: case 0: task->tk_action = 
rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); + xdr_finish_decode(&xdr); return; case -EAGAIN: task->tk_status = 0; @@ -2665,8 +2700,19 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) goto out_msg_denied; error = rpcauth_checkverf(task, xdr); - if (error) + if (error) { + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + rpcauth_invalcred(task); + if (!task->tk_cred_retry) + goto out_err; + task->tk_cred_retry--; + trace_rpc__stale_creds(task); + return -EKEYREJECTED; + } goto out_verifier; + } p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) @@ -2713,7 +2759,20 @@ out_unparsable: out_verifier: trace_rpc_bad_verifier(task); - goto out_err; + switch (error) { + case -EPROTONOSUPPORT: + goto out_err; + case -EACCES: + /* possible RPCSEC_GSS out-of-sequence event (RFC2203), + * reset recv state and keep waiting, don't retransmit + */ + task->tk_rqstp->rq_reply_bytes_recvd = 0; + task->tk_status = xprt_request_enqueue_receive(task); + task->tk_action = call_transmit_status; + return -EBADMSG; + default: + goto out_garbage; + } out_msg_denied: error = -EACCES; @@ -2739,6 +2798,7 @@ out_msg_denied: case rpc_autherr_rejectedverf: case rpcsec_gsserr_credproblem: case rpcsec_gsserr_ctxproblem: + rpcauth_invalcred(task); if (!task->tk_cred_retry) break; task->tk_cred_retry--; @@ -2829,6 +2889,9 @@ static int rpc_ping(struct rpc_clnt *clnt) struct rpc_task *task; int status; + if (clnt->cl_auth->au_ops->ping) + return clnt->cl_auth->au_ops->ping(clnt); + task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); if (IS_ERR(task)) return PTR_ERR(task); @@ -2892,19 +2955,22 @@ static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { * @clnt: pointer to struct rpc_clnt * @xps: pointer to struct rpc_xprt_switch, * @xprt: pointer struct rpc_xprt - * @dummy: unused + * @in_max_connect: pointer to the max_connect value for the passed in xprt transport */ int 
rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, - void *dummy) + void *in_max_connect) { struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; + int max_connect = clnt->cl_max_connect; - if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + if (in_max_connect) + max_connect = *(int *)in_max_connect; + if (xps->xps_nunique_destaddr_xprts + 1 > max_connect) { rcu_read_lock(); pr_warn("SUNRPC: reached max allowed number (%d) did not add " - "transport to server: %s\n", clnt->cl_max_connect, + "transport to server: %s\n", max_connect, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); rcu_read_unlock(); return -EINVAL; @@ -3049,6 +3115,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, if (!xprtargs->ident) xprtargs->ident = ident; + xprtargs->xprtsec = clnt->cl_xprtsec; xprt = xprt_create_transport(xprtargs); if (IS_ERR(xprt)) { ret = PTR_ERR(xprt); @@ -3056,6 +3123,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, } xprt->resvport = resvport; xprt->reuseport = reuseport; + + if (xprtargs->connect_timeout) + connect_timeout = xprtargs->connect_timeout; + if (xprtargs->reconnect_timeout) + reconnect_timeout = xprtargs->reconnect_timeout; if (xprt->ops->set_connect_timeout != NULL) xprt->ops->set_connect_timeout(xprt, connect_timeout, @@ -3080,7 +3152,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_add_xprt_test *data) { - struct rpc_xprt_switch *xps; struct rpc_xprt *main_xprt; int status = 0; @@ -3088,7 +3159,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, rcu_read_lock(); main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, (struct sockaddr *)&main_xprt->addr); rcu_read_unlock(); @@ -3099,7 +3169,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, status = rpc_clnt_add_xprt_helper(clnt, xprt, data); 
out: xprt_put(xprt); - xprt_switch_put(xps); return status; } @@ -3214,34 +3283,27 @@ rpc_set_connect_timeout(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_set_connect_timeout); -void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) -{ - rcu_read_lock(); - xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); - void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; - rcu_read_lock(); - xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); - rcu_read_unlock(); + xps = rpc_clnt_xprt_switch_get(clnt); xprt_set_online_locked(xprt, xps); + xprt_switch_put(xps); } void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { + struct rpc_xprt_switch *xps; + if (rpc_clnt_xprt_switch_has_addr(clnt, (const struct sockaddr *)&xprt->addr)) { return rpc_clnt_xprt_set_online(clnt, xprt); } - rcu_read_lock(); - rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), - xprt); - rcu_read_unlock(); + + xps = rpc_clnt_xprt_switch_get(clnt); + rpc_xprt_switch_add_xprt(xps, xprt); + xprt_switch_put(xps); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); @@ -3273,8 +3335,11 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static void rpc_show_header(void) +static void rpc_show_header(struct rpc_clnt *clnt) { + printk(KERN_INFO "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); printk(KERN_INFO "-pid- flgs status -client- --rqstp- " "-timeout ---ops--\n"); } @@ -3306,7 +3371,7 @@ void rpc_show_tasks(struct net *net) spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!header) { - rpc_show_header(); + rpc_show_header(clnt); header++; } rpc_show_task(clnt, task); @@ -3350,6 +3415,8 @@ rpc_clnt_swap_deactivate_callback(struct rpc_clnt *clnt, 
void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) { + while (clnt != clnt->cl_parent) + clnt = clnt->cl_parent; if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_swap_deactivate_callback, NULL); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index a176d5a0b0ee..32417db340de 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -74,6 +74,9 @@ tasks_stop(struct seq_file *f, void *v) { struct rpc_clnt *clnt = f->private; spin_unlock(&clnt->cl_lock); + seq_printf(f, "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); } static const struct seq_operations tasks_seq_operations = { @@ -179,6 +182,18 @@ xprt_info_show(struct seq_file *f, void *v) seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]); seq_printf(f, "state: 0x%lx\n", xprt->state); + seq_printf(f, "netns: %u\n", xprt->xprt_net->ns.inum); + + if (xprt->ops->get_srcaddr) { + int ret, buflen; + char buf[INET6_ADDRSTRLEN]; + + buflen = ARRAY_SIZE(buf); + ret = xprt->ops->get_srcaddr(xprt, buf, buflen); + if (ret < 0) + ret = sprintf(buf, "<closed>"); + seq_printf(f, "saddr: %.*s\n", ret, buf); + } return 0; } diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 7ec10b92bea1..4efb5f28d881 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -33,6 +33,7 @@ struct sunrpc_net { int pipe_version; atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; + struct proc_dir_entry *gss_krb5_enctypes; }; extern unsigned int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0b6034fab9ab..379daefc4847 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -168,8 +168,9 @@ rpc_inode_setowner(struct inode *inode, void *private) } static void -rpc_close_pipes(struct inode *inode) +rpc_close_pipes(struct dentry *dentry) { + struct inode *inode = dentry->d_inode; 
struct rpc_pipe *pipe = RPC_I(inode)->pipe; int need_release; LIST_HEAD(free_list); @@ -385,7 +386,6 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static const struct file_operations rpc_pipe_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = rpc_pipe_read, .write = rpc_pipe_write, .poll = rpc_pipe_poll, @@ -472,7 +472,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; @@ -485,60 +485,6 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return inode; } -static int __rpc_create_common(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - struct inode *inode; - - d_drop(dentry); - inode = rpc_get_inode(dir->i_sb, mode); - if (!inode) - goto out_err; - inode->i_ino = iunique(dir->i_sb, 100); - if (i_fop) - inode->i_fop = i_fop; - if (private) - rpc_inode_setowner(inode, private); - d_add(dentry, inode); - return 0; -out_err: - printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n", - __FILE__, __func__, dentry); - dput(dentry); - return -ENOMEM; -} - -static int __rpc_create(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private); - if (err) - return err; - fsnotify_create(dir, dentry); - return 0; -} - -static int __rpc_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private); - if (err) - return err; - inc_nlink(dir); - fsnotify_mkdir(dir, dentry); - return 0; -} - static void init_pipe(struct 
rpc_pipe *pipe) { @@ -575,119 +521,58 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) } EXPORT_SYMBOL_GPL(rpc_mkpipe_data); -static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private, - struct rpc_pipe *pipe) +static int rpc_new_file(struct dentry *parent, + const char *name, + umode_t mode, + const struct file_operations *i_fop, + void *private) { - struct rpc_inode *rpci; - int err; + struct dentry *dentry = simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; - err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); - if (err) - return err; - rpci = RPC_I(d_inode(dentry)); - rpci->private = private; - rpci->pipe = pipe; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + inode = rpc_get_inode(dir->i_sb, S_IFREG | mode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + return -ENOMEM; + } + inode->i_ino = iunique(dir->i_sb, 100); + if (i_fop) + inode->i_fop = i_fop; + rpc_inode_setowner(inode, private); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); + simple_done_creating(dentry); return 0; } -static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) +static struct dentry *rpc_new_dir(struct dentry *parent, + const char *name, + umode_t mode) { - int ret; - - dget(dentry); - ret = simple_rmdir(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_rmdir(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_unlink(struct inode *dir, struct dentry *dentry) -{ - int ret; - - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_unlink(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - - rpc_close_pipes(inode); - return __rpc_unlink(dir, dentry); -} + struct dentry *dentry = 
simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; -static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, - const char *name) -{ - struct qstr q = QSTR_INIT(name, strlen(name)); - struct dentry *dentry = d_hash_and_lookup(parent, &q); - if (!dentry) { - dentry = d_alloc(parent, &q); - if (!dentry) - return ERR_PTR(-ENOMEM); - } - if (d_really_is_negative(dentry)) + if (IS_ERR(dentry)) return dentry; - dput(dentry); - return ERR_PTR(-EEXIST); -} - -/* - * FIXME: This probably has races. - */ -static void __rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); - struct dentry *dentry; - struct qstr name; - int i; - - for (i = start; i < eof; i++) { - name.name = files[i].name; - name.len = strlen(files[i].name); - dentry = d_hash_and_lookup(parent, &name); - if (dentry == NULL) - continue; - if (d_really_is_negative(dentry)) - goto next; - switch (d_inode(dentry)->i_mode & S_IFMT) { - default: - BUG(); - case S_IFREG: - __rpc_unlink(dir, dentry); - break; - case S_IFDIR: - __rpc_rmdir(dir, dentry); - } -next: - dput(dentry); + inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + return ERR_PTR(-ENOMEM); } -} -static void rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); + inode->i_ino = iunique(dir->i_sb, 100); + inc_nlink(dir); + d_make_persistent(dentry, inode); + fsnotify_mkdir(dir, dentry); + simple_done_creating(dentry); - inode_lock_nested(dir, I_MUTEX_CHILD); - __rpc_depopulate(parent, files, start, eof); - inode_unlock(dir); + return dentry; // borrowed } static int rpc_populate(struct dentry *parent, @@ -695,92 +580,39 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = d_inode(parent); struct dentry *dentry; 
int i, err; - inode_lock(dir); for (i = start; i < eof; i++) { - dentry = __rpc_lookup_create_exclusive(parent, files[i].name); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_bad; switch (files[i].mode & S_IFMT) { default: BUG(); case S_IFREG: - err = __rpc_create(dir, dentry, + err = rpc_new_file(parent, + files[i].name, files[i].mode, files[i].i_fop, private); + if (err) + goto out_bad; break; case S_IFDIR: - err = __rpc_mkdir(dir, dentry, - files[i].mode, - NULL, - private); + dentry = rpc_new_dir(parent, + files[i].name, + files[i].mode); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_bad; + } } - if (err != 0) - goto out_bad; } - inode_unlock(dir); return 0; out_bad: - __rpc_depopulate(parent, files, start, eof); - inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; } -static struct dentry *rpc_mkdir_populate(struct dentry *parent, - const char *name, umode_t mode, void *private, - int (*populate)(struct dentry *, void *), void *args_populate) -{ - struct dentry *dentry; - struct inode *dir = d_inode(parent); - int error; - - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - error = __rpc_mkdir(dir, dentry, mode, NULL, private); - if (error != 0) - goto out_err; - if (populate != NULL) { - error = populate(dentry, args_populate); - if (error) - goto err_rmdir; - } -out: - inode_unlock(dir); - return dentry; -err_rmdir: - __rpc_rmdir(dir, dentry); -out_err: - dentry = ERR_PTR(error); - goto out; -} - -static int rpc_rmdir_depopulate(struct dentry *dentry, - void (*depopulate)(struct dentry *)) -{ - struct dentry *parent; - struct inode *dir; - int error; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - if (depopulate != NULL) - depopulate(dentry); - error = __rpc_rmdir(dir, dentry); - inode_unlock(dir); - dput(parent); - return 
error; -} - /** * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace * communication @@ -800,11 +632,13 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, * The @private argument passed here will be available to all these methods * from the file pointer, via RPC_I(file_inode(file))->private. */ -struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, +int rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { - struct dentry *dentry; struct inode *dir = d_inode(parent); + struct dentry *dentry; + struct inode *inode; + struct rpc_inode *rpci; umode_t umode = S_IFIFO | 0600; int err; @@ -813,48 +647,52 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~0222; - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops, - private, pipe); - if (err) - goto out_err; -out: - inode_unlock(dir); - return dentry; -out_err: - dentry = ERR_PTR(err); - printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n", - __FILE__, __func__, parent, name, - err); - goto out; + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto failed; + } + + inode = rpc_get_inode(dir->i_sb, umode); + if (unlikely(!inode)) { + simple_done_creating(dentry); + err = -ENOMEM; + goto failed; + } + inode->i_ino = iunique(dir->i_sb, 100); + inode->i_fop = &rpc_pipe_fops; + rpci = RPC_I(inode); + rpci->private = private; + rpci->pipe = pipe; + rpc_inode_setowner(inode, private); + pipe->dentry = dentry; // borrowed + d_make_persistent(dentry, inode); + fsnotify_create(dir, dentry); + simple_done_creating(dentry); + return 0; + +failed: + pr_warn("%s() failed to create pipe %pd/%s (errno = %d)\n", + __func__, parent, name, err); + return err; } 
EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry); /** * rpc_unlink - remove a pipe - * @dentry: dentry for the pipe, as returned from rpc_mkpipe + * @pipe: the pipe to be removed * * After this call, lookups will no longer find the pipe, and any * attempts to read or write using preexisting opens of the pipe will * return -EPIPE. */ -int -rpc_unlink(struct dentry *dentry) +void +rpc_unlink(struct rpc_pipe *pipe) { - struct dentry *parent; - struct inode *dir; - int error = 0; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - error = __rpc_rmpipe(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; + if (pipe->dentry) { + simple_recursive_removal(pipe->dentry, rpc_close_pipes); + pipe->dentry = NULL; + } } EXPORT_SYMBOL_GPL(rpc_unlink); @@ -1011,31 +849,6 @@ rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh) pdo->pdo_ops->destroy(dir, pdo); } -enum { - RPCAUTH_info, - RPCAUTH_EOF -}; - -static const struct rpc_filelist authfiles[] = { - [RPCAUTH_info] = { - .name = "info", - .i_fop = &rpc_info_operations, - .mode = S_IFREG | 0400, - }, -}; - -static int rpc_clntdir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - authfiles, RPCAUTH_info, RPCAUTH_EOF, - private); -} - -static void rpc_clntdir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); -} - /** * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs * @dentry: the parent of new directory @@ -1047,19 +860,27 @@ static void rpc_clntdir_depopulate(struct dentry *dentry) * information about the client, together with any "pipes" that may * later be created using rpc_mkpipe(). 
*/ -struct dentry *rpc_create_client_dir(struct dentry *dentry, - const char *name, - struct rpc_clnt *rpc_client) +int rpc_create_client_dir(struct dentry *dentry, + const char *name, + struct rpc_clnt *rpc_client) { struct dentry *ret; + int err; - ret = rpc_mkdir_populate(dentry, name, 0555, NULL, - rpc_clntdir_populate, rpc_client); - if (!IS_ERR(ret)) { - rpc_client->cl_pipedir_objects.pdh_dentry = ret; - rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + ret = rpc_new_dir(dentry, name, 0555); + if (IS_ERR(ret)) + return PTR_ERR(ret); + err = rpc_new_file(ret, "info", S_IFREG | 0400, + &rpc_info_operations, rpc_client); + if (err) { + pr_warn("%s failed to populate directory %pd\n", + __func__, ret); + simple_recursive_removal(ret, NULL); + return err; } - return ret; + rpc_client->cl_pipedir_objects.pdh_dentry = ret; + rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + return 0; } /** @@ -1074,7 +895,8 @@ int rpc_remove_client_dir(struct rpc_clnt *rpc_client) return 0; rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects); rpc_client->cl_pipedir_objects.pdh_dentry = NULL; - return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate); + simple_recursive_removal(dentry, NULL); + return 0; } static const struct rpc_filelist cache_pipefs_files[3] = { @@ -1095,28 +917,25 @@ static const struct rpc_filelist cache_pipefs_files[3] = { }, }; -static int rpc_cachedir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - cache_pipefs_files, 0, 3, - private); -} - -static void rpc_cachedir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, cache_pipefs_files, 0, 3); -} - struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name, umode_t umode, struct cache_detail *cd) { - return rpc_mkdir_populate(parent, name, umode, NULL, - rpc_cachedir_populate, cd); + struct dentry *dentry; + + dentry = rpc_new_dir(parent, name, umode); + if (!IS_ERR(dentry)) { + int error = 
rpc_populate(dentry, cache_pipefs_files, 0, 3, cd); + if (error) { + simple_recursive_removal(dentry, NULL); + return ERR_PTR(error); + } + } + return dentry; } void rpc_remove_cache_dir(struct dentry *dentry) { - rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate); + simple_recursive_removal(dentry, NULL); } /* @@ -1142,7 +961,6 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, - RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1179,10 +997,6 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | 0555, }, - [RPCAUTH_gssd] = { - .name = "gssd", - .mode = S_IFDIR | 0555, - }, }; /* @@ -1191,8 +1005,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name)); - return d_hash_and_lookup(sb->s_root, &dir); + return try_lookup_noperm(&QSTR(dir_name), sb->s_root); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1243,13 +1056,6 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); -static const struct rpc_filelist gssd_dummy_clnt_dir[] = { - [0] = { - .name = "clntXX", - .mode = S_IFDIR | 0555, - }, -}; - static ssize_t dummy_downcall(struct file *filp, const char __user *src, size_t len) { @@ -1278,14 +1084,6 @@ rpc_dummy_info_show(struct seq_file *m, void *v) } DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); -static const struct rpc_filelist gssd_dummy_info_file[] = { - [0] = { - .name = "info", - .i_fop = &rpc_dummy_info_fops, - .mode = S_IFREG | 0400, - }, -}; - /** * rpc_gssd_dummy_populate - create a dummy gssd pipe * @root: root of the rpc_pipefs filesystem @@ -1294,72 +1092,32 @@ static const struct rpc_filelist gssd_dummy_info_file[] = { * Create a dummy set of directories and a pipe that gssd can hold open to * indicate that it is up and running. 
*/ -static struct dentry * +static int rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) { - int ret = 0; - struct dentry *gssd_dentry; - struct dentry *clnt_dentry = NULL; - struct dentry *pipe_dentry = NULL; - struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, - strlen(files[RPCAUTH_gssd].name)); - - /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &q); - if (!gssd_dentry) - return ERR_PTR(-ENOENT); - - ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); - if (ret) { - pipe_dentry = ERR_PTR(ret); - goto out; - } - - q.name = gssd_dummy_clnt_dir[0].name; - q.len = strlen(gssd_dummy_clnt_dir[0].name); - clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); - if (!clnt_dentry) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(-ENOENT); - goto out; - } + struct dentry *gssd_dentry, *clnt_dentry; + int err; - ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); - if (ret) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(ret); - goto out; - } + gssd_dentry = rpc_new_dir(root, "gssd", 0555); + if (IS_ERR(gssd_dentry)) + return -ENOENT; - pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); - if (IS_ERR(pipe_dentry)) { - __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - } -out: - dput(clnt_dentry); - dput(gssd_dentry); - return pipe_dentry; -} + clnt_dentry = rpc_new_dir(gssd_dentry, "clntXX", 0555); + if (IS_ERR(clnt_dentry)) + return -ENOENT; -static void -rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) -{ - struct dentry *clnt_dir = pipe_dentry->d_parent; - struct dentry *gssd_dir = clnt_dir->d_parent; - - dget(pipe_dentry); - __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); - __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); - dput(pipe_dentry); + 
err = rpc_new_file(clnt_dentry, "info", 0400, + &rpc_dummy_info_fops, NULL); + if (!err) + err = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + return err; } static int rpc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct dentry *root, *gssd_dentry; + struct dentry *root; struct net *net = sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1368,7 +1126,7 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &simple_dentry_operations; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0555); @@ -1378,11 +1136,9 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); - if (IS_ERR(gssd_dentry)) { - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); - return PTR_ERR(gssd_dentry); - } + err = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (err) + return err; dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", net->ns.inum, NET_NAME(net)); @@ -1391,18 +1147,6 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, sb); - if (err) - goto err_depopulate; - mutex_unlock(&sn->pipefs_sb_lock); - return 0; - -err_depopulate: - rpc_gssd_dummy_depopulate(gssd_dentry); - blocking_notifier_call_chain(&rpc_pipefs_notifier_list, - RPC_PIPEFS_UMOUNT, - sb); - sn->pipefs_sb = NULL; - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); mutex_unlock(&sn->pipefs_sb_lock); return err; } @@ -1459,7 +1203,7 @@ static void rpc_kill_sb(struct super_block *sb) sb); mutex_unlock(&sn->pipefs_sb_lock); out: - kill_litter_super(sb); + kill_anon_super(sb); put_net(net); } @@ 
-1490,7 +1234,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 5a8e6d46809a..53bcca365fb1 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -36,6 +36,7 @@ #include "netns.h" #define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock" +#define RPCBIND_SOCK_ABSTRACT_NAME "\0/run/rpcbind.sock" #define RPCBIND_PROGRAM (100000u) #define RPCBIND_PORT (111u) @@ -216,21 +217,22 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, sn->rpcb_users = 1; } +/* Evaluate to actual length of the `sockaddr_un' structure. */ +# define SUN_LEN(ptr) (offsetof(struct sockaddr_un, sun_path) \ + + 1 + strlen((ptr)->sun_path + 1)) + /* * Returns zero on success, otherwise a negative errno value * is returned. 
*/ -static int rpcb_create_local_unix(struct net *net) +static int rpcb_create_af_local(struct net *net, + const struct sockaddr_un *addr) { - static const struct sockaddr_un rpcb_localaddr_rpcbind = { - .sun_family = AF_LOCAL, - .sun_path = RPCBIND_SOCK_PATHNAME, - }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, - .address = (struct sockaddr *)&rpcb_localaddr_rpcbind, - .addrsize = sizeof(rpcb_localaddr_rpcbind), + .address = (struct sockaddr *)addr, + .addrsize = SUN_LEN(addr), .servername = "localhost", .program = &rpcb_program, .version = RPCBVERS_2, @@ -269,6 +271,26 @@ out: return result; } +static int rpcb_create_local_abstract(struct net *net) +{ + static const struct sockaddr_un rpcb_localaddr_abstract = { + .sun_family = AF_LOCAL, + .sun_path = RPCBIND_SOCK_ABSTRACT_NAME, + }; + + return rpcb_create_af_local(net, &rpcb_localaddr_abstract); +} + +static int rpcb_create_local_unix(struct net *net) +{ + static const struct sockaddr_un rpcb_localaddr_unix = { + .sun_family = AF_LOCAL, + .sun_path = RPCBIND_SOCK_PATHNAME, + }; + + return rpcb_create_af_local(net, &rpcb_localaddr_unix); +} + /* * Returns zero on success, otherwise a negative errno value * is returned. 
@@ -332,7 +354,8 @@ int rpcb_create_local(struct net *net) if (rpcb_get_local(net)) goto out; - if (rpcb_create_local_unix(net) != 0) + if (rpcb_create_local_abstract(net) != 0 && + rpcb_create_local_unix(net) != 0) result = rpcb_create_local_net(net); out: @@ -746,6 +769,10 @@ void rpcb_getport_async(struct rpc_task *task) child = rpcb_call_async(rpcb_clnt, map, proc); rpc_release_client(rpcb_clnt); + if (IS_ERR(child)) { + /* rpcb_map_release() has freed the arguments */ + return; + } xprt->stat.bind_count++; rpc_put_task(child); @@ -793,9 +820,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) } trace_rpcb_setport(child, map->r_status, map->r_port); - xprt->ops->set_port(xprt, map->r_port); - if (map->r_port) + if (map->r_port) { + xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); + } } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index be587a308e05..016f16ca5779 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -369,8 +369,10 @@ static void rpc_make_runnable(struct workqueue_struct *wq, if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); queue_work(wq, &task->u.tk_work); - } else + } else { + smp_mb__after_atomic(); wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); + } } /* @@ -817,7 +819,6 @@ rpc_init_task_statistics(struct rpc_task *task) /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; - task->tk_rebind_retry = 2; /* starting timestamp */ task->tk_start = ktime_get(); @@ -863,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); @@ -928,11 +927,10 @@ static void __rpc_execute(struct rpc_task *task) */ do_action = task->tk_action; /* Tasks with an RPC error status should exit */ - 
if (do_action != rpc_exit_task && + if (do_action && do_action != rpc_exit_task && (status = READ_ONCE(task->tk_rpc_status)) != 0) { task->tk_status = status; - if (do_action != NULL) - do_action = rpc_exit_task; + do_action = rpc_exit_task; } /* Callbacks override all actions */ if (task->tk_callback) { @@ -1076,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task) rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } -EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free RPC buffer resources allocated via rpc_malloc @@ -1097,7 +1094,6 @@ void rpc_free(struct rpc_task *task) else kfree(buf); } -EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1b2b84feeec6..d8d8842c7de5 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -27,135 +27,91 @@ struct xdr_skb_reader { struct sk_buff *skb; unsigned int offset; + bool need_checksum; size_t count; __wsum csum; }; -typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, - size_t len); - /** * xdr_skb_read_bits - copy some data bits from skb to internal buffer * @desc: sk_buff copy helper * @to: copy destination * @len: number of bytes to copy * - * Possibly called several times to iterate over an sk_buff and copy - * data out of it. + * Possibly called several times to iterate over an sk_buff and copy data out of + * it. 
*/ static size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { - if (len > desc->count) - len = desc->count; - if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} + len = min(len, desc->count); + + if (desc->need_checksum) { + __wsum csum; + + csum = skb_copy_and_csum_bits(desc->skb, desc->offset, to, len); + desc->csum = csum_block_add(desc->csum, csum, desc->offset); + } else { + if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) + return 0; + } -/** - * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer - * @desc: sk_buff copy helper - * @to: copy destination - * @len: number of bytes to copy - * - * Same as skb_read_bits, but calculate a checksum at the same time. - */ -static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len) -{ - unsigned int pos; - __wsum csum2; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len); - desc->csum = csum_block_add(desc->csum, csum2, pos); desc->count -= len; desc->offset += len; return len; } -/** - * xdr_partial_copy_from_skb - copy data out of an skb - * @xdr: target XDR buffer - * @base: starting offset - * @desc: sk_buff copy helper - * @copy_actor: virtual method for copying data - * - */ static ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) { - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - size_t ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (unlikely(pglen == 0)) - 
goto copy_tail; - if (unlikely(base >= pglen)) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_SHIFT; - base &= ~PAGE_MASK; - } - do { + struct page **ppage = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + unsigned int poff = xdr->page_base & ~PAGE_MASK; + unsigned int pglen = xdr->page_len; + ssize_t copied = 0; + size_t ret; + + if (xdr->head[0].iov_len == 0) + return 0; + + ret = xdr_skb_read_bits(desc, xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret != xdr->head[0].iov_len || !desc->count) + return ret; + copied += ret; + + while (pglen) { + unsigned int len = min(PAGE_SIZE - poff, pglen); char *kaddr; /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppage = alloc_page(GFP_NOWAIT); if (unlikely(*ppage == NULL)) { if (copied == 0) - copied = -ENOMEM; - goto out; + return -ENOMEM; + return copied; } } - len = PAGE_SIZE; kaddr = kmap_atomic(*ppage); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } + ret = xdr_skb_read_bits(desc, kaddr + poff, len); flush_dcache_page(*ppage); kunmap_atomic(kaddr); + copied += ret; if (ret != len || !desc->count) - goto out; + return copied; ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: + pglen -= len; + poff = 0; + } + + if (xdr->tail[0].iov_len) { + copied += xdr_skb_read_bits(desc, xdr->tail[0].iov_base, + xdr->tail[0].iov_len); + } + return copied; } @@ -169,17 +125,22 @@ out: */ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { - struct xdr_skb_reader 
desc; - - desc.skb = skb; - desc.offset = 0; - desc.count = skb->len - desc.offset; + struct xdr_skb_reader desc = { + .skb = skb, + .count = skb->len - desc.offset, + }; - if (skb_csum_unnecessary(skb)) - goto no_checksum; + if (skb_csum_unnecessary(skb)) { + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) + return -1; + if (desc.count) + return -1; + return 0; + } + desc.need_checksum = true; desc.csum = csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0) + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) return -1; if (desc.offset != skb->len) { __wsum csum2; @@ -194,14 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; } -EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek) diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 52908f9e6eab..383860cb1d5b 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -83,7 +83,8 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; - unsigned int i, j; + unsigned int i, j, k; + unsigned long count; seq_printf(seq, "net %u %u %u %u\n", @@ -104,8 +105,12 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); - for (j = 0; j < vers->vs_nproc; j++) - seq_printf(seq, " %u", vers->vs_count[j]); + for (j = 0; j < vers->vs_nproc; j++) { + count = 0; + for_each_possible_cpu(k) + count += per_cpu(vers->vs_count[j], k); + seq_printf(seq, " %lu", count); + } seq_putc(seq, '\n'); } } @@ -309,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct 
proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, proc_ops); + return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index d4a362c9e4b3..e3c6e3b63f0b 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -36,7 +36,11 @@ static inline int sock_is_loopback(struct sock *sk) return loopback; } +struct svc_serv; +struct svc_rqst; int rpc_clients_notifier_register(void); void rpc_clients_notifier_unregister(void); void auth_domain_cleanup(void); +void svc_sock_update_bufs(struct svc_serv *serv); +enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); #endif /* _NET_SUNRPC_SUNRPC_H */ diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 691c0000e9ea..bab6cab29405 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -148,6 +148,7 @@ cleanup_sunrpc(void) #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } +MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f06622814a95..4704dce7284e 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -32,6 +32,7 @@ #include <trace/events/sunrpc.h> #include "fail.h" +#include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCDSP @@ -72,57 +73,100 @@ static struct svc_pool_map svc_pool_map = { static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int -param_set_pool_mode(const char *val, const struct kernel_param *kp) +__param_set_pool_mode(const char *val, struct svc_pool_map *m) { - int *ip = (int *)kp->arg; - struct svc_pool_map *m = &svc_pool_map; - int err; + int err, mode; mutex_lock(&svc_pool_map_mutex); - err = -EBUSY; - if (m->count) - goto out; - err = 0; if 
(!strncmp(val, "auto", 4)) - *ip = SVC_POOL_AUTO; + mode = SVC_POOL_AUTO; else if (!strncmp(val, "global", 6)) - *ip = SVC_POOL_GLOBAL; + mode = SVC_POOL_GLOBAL; else if (!strncmp(val, "percpu", 6)) - *ip = SVC_POOL_PERCPU; + mode = SVC_POOL_PERCPU; else if (!strncmp(val, "pernode", 7)) - *ip = SVC_POOL_PERNODE; + mode = SVC_POOL_PERNODE; else err = -EINVAL; + if (err) + goto out; + + if (m->count == 0) + m->mode = mode; + else if (mode != m->mode) + err = -EBUSY; out: mutex_unlock(&svc_pool_map_mutex); return err; } static int -param_get_pool_mode(char *buf, const struct kernel_param *kp) +param_set_pool_mode(const char *val, const struct kernel_param *kp) +{ + struct svc_pool_map *m = kp->arg; + + return __param_set_pool_mode(val, m); +} + +int sunrpc_set_pool_mode(const char *val) +{ + return __param_set_pool_mode(val, &svc_pool_map); +} +EXPORT_SYMBOL(sunrpc_set_pool_mode); + +/** + * sunrpc_get_pool_mode - get the current pool_mode for the host + * @buf: where to write the current pool_mode + * @size: size of @buf + * + * Grab the current pool_mode from the svc_pool_map and write + * the resulting string to @buf. Returns the number of characters + * written to @buf (a'la snprintf()). 
+ */ +int +sunrpc_get_pool_mode(char *buf, size_t size) { - int *ip = (int *)kp->arg; + struct svc_pool_map *m = &svc_pool_map; - switch (*ip) + switch (m->mode) { case SVC_POOL_AUTO: - return strlcpy(buf, "auto\n", 20); + return snprintf(buf, size, "auto"); case SVC_POOL_GLOBAL: - return strlcpy(buf, "global\n", 20); + return snprintf(buf, size, "global"); case SVC_POOL_PERCPU: - return strlcpy(buf, "percpu\n", 20); + return snprintf(buf, size, "percpu"); case SVC_POOL_PERNODE: - return strlcpy(buf, "pernode\n", 20); + return snprintf(buf, size, "pernode"); default: - return sprintf(buf, "%d\n", *ip); + return snprintf(buf, size, "%d", m->mode); } } +EXPORT_SYMBOL(sunrpc_get_pool_mode); + +static int +param_get_pool_mode(char *buf, const struct kernel_param *kp) +{ + char str[16]; + int len; + + len = sunrpc_get_pool_mode(str, ARRAY_SIZE(str)); + + /* Ensure we have room for newline and NUL */ + len = min_t(int, len, ARRAY_SIZE(str) - 2); + + /* tack on the newline */ + str[len] = '\n'; + str[len + 1] = '\0'; + + return sysfs_emit(buf, "%s", str); +} module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode, - &svc_pool_map.mode, 0644); + &svc_pool_map, 0644); /* * Detect best pool mapping mode heuristically, @@ -250,10 +294,8 @@ svc_pool_map_get(void) int npools = -1; mutex_lock(&svc_pool_map_mutex); - if (m->count++) { mutex_unlock(&svc_pool_map_mutex); - WARN_ON_ONCE(m->npools <= 1); return m->npools; } @@ -275,32 +317,21 @@ svc_pool_map_get(void) m->mode = SVC_POOL_GLOBAL; } m->npools = npools; - - if (npools == 1) - /* service is unpooled, so doesn't hold a reference */ - m->count--; - mutex_unlock(&svc_pool_map_mutex); return npools; } /* - * Drop a reference to the global map of cpus to pools, if - * pools were in use, i.e. if npools > 1. + * Drop a reference to the global map of cpus to pools. 
* When the last reference is dropped, the map data is - * freed; this allows the sysadmin to change the pool - * mode using the pool_mode module option without - * rebooting or re-loading sunrpc.ko. + * freed; this allows the sysadmin to change the pool. */ static void -svc_pool_map_put(int npools) +svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; - if (npools <= 1) - return; mutex_lock(&svc_pool_map_mutex); - if (!--m->count) { kfree(m->to_pool); m->to_pool = NULL; @@ -308,7 +339,6 @@ svc_pool_map_put(int npools) m->pool_to = NULL; m->npools = 0; } - mutex_unlock(&svc_pool_map_mutex); } @@ -322,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -388,7 +418,7 @@ struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv) return &serv->sv_pools[pidx % serv->sv_nrpools]; } -int svc_rpcb_setup(struct svc_serv *serv, struct net *net) +static int svc_rpcb_setup(struct svc_serv *serv, struct net *net) { int err; @@ -400,21 +430,20 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); return 0; } -EXPORT_SYMBOL_GPL(svc_rpcb_setup); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) { svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { - struct svc_program *progp; - unsigned int i; + unsigned int p, i; + + for (p = 0; p < serv->sv_nprogs; p++) { + struct svc_program *progp = &serv->sv_programs[p]; - for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; @@ -438,9 +467,7 @@ EXPORT_SYMBOL_GPL(svc_bind); static void __svc_init_bc(struct svc_serv *serv) { - INIT_LIST_HEAD(&serv->sv_cb_list); - spin_lock_init(&serv->sv_cb_lock); - 
init_waitqueue_head(&serv->sv_cb_waitq); + lwq_init(&serv->sv_cb_list); } #else static void @@ -453,8 +480,8 @@ __svc_init_bc(struct svc_serv *serv) * Create an RPC service */ static struct svc_serv * -__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - int (*threadfn)(void *data)) +__svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, + unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -464,26 +491,27 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; serv->sv_name = prog->pg_name; - serv->sv_program = prog; - kref_init(&serv->sv_refcnt); - serv->sv_stats = prog->pg_stats; + serv->sv_programs = prog; + serv->sv_nprogs = nprogs; + serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); serv->sv_threadfn = threadfn; xdrsize = 0; - while (prog) { - prog->pg_lovers = prog->pg_nvers-1; - for (vers=0; vers<prog->pg_nvers ; vers++) - if (prog->pg_vers[vers]) { - prog->pg_hivers = vers; - if (prog->pg_lovers > vers) - prog->pg_lovers = vers; - if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) - xdrsize = prog->pg_vers[vers]->vs_xdrsize; + for (i = 0; i < nprogs; i++) { + struct svc_program *progp = &prog[i]; + + progp->pg_lovers = progp->pg_nvers-1; + for (vers = 0; vers < progp->pg_nvers ; vers++) + if (progp->pg_vers[vers]) { + progp->pg_hivers = vers; + if (progp->pg_lovers > vers) + progp->pg_lovers = vers; + if (progp->pg_vers[vers]->vs_xdrsize > xdrsize) + xdrsize = progp->pg_vers[vers]->vs_xdrsize; } - prog = prog->pg_next; } serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_tempsocks); @@ -509,9 +537,13 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, i, serv->sv_name); pool->sp_id = i; - 
INIT_LIST_HEAD(&pool->sp_sockets); + lwq_init(&pool->sp_xprts); INIT_LIST_HEAD(&pool->sp_all_threads); - spin_lock_init(&pool->sp_lock); + init_llist_head(&pool->sp_idle_threads); + + percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL); + percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); + percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); } return serv; @@ -528,31 +560,36 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, 1, threadfn); + return __svc_create(prog, 1, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads - * @prog: the RPC program the new service will handle + * @prog: Array of RPC programs the new service will handle + * @nprogs: Number of programs in the array + * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, + unsigned int nprogs, + struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, threadfn); + serv = __svc_create(prog, nprogs, stats, bufsize, npools, threadfn); if (!serv) goto out_err; + serv->sv_is_pooled = true; return serv; out_err: - svc_pool_map_put(npools); + svc_pool_map_put(); return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); @@ -562,57 +599,54 @@ EXPORT_SYMBOL_GPL(svc_create_pooled); * protect sv_permsocks and sv_tempsocks. 
*/ void -svc_destroy(struct kref *ref) +svc_destroy(struct svc_serv **servp) { - struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); + struct svc_serv *serv = *servp; + unsigned int i; - dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); + *servp = NULL; + + dprintk("svc: svc_destroy(%s)\n", serv->sv_programs->pg_name); timer_shutdown_sync(&serv->sv_temptimer); /* - * The last user is gone and thus all sockets have to be destroyed to - * the point. Check this. + * Remaining transports at this point are not expected. */ - BUG_ON(!list_empty(&serv->sv_permsocks)); - BUG_ON(!list_empty(&serv->sv_tempsocks)); + WARN_ONCE(!list_empty(&serv->sv_permsocks), + "SVC: permsocks remain for %s\n", serv->sv_programs->pg_name); + WARN_ONCE(!list_empty(&serv->sv_tempsocks), + "SVC: tempsocks remain for %s\n", serv->sv_programs->pg_name); cache_clean_deferred(serv); - svc_pool_map_put(serv->sv_nrpools); + if (serv->sv_is_pooled) + svc_pool_map_put(); + + for (i = 0; i < serv->sv_nrpools; i++) { + struct svc_pool *pool = &serv->sv_pools[i]; + percpu_counter_destroy(&pool->sp_messages_arrived); + percpu_counter_destroy(&pool->sp_sockets_queued); + percpu_counter_destroy(&pool->sp_threads_woken); + } kfree(serv->sv_pools); kfree(serv); } EXPORT_SYMBOL_GPL(svc_destroy); -/* - * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_pages. - */ -static int -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) +static bool +svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { - unsigned int pages, arghi; - - /* bc_xprt uses fore channel allocated buffers */ - if (svc_is_backchannel(rqstp)) - return 1; - - pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. 
- * We assume one is at most one page - */ - arghi = 0; - WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); - if (pages > RPCSVC_MAXPAGES) - pages = RPCSVC_MAXPAGES; - while (pages) { - struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); - if (!p) - break; - rqstp->rq_pages[arghi++] = p; - pages--; - } - return pages == 0; + rqstp->rq_maxpages = svc_serv_maxpages(serv); + + /* rq_pages' last entry is NULL for historical reasons. */ + rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_pages) + return false; + + return true; } /* @@ -621,15 +655,30 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) static void svc_release_buffer(struct svc_rqst *rqstp) { - unsigned int i; + unsigned long i; - for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++) + for (i = 0; i < rqstp->rq_maxpages; i++) if (rqstp->rq_pages[i]) put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); } -struct svc_rqst * -svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) +static void +svc_rqst_free(struct svc_rqst *rqstp) +{ + folio_batch_release(&rqstp->rq_fbatch); + kfree(rqstp->rq_bvec); + svc_release_buffer(rqstp); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); + kfree(rqstp->rq_resp); + kfree(rqstp->rq_argp); + kfree(rqstp->rq_auth_data); + kfree_rcu(rqstp, rq_rcu_head); +} + +static struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; @@ -637,12 +686,13 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return rqstp; - __set_bit(RQ_BUSY, &rqstp->rq_flags); + folio_batch_init(&rqstp->rq_fbatch); + rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = 
kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); @@ -653,91 +703,97 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp->rq_resp) goto out_enomem; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) + if (!svc_init_buffer(rqstp, serv, node)) goto out_enomem; - return rqstp; -out_enomem: - svc_rqst_free(rqstp); - return NULL; -} -EXPORT_SYMBOL_GPL(svc_rqst_alloc); - -static struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) -{ - struct svc_rqst *rqstp; + rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, + sizeof(struct bio_vec), + GFP_KERNEL, node); + if (!rqstp->rq_bvec) + goto out_enomem; - rqstp = svc_rqst_alloc(serv, pool, node); - if (!rqstp) - return ERR_PTR(-ENOMEM); + rqstp->rq_err = -EAGAIN; /* No error yet */ - svc_get(serv); - spin_lock_bh(&serv->sv_lock); serv->sv_nrthreads += 1; - spin_unlock_bh(&serv->sv_lock); + pool->sp_nrthreads += 1; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads++; + /* Protected by whatever lock the service uses when calling + * svc_set_num_threads() + */ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); + return rqstp; + +out_enomem: + svc_rqst_free(rqstp); + return NULL; } -/* - * Choose a pool in which to create a new thread, for svc_set_num_threads +/** + * svc_pool_wake_idle_thread - Awaken an idle thread in @pool + * @pool: service thread pool + * + * Can be called from soft IRQ or process context. Finding an idle + * service thread and marking it BUSY is atomic with respect to + * other calls to svc_pool_wake_idle_thread(). 
+ * */ -static inline struct svc_pool * -choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +void svc_pool_wake_idle_thread(struct svc_pool *pool) { - if (pool != NULL) - return pool; + struct svc_rqst *rqstp; + struct llist_node *ln; + + rcu_read_lock(); + ln = READ_ONCE(pool->sp_idle_threads.first); + if (ln) { + rqstp = llist_entry(ln, struct svc_rqst, rq_idle); + WRITE_ONCE(rqstp->rq_qtime, ktime_get()); + if (!task_is_running(rqstp->rq_task)) { + wake_up_process(rqstp->rq_task); + trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid); + percpu_counter_inc(&pool->sp_threads_woken); + } else { + trace_svc_pool_thread_running(pool, rqstp->rq_task->pid); + } + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + trace_svc_pool_thread_noidle(pool, 0); +} +EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); - return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +static struct svc_pool * +svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + return pool ? 
pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; } -/* - * Choose a thread to kill, for svc_set_num_threads - */ -static inline struct task_struct * -choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +static struct svc_pool * +svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool, + unsigned int *state) { + struct svc_pool *pool; unsigned int i; - struct task_struct *task = NULL; - if (pool != NULL) { - spin_lock_bh(&pool->sp_lock); - } else { - /* choose a pool in round-robin fashion */ + pool = target_pool; + + if (!pool) { for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_all_threads)) - goto found_pool; - spin_unlock_bh(&pool->sp_lock); + if (pool->sp_nrthreads) + break; } - return NULL; } -found_pool: - if (!list_empty(&pool->sp_all_threads)) { - struct svc_rqst *rqstp; - - /* - * Remove from the pool->sp_all_threads list - * so we don't try to kill it again. 
- */ - rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); - set_bit(RQ_VICTIM, &rqstp->rq_flags); - list_del_rcu(&rqstp->rq_all); - task = rqstp->rq_task; + if (pool && pool->sp_nrthreads) { + set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); + set_bit(SP_NEED_VICTIM, &pool->sp_flags); + return pool; } - spin_unlock_bh(&pool->sp_lock); - - return task; + return NULL; } -/* create new threads */ static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { @@ -746,16 +802,16 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) struct svc_pool *chosen_pool; unsigned int state = serv->sv_nrthreads-1; int node; + int err; do { nrservs--; - chosen_pool = choose_pool(serv, pool, &state); - + chosen_pool = svc_pool_next(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); - rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (IS_ERR(rqstp)) - return PTR_ERR(rqstp); + rqstp = svc_prepare_thread(serv, chosen_pool, node); + if (!rqstp) + return -ENOMEM; task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { @@ -769,47 +825,60 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) svc_sock_update_bufs(serv); wake_up_process(task); + + wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); + err = rqstp->rq_err; + if (err) { + svc_exit_thread(rqstp); + return err; + } } while (nrservs > 0); return 0; } -/* - * Create or destroy enough new threads to make the number - * of threads the given number. If `pool' is non-NULL, applies - * only to threads in that pool, otherwise round-robins between - * all pools. Caller must ensure that mutual exclusion between this and - * server startup or shutdown. 
- */ - -/* destroy old threads */ static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct task_struct *task; unsigned int state = serv->sv_nrthreads-1; + struct svc_pool *victim; - /* destroy old threads */ do { - task = choose_victim(serv, pool, &state); - if (task == NULL) + victim = svc_pool_victim(serv, pool, &state); + if (!victim) break; - kthread_stop(task); + svc_pool_wake_idle_thread(victim); + wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS, + TASK_IDLE); nrservs++; } while (nrservs < 0); return 0; } +/** + * svc_set_num_threads - adjust number of threads per RPC service + * @serv: RPC service to adjust + * @pool: Specific pool from which to choose threads, or NULL + * @nrservs: New number of threads for @serv (0 or less means kill all threads) + * + * Create or destroy threads to make the number of threads for @serv the + * given number. If @pool is non-NULL, change only threads in that pool; + * otherwise, round-robin between all pools for @serv. @serv's + * sv_nrthreads is adjusted for each thread created or destroyed. + * + * Caller must ensure mutual exclusion between this and server startup or + * shutdown. + * + * Returns zero on success or a negative errno if an error occurred while + * starting a thread. + */ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - if (pool == NULL) { + if (!pool) nrservs -= serv->sv_nrthreads; - } else { - spin_lock_bh(&pool->sp_lock); + else nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); - } if (nrservs > 0) return svc_start_kthreads(serv, pool, nrservs); @@ -826,57 +895,80 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); * * When replacing a page in rq_pages, batch the release of the * replaced pages to avoid hammering the page allocator. 
+ * + * Return values: + * %true: page replaced + * %false: array bounds checking failed */ -void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { + struct page **begin = rqstp->rq_pages; + struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; + + if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { + trace_svc_replace_page_err(rqstp); + return false; + } + if (*rqstp->rq_next_page) { - if (!pagevec_space(&rqstp->rq_pvec)) - __pagevec_release(&rqstp->rq_pvec); - pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + if (!folio_batch_add(&rqstp->rq_fbatch, + page_folio(*rqstp->rq_next_page))) + __folio_batch_release(&rqstp->rq_fbatch); } get_page(page); *(rqstp->rq_next_page++) = page; + return true; } EXPORT_SYMBOL_GPL(svc_rqst_replace_page); -/* - * Called from a server thread as it's exiting. Caller must hold the "service - * mutex" for the service. +/** + * svc_rqst_release_pages - Release Reply buffer pages + * @rqstp: RPC transaction context + * + * Release response pages that might still be in flight after + * svc_send, and any spliced filesystem-owned pages. */ -void -svc_rqst_free(struct svc_rqst *rqstp) +void svc_rqst_release_pages(struct svc_rqst *rqstp) { - svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); - kfree(rqstp->rq_resp); - kfree(rqstp->rq_argp); - kfree(rqstp->rq_auth_data); - kfree_rcu(rqstp, rq_rcu_head); + int i, count = rqstp->rq_next_page - rqstp->rq_respages; + + if (count) { + release_pages(rqstp->rq_respages, count); + for (i = 0; i < count; i++) + rqstp->rq_respages[i] = NULL; + } } -EXPORT_SYMBOL_GPL(svc_rqst_free); +/** + * svc_exit_thread - finalise the termination of a sunrpc server thread + * @rqstp: the svc_rqst which represents the thread. + * + * When a thread started with svc_new_thread() exits it must call + * svc_exit_thread() as its last act. 
This must be done with the + * service mutex held. Normally this is held by a DIFFERENT thread, the + * one that is calling svc_set_num_threads() and which will wait for + * SP_VICTIM_REMAINS to be cleared before dropping the mutex. If the + * thread exits for any reason other than svc_thread_should_stop() + * returning %true (which indicated that svc_set_num_threads() is + * waiting for it to exit), then it must take the service mutex itself, + * which can only safely be done using mutex_try_lock(). + */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct svc_pool *pool = rqstp->rq_pool; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads--; - if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags)) - list_del_rcu(&rqstp->rq_all); - spin_unlock_bh(&pool->sp_lock); + list_del_rcu(&rqstp->rq_all); - spin_lock_bh(&serv->sv_lock); + pool->sp_nrthreads -= 1; serv->sv_nrthreads -= 1; - spin_unlock_bh(&serv->sv_lock); svc_sock_update_bufs(serv); svc_rqst_free(rqstp); - svc_put(serv); + clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags); } EXPORT_SYMBOL_GPL(svc_exit_thread); @@ -1002,10 +1094,11 @@ static int __svc_register(struct net *net, const char *progname, #endif } - trace_svc_register(progname, version, protocol, port, family, error); + trace_svc_register(progname, version, family, protocol, port, error); return error; } +static int svc_rpcbind_set_version(struct net *net, const struct svc_program *progp, u32 version, int family, @@ -1016,7 +1109,6 @@ int svc_rpcbind_set_version(struct net *net, version, family, proto, port); } -EXPORT_SYMBOL_GPL(svc_rpcbind_set_version); int svc_generic_rpcbind_set(struct net *net, const struct svc_program *progp, @@ -1064,15 +1156,16 @@ int svc_register(const struct svc_serv *serv, struct net *net, const int family, const unsigned short proto, const unsigned short port) { - struct svc_program *progp; - unsigned int i; + unsigned int p, i; int error = 0; WARN_ON_ONCE(proto == 0 && 
port == 0); if (proto == 0 && port == 0) return -EINVAL; - for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (p = 0; p < serv->sv_nprogs; p++) { + struct svc_program *progp = &serv->sv_programs[p]; + for (i = 0; i < progp->pg_nvers; i++) { error = progp->pg_rpcbind_set(net, progp, i, @@ -1123,13 +1216,15 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi */ static void svc_unregister(const struct svc_serv *serv, struct net *net) { - struct svc_program *progp; + struct sighand_struct *sighand; unsigned long flags; - unsigned int i; + unsigned int p, i; clear_thread_flag(TIF_SIGPENDING); - for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (p = 0; p < serv->sv_nprogs; p++) { + struct svc_program *progp = &serv->sv_programs[p]; + for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; @@ -1139,9 +1234,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) } } - spin_lock_irqsave(&current->sighand->siglock, flags); + rcu_read_lock(); + sighand = rcu_dereference(current->sighand); + spin_lock_irqsave(&sighand->siglock, flags); recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); + spin_unlock_irqrestore(&sighand->siglock, flags); + rcu_read_unlock(); } /* @@ -1200,15 +1298,13 @@ svc_generic_init_request(struct svc_rqst *rqstp, if (rqstp->rq_proc >= versp->vs_nproc) goto err_bad_proc; rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc]; - if (!procp) - goto err_bad_proc; /* Initialize storage for argp and resp */ memset(rqstp->rq_argp, 0, procp->pc_argzero); memset(rqstp->rq_resp, 0, procp->pc_ressize); /* Bump per-procedure stats counter */ - versp->vs_count[rqstp->rq_proc]++; + this_cpu_inc(versp->vs_count[rqstp->rq_proc]); ret->dispatch = versp->vs_dispatch; return rpc_success; @@ -1225,51 +1321,45 @@ EXPORT_SYMBOL_GPL(svc_generic_init_request); * Common routine for processing the RPC request. 
*/ static int -svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) +svc_process_common(struct svc_rqst *rqstp) { - struct svc_program *progp; + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct svc_program *progp = NULL; const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; - __be32 *statp; - u32 prog, vers; - __be32 rpc_stat; - int auth_res, rc; - __be32 *reply_statp; + enum svc_auth_status auth_res; + unsigned int aoffset; + int pr, rc; + __be32 *p; - rpc_stat = rpc_success; + /* Reset the accept_stat for the RPC */ + rqstp->rq_accept_statp = NULL; - if (argv->iov_len < 6*4) - goto err_short_len; - - /* Will be turned off by GSS integrity and privacy services */ - set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); - svc_putu32(resv, rqstp->rq_xid); - - vers = svc_getnl(argv); + /* Construct the first words of the reply: */ + svcxdr_init_encode(rqstp); + xdr_stream_encode_be32(xdr, rqstp->rq_xid); + xdr_stream_encode_be32(xdr, rpc_reply); - /* First words of reply: */ - svc_putnl(resv, 1); /* REPLY */ - - if (vers != 2) /* RPC version number */ + p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 4); + if (unlikely(!p)) + goto err_short_len; + if (*p++ != cpu_to_be32(RPC_VERSION)) goto err_bad_rpc; - /* Save position in case we later decide to reject: */ - reply_statp = resv->iov_base + resv->iov_len; + xdr_stream_encode_be32(xdr, rpc_msg_accepted); - svc_putnl(resv, 0); /* ACCEPT */ + rqstp->rq_prog = be32_to_cpup(p++); + rqstp->rq_vers = be32_to_cpup(p++); + rqstp->rq_proc = be32_to_cpup(p); - rqstp->rq_prog = prog = svc_getnl(argv); /* program number */ - rqstp->rq_vers = svc_getnl(argv); /* version number */ - rqstp->rq_proc = svc_getnl(argv); /* procedure number */ - - for (progp = serv->sv_program; progp; progp = 
progp->pg_next) - if (prog == progp->pg_prog) - break; + for (pr = 0; pr < serv->sv_nprogs; pr++) + if (rqstp->rq_prog == serv->sv_programs[pr].pg_prog) + progp = &serv->sv_programs[pr]; /* * Decode auth data, and add verifier to reply buffer. @@ -1285,10 +1375,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) case SVC_OK: break; case SVC_GARBAGE: - goto err_garbage; - case SVC_SYSERR: - rpc_stat = rpc_system_err; - goto err_bad; + rqstp->rq_auth_stat = rpc_autherr_badcred; + goto err_bad_auth; case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: @@ -1297,13 +1385,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto dropit; case SVC_COMPLETE: goto sendit; + default: + pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); + rqstp->rq_auth_stat = rpc_autherr_failed; + goto err_bad_auth; } if (progp == NULL) goto err_bad_prog; - rpc_stat = progp->pg_init_request(rqstp, progp, &process); - switch (rpc_stat) { + switch (progp->pg_init_request(rqstp, progp, &process)) { case rpc_success: break; case rpc_prog_unavail: @@ -1320,12 +1411,11 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_bad_proc; /* Syntactic check complete */ - serv->sv_stats->rpccnt++; + if (serv->sv_stats) + serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); - /* Build the reply header. */ - statp = resv->iov_base +resv->iov_len; - svc_putnl(resv, RPC_SUCCESS); + aoffset = xdr_stream_pos(xdr); /* un-reserve some of the out-queue now that we have a * better idea of reply size @@ -1334,17 +1424,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); /* Call the function that processes the request. 
*/ - rc = process.dispatch(rqstp, statp); - if (procp->pc_release) - procp->pc_release(rqstp); + rc = process.dispatch(rqstp); + xdr_finish_decode(xdr); + if (!rc) goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) goto err_bad_auth; - /* Check RPC status result */ - if (*statp != rpc_success) - resv->iov_len = ((void*)statp) - resv->iov_base + 4; + if (*rqstp->rq_accept_statp != rpc_success) + xdr_truncate_encode(xdr, aoffset); if (procp->pc_encode == NULL) goto dropit; @@ -1368,71 +1457,89 @@ close_xprt: return 0; err_short_len: - svc_printk(rqstp, "short len %zd, dropping request\n", - argv->iov_len); + svc_printk(rqstp, "short len %u, dropping request\n", + rqstp->rq_arg.len); goto close_xprt; err_bad_rpc: - serv->sv_stats->rpcbadfmt++; - svc_putnl(resv, 1); /* REJECT */ - svc_putnl(resv, 0); /* RPC_MISMATCH */ - svc_putnl(resv, 2); /* Only RPCv2 supported */ - svc_putnl(resv, 2); - goto sendit; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; + xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); + xdr_stream_encode_u32(xdr, RPC_MISMATCH); + /* Only RPCv2 supported */ + xdr_stream_encode_u32(xdr, RPC_VERSION); + xdr_stream_encode_u32(xdr, RPC_VERSION); + return 1; /* don't wrap */ err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); - serv->sv_stats->rpcbadauth++; - /* Restore write pointer to location of accept status: */ - xdr_ressize_check(rqstp, reply_statp); - svc_putnl(resv, 1); /* REJECT */ - svc_putnl(resv, 1); /* AUTH_ERROR */ - svc_putu32(resv, rqstp->rq_auth_stat); /* status */ + if (serv->sv_stats) + serv->sv_stats->rpcbadauth++; + /* Restore write pointer to location of reply status: */ + xdr_truncate_encode(xdr, XDR_UNIT * 2); + xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); + xdr_stream_encode_u32(xdr, RPC_AUTH_ERROR); + xdr_stream_encode_be32(xdr, rqstp->rq_auth_stat); goto sendit; err_bad_prog: - dprintk("svc: unknown program %d\n", prog); - serv->sv_stats->rpcbadfmt++; - svc_putnl(resv, RPC_PROG_UNAVAIL); + 
dprintk("svc: unknown program %d\n", rqstp->rq_prog); + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; + *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; err_bad_vers: svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); - serv->sv_stats->rpcbadfmt++; - svc_putnl(resv, RPC_PROG_MISMATCH); - svc_putnl(resv, process.mismatch.lovers); - svc_putnl(resv, process.mismatch.hivers); + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; + *rqstp->rq_accept_statp = rpc_prog_mismatch; + + /* + * svc_authenticate() has already added the verifier and + * advanced the stream just past rq_accept_statp. + */ + xdr_stream_encode_u32(xdr, process.mismatch.lovers); + xdr_stream_encode_u32(xdr, process.mismatch.hivers); goto sendit; err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); - serv->sv_stats->rpcbadfmt++; - svc_putnl(resv, RPC_PROC_UNAVAIL); + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; + *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; +} + +/* + * Drop request + */ +static void svc_drop(struct svc_rqst *rqstp) +{ + trace_svc_drop(rqstp); +} -err_garbage: - svc_printk(rqstp, "failed to decode args\n"); +static void svc_release_rqst(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; - rpc_stat = rpc_garbage_args; -err_bad: - serv->sv_stats->rpcbadfmt++; - svc_putnl(resv, ntohl(rpc_stat)); - goto sendit; + if (procp && procp->pc_release) + procp->pc_release(rqstp); } -/* - * Process the RPC request. 
+/** + * svc_process - Execute one RPC transaction + * @rqstp: RPC transaction context + * */ -int -svc_process(struct svc_rqst *rqstp) +void svc_process(struct svc_rqst *rqstp) { - struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; - __be32 dir; + __be32 *p; #if IS_ENABLED(CONFIG_FAIL_SUNRPC) if (!fail_sunrpc.ignore_server_disconnect && @@ -1455,44 +1562,49 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; - dir = svc_getu32(argv); - if (dir != rpc_call) + svcxdr_init_decode(rqstp); + p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2); + if (unlikely(!p)) + goto out_drop; + rqstp->rq_xid = *p++; + if (unlikely(*p != rpc_call)) goto out_baddir; - if (!svc_process_common(rqstp, argv, resv)) + + if (!svc_process_common(rqstp)) { + svc_release_rqst(rqstp); goto out_drop; - return svc_send(rqstp); + } + svc_send(rqstp); + svc_release_rqst(rqstp); + return; out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", - be32_to_cpu(dir)); - rqstp->rq_server->sv_stats->rpcbadfmt++; + be32_to_cpu(*p)); + if (rqstp->rq_server->sv_stats) + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); - return 0; } -EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Process a backchannel RPC request that arrived over an existing - * outbound connection +/** + * svc_process_bc - process a reverse-direction RPC request + * @req: RPC request to be used for client-side processing + * @rqstp: server-side execution context + * */ -int -bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, - struct svc_rqst *rqstp) +void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; + struct rpc_timeout timeout = { + .to_increment = 0, + }; struct rpc_task *task; int proc_error; - int error; - - dprintk("svc: %s(%p)\n", __func__, 
req); /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; - rqstp->rq_server = serv; rqstp->rq_bc_net = req->rq_xprt->xprt_net; rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); @@ -1513,43 +1625,46 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len; - /* reset result send buffer "put" position */ - resv->iov_len = 0; + /* Reset the response buffer */ + rqstp->rq_res.head[0].iov_len = 0; /* - * Skip the next two words because they've already been - * processed in the transport + * Skip the XID and calldir fields because they've already + * been processed by the caller. */ - svc_getu32(argv); /* XID */ - svc_getnl(argv); /* CALLDIR */ + svcxdr_init_decode(rqstp); + if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2)) + return; /* Parse and execute the bc call */ - proc_error = svc_process_common(rqstp, argv, resv); + proc_error = svc_process_common(rqstp); atomic_dec(&req->rq_xprt->bc_slot_count); if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); - error = -EINVAL; - goto out; + svc_release_rqst(rqstp); + return; } /* Finally, send the reply synchronously */ - memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); - task = rpc_run_bc_task(req); - if (IS_ERR(task)) { - error = PTR_ERR(task); - goto out; + if (rqstp->bc_to_initval > 0) { + timeout.to_initval = rqstp->bc_to_initval; + timeout.to_retries = rqstp->bc_to_retries; + } else { + timeout.to_initval = req->rq_xprt->timeout->to_initval; + timeout.to_retries = req->rq_xprt->timeout->to_retries; } + timeout.to_maxval = timeout.to_initval; + memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); + task = rpc_run_bc_task(req, &timeout); + svc_release_rqst(rqstp); + + if (IS_ERR(task)) + return; WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); - error = task->tk_status; rpc_put_task(task); 
- -out: - dprintk("svc: %s(), error=%d\n", __func__, error); - return error; } -EXPORT_SYMBOL_GPL(bc_svc_process); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** @@ -1602,46 +1717,6 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** - * svc_fill_write_vector - Construct data argument for VFS write call - * @rqstp: svc_rqst to operate on - * @payload: xdr_buf containing only the write data payload - * - * Fills in rqstp::rq_vec, and returns the number of elements. - */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload) -{ - struct page **pages = payload->pages; - struct kvec *first = payload->head; - struct kvec *vec = rqstp->rq_vec; - size_t total = payload->len; - unsigned int i; - - /* Some types of transport can present the write payload - * entirely in rq_arg.pages. In this case, @first is empty. - */ - i = 0; - if (first->iov_len) { - vec[i].iov_base = first->iov_base; - vec[i].iov_len = min_t(size_t, total, first->iov_len); - total -= vec[i].iov_len; - ++i; - } - - while (total) { - vec[i].iov_base = page_address(*pages); - vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); - total -= vec[i].iov_len; - ++i; - ++pages; - } - - WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec)); - return i; -} -EXPORT_SYMBOL_GPL(svc_fill_write_vector); - -/** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c2ce12538008..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -9,7 +9,6 @@ #include <linux/sched/mm.h> #include <linux/errno.h> #include <linux/freezer.h> -#include <linux/kthread.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/sunrpc/addr.h> @@ -17,6 +16,7 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svcsock.h> #include 
<linux/sunrpc/xprt.h> +#include <linux/sunrpc/bc_xprt.h> #include <linux/module.h> #include <linux/netdevice.h> #include <trace/events/sunrpc.h> @@ -46,7 +46,6 @@ static LIST_HEAD(svc_xprt_class_list); /* SMP locking strategy: * - * svc_pool->sp_lock protects most of the fields of that pool. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * The "service mutex" protects svc_serv->sv_nrthread. @@ -74,13 +73,18 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ + +/** + * svc_reg_xprt_class - Register a server-side RPC transport class + * @xcl: New transport class to be registered + * + * Returns zero on success; otherwise a negative errno is returned. + */ int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; int res = -EEXIST; - dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); - INIT_LIST_HEAD(&xcl->xcl_list); spin_lock(&svc_xprt_class_lock); /* Make sure there isn't already a class with the same name */ @@ -96,9 +100,13 @@ out: } EXPORT_SYMBOL_GPL(svc_reg_xprt_class); +/** + * svc_unreg_xprt_class - Unregister a server-side RPC transport class + * @xcl: Transport class to be unregistered + * + */ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) { - dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); spin_lock(&svc_xprt_class_lock); list_del_init(&xcl->xcl_list); spin_unlock(&svc_xprt_class_lock); @@ -149,6 +157,7 @@ int svc_print_xprts(char *buf, int maxlen) */ void svc_xprt_deferred_close(struct svc_xprt *xprt) { + trace_svc_xprt_close(xprt); if (!test_and_set_bit(XPT_CLOSE, &xprt->xpt_flags)) svc_xprt_enqueue(xprt); } @@ -192,7 +201,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, kref_init(&xprt->xpt_ref); xprt->xpt_server = serv; INIT_LIST_HEAD(&xprt->xpt_list); - INIT_LIST_HEAD(&xprt->xpt_ready); 
INIT_LIST_HEAD(&xprt->xpt_deferred); INIT_LIST_HEAD(&xprt->xpt_users); mutex_init(&xprt->xpt_mutex); @@ -203,51 +211,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, } EXPORT_SYMBOL_GPL(svc_xprt_init); -static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, - struct svc_serv *serv, - struct net *net, - const int family, - const unsigned short port, - int flags) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_ANY), - .sin_port = htons(port), - }; -#if IS_ENABLED(CONFIG_IPV6) - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, - .sin6_port = htons(port), - }; -#endif - struct svc_xprt *xprt; - struct sockaddr *sap; - size_t len; - - switch (family) { - case PF_INET: - sap = (struct sockaddr *)&sin; - len = sizeof(sin); - break; -#if IS_ENABLED(CONFIG_IPV6) - case PF_INET6: - sap = (struct sockaddr *)&sin6; - len = sizeof(sin6); - break; -#endif - default: - return ERR_PTR(-EAFNOSUPPORT); - } - - xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); - if (IS_ERR(xprt)) - trace_svc_xprt_create_err(serv->sv_program->pg_name, - xcl->xcl_name, sap, len, xprt); - return xprt; -} - /** * svc_xprt_received - start next receiver thread * @xprt: controlling transport @@ -286,9 +249,8 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) } static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, - struct net *net, const int family, - const unsigned short port, int flags, - const struct cred *cred) + struct net *net, struct sockaddr *sap, + size_t len, int flags, const struct cred *cred) { struct svc_xprt_class *xcl; @@ -304,8 +266,11 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags); + newxprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); if (IS_ERR(newxprt)) { + 
trace_svc_xprt_create_err(serv->sv_programs->pg_name, + xcl->xcl_name, sap, len, + newxprt); module_put(xcl->xcl_owner); return PTR_ERR(newxprt); } @@ -322,6 +287,48 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, } /** + * svc_xprt_create_from_sa - Add a new listener to @serv from socket address + * @serv: target RPC service + * @xprt_name: transport class name + * @net: network namespace + * @sap: socket address pointer + * @flags: SVC_SOCK flags + * @cred: credential to bind to this transport + * + * Return local xprt port on success or %-EPROTONOSUPPORT on failure + */ +int svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name, + struct net *net, struct sockaddr *sap, + int flags, const struct cred *cred) +{ + size_t len; + int err; + + switch (sap->sa_family) { + case AF_INET: + len = sizeof(struct sockaddr_in); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + len = sizeof(struct sockaddr_in6); + break; +#endif + default: + return -EAFNOSUPPORT; + } + + err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, cred); + if (err == -EPROTONOSUPPORT) { + request_module("svc%s", xprt_name); + err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, + cred); + } + + return err; +} +EXPORT_SYMBOL_GPL(svc_xprt_create_from_sa); + +/** * svc_xprt_create - Add a new listener to @serv * @serv: target RPC service * @xprt_name: transport class name @@ -331,23 +338,41 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, * @flags: SVC_SOCK flags * @cred: credential to bind to this transport * - * Return values: - * %0: New listener added successfully - * %-EPROTONOSUPPORT: Requested transport type not supported + * Return local xprt port on success or %-EPROTONOSUPPORT on failure */ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred) { - int err; + struct sockaddr_in sin = { + 
.sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; +#endif + struct sockaddr *sap; - err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); - if (err == -EPROTONOSUPPORT) { - request_module("svc%s", xprt_name); - err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); + switch (family) { + case PF_INET: + sap = (struct sockaddr *)&sin; + break; +#if IS_ENABLED(CONFIG_IPV6) + case PF_INET6: + sap = (struct sockaddr *)&sin6; + break; +#endif + default: + return -EAFNOSUPPORT; } - return err; + + return svc_xprt_create_from_sa(serv, xprt_name, net, sap, flags, cred); } EXPORT_SYMBOL_GPL(svc_xprt_create); @@ -425,9 +450,10 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) smp_rmb(); xpt_flags = READ_ONCE(xprt->xpt_flags); + trace_svc_xprt_enqueue(xprt, xpt_flags); if (xpt_flags & BIT(XPT_BUSY)) return false; - if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE))) + if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE) | BIT(XPT_HANDSHAKE))) return true; if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) { if (xprt->xpt_ops->xpo_has_wspace(xprt) && @@ -447,7 +473,6 @@ static bool svc_xprt_ready(struct svc_xprt *xprt) void svc_xprt_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; - struct svc_rqst *rqstp = NULL; if (!svc_xprt_ready(xprt)) return; @@ -462,28 +487,11 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server); - atomic_long_inc(&pool->sp_stats.packets); - - spin_lock_bh(&pool->sp_lock); - list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); - pool->sp_stats.sockets_queued++; - spin_unlock_bh(&pool->sp_lock); + percpu_counter_inc(&pool->sp_sockets_queued); + xprt->xpt_qtime = ktime_get(); + lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts); - /* find a thread for this xprt */ - rcu_read_lock(); - 
list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - atomic_long_inc(&pool->sp_stats.threads_woken); - rqstp->rq_qtime = ktime_get(); - wake_up_process(rqstp->rq_task); - goto out_unlock; - } - set_bit(SP_CONGESTED, &pool->sp_flags); - rqstp = NULL; -out_unlock: - rcu_read_unlock(); - trace_svc_xprt_enqueue(xprt, rqstp); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -494,18 +502,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) { struct svc_xprt *xprt = NULL; - if (list_empty(&pool->sp_sockets)) - goto out; - - spin_lock_bh(&pool->sp_lock); - if (likely(!list_empty(&pool->sp_sockets))) { - xprt = list_first_entry(&pool->sp_sockets, - struct svc_xprt, xpt_ready); - list_del_init(&xprt->xpt_ready); + xprt = lwq_dequeue(&pool->sp_xprts, struct svc_xprt, xpt_ready); + if (xprt) svc_xprt_get(xprt); - } - spin_unlock_bh(&pool->sp_lock); -out: return xprt; } @@ -534,17 +533,26 @@ void svc_reserve(struct svc_rqst *rqstp, int space) } EXPORT_SYMBOL_GPL(svc_reserve); +static void free_deferred(struct svc_xprt *xprt, struct svc_deferred_req *dr) +{ + if (!dr) + return; + + xprt->xpt_ops->xpo_release_ctxt(xprt, dr->xprt_ctxt); + kfree(dr); +} + static void svc_xprt_release(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; - xprt->xpt_ops->xpo_release_rqst(rqstp); + xprt->xpt_ops->xpo_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; - kfree(rqstp->rq_deferred); + free_deferred(xprt, rqstp->rq_deferred); rqstp->rq_deferred = NULL; - pagevec_release(&rqstp->rq_pvec); - svc_free_res_pages(rqstp); + svc_rqst_release_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -565,7 +573,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) svc_xprt_put(xprt); } -/* +/** + * svc_wake_up - Wake up a service thread for non-transport work + * @serv: RPC service + * * Some svc_serv's will have occasional 
work to do, even when a xprt is not * waiting to be serviced. This function is there to "kick" a task in one of * those services so that it can wake up and do that work. Note that we only @@ -574,27 +585,10 @@ static void svc_xprt_release(struct svc_rqst *rqstp) */ void svc_wake_up(struct svc_serv *serv) { - struct svc_rqst *rqstp; - struct svc_pool *pool; - - pool = &serv->sv_pools[0]; - - rcu_read_lock(); - list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* skip any that aren't queued */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) - continue; - rcu_read_unlock(); - wake_up_process(rqstp->rq_task); - trace_svc_wake_up(rqstp->rq_task->pid); - return; - } - rcu_read_unlock(); + struct svc_pool *pool = &serv->sv_pools[0]; - /* No free entries available */ set_bit(SP_TASK_PENDING, &pool->sp_flags); - smp_wmb(); - trace_svc_wake_up(0); + svc_pool_wake_idle_thread(pool); } EXPORT_SYMBOL_GPL(svc_wake_up); @@ -613,7 +607,8 @@ int svc_port_is_privileged(struct sockaddr *sin) } /* - * Make sure that we don't have too many active connections. If we have, + * Make sure that we don't have too many connections that have not yet + * demonstrated that they have access to the NFS server. If we have, * something must be dropped. It's not clear what will happen if we allow * "too many" connections, but when dealing with network-facing software, * we have to code defensively. Here we do that by imposing hard limits. @@ -625,34 +620,26 @@ int svc_port_is_privileged(struct sockaddr *sin) * The only somewhat efficient mechanism would be if drop old * connections from the same IP first. But right now we don't even * record the client IP in svc_sock. - * - * single-threaded services that expect a lot of clients will probably - * need to set sv_maxconn to override the default value which is based - * on the number of threads */ static void svc_check_conn_limits(struct svc_serv *serv) { - unsigned int limit = serv->sv_maxconn ? 
serv->sv_maxconn : - (serv->sv_nrthreads+3) * 20; - - if (serv->sv_tmpcnt > limit) { - struct svc_xprt *xprt = NULL; + if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) { + struct svc_xprt *xprt = NULL, *xprti; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { - /* Try to help the admin */ - net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n", - serv->sv_name, serv->sv_maxconn ? - "max number of connections" : - "number of threads"); /* * Always select the oldest connection. It's not fair, - * but so is life + * but nor is life. */ - xprt = list_entry(serv->sv_tempsocks.prev, - struct svc_xprt, - xpt_list); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_get(xprt); + list_for_each_entry_reverse(xprti, &serv->sv_tempsocks, + xpt_list) { + if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) { + xprt = xprti; + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + break; + } + } } spin_unlock_bh(&serv->sv_lock); @@ -663,33 +650,22 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static int svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_alloc_arg(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled, ret; - pagevec_init(&rqstp->rq_pvec); - - pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; - if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", - pages, RPCSVC_MAXPAGES); - /* use as many pages as possible */ - pages = RPCSVC_MAXPAGES; - } - + pages = rqstp->rq_maxpages; for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; - set_current_state(TASK_INTERRUPTIBLE); - if (signalled() || kthread_should_stop()) { + set_current_state(TASK_IDLE); + if 
(svc_thread_should_stop(rqstp)) { set_current_state(TASK_RUNNING); - return -EINTR; + return false; } trace_svc_alloc_arg_err(pages, ret); memalloc_retry_wait(GFP_KERNEL); @@ -706,84 +682,66 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; - return 0; + + rqstp->rq_xid = xdr_zero; + return true; } static bool -rqst_should_sleep(struct svc_rqst *rqstp) +svc_thread_should_sleep(struct svc_rqst *rqstp) { struct svc_pool *pool = rqstp->rq_pool; /* did someone call svc_wake_up? */ - if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) + if (test_bit(SP_TASK_PENDING, &pool->sp_flags)) return false; /* was a socket queued? */ - if (!list_empty(&pool->sp_sockets)) + if (!lwq_empty(&pool->sp_xprts)) return false; /* are we shutting down? */ - if (signalled() || kthread_should_stop()) + if (svc_thread_should_stop(rqstp)) return false; - /* are we freezing? */ - if (freezing(current)) - return false; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (svc_is_backchannel(rqstp)) { + if (!lwq_empty(&rqstp->rq_server->sv_cb_list)) + return false; + } +#endif return true; } -static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static void svc_thread_wait_for_work(struct svc_rqst *rqstp) { - struct svc_pool *pool = rqstp->rq_pool; - long time_left = 0; - - /* rq_xprt should be clear on entry */ - WARN_ON_ONCE(rqstp->rq_xprt); - - rqstp->rq_xprt = svc_xprt_dequeue(pool); - if (rqstp->rq_xprt) - goto out_found; - - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... 
- */ - set_current_state(TASK_INTERRUPTIBLE); - smp_mb__before_atomic(); - clear_bit(SP_CONGESTED, &pool->sp_flags); - clear_bit(RQ_BUSY, &rqstp->rq_flags); - smp_mb__after_atomic(); - - if (likely(rqst_should_sleep(rqstp))) - time_left = schedule_timeout(timeout); - else + struct svc_pool *pool = rqstp->rq_pool; + + if (svc_thread_should_sleep(rqstp)) { + set_current_state(TASK_IDLE | TASK_FREEZABLE); + llist_add(&rqstp->rq_idle, &pool->sp_idle_threads); + if (likely(svc_thread_should_sleep(rqstp))) + schedule(); + + while (!llist_del_first_this(&pool->sp_idle_threads, + &rqstp->rq_idle)) { + /* Work just became available. This thread can only + * handle it after removing rqstp from the idle + * list. If that attempt failed, some other thread + * must have queued itself after finding no + * work to do, so that thread has taken responsibility + * for this new work. This thread can safely sleep + * until woken again. + */ + schedule(); + set_current_state(TASK_IDLE | TASK_FREEZABLE); + } __set_current_state(TASK_RUNNING); - + } else { + cond_resched(); + } try_to_freeze(); - - set_bit(RQ_BUSY, &rqstp->rq_flags); - smp_mb__after_atomic(); - rqstp->rq_xprt = svc_xprt_dequeue(pool); - if (rqstp->rq_xprt) - goto out_found; - - if (!time_left) - atomic_long_inc(&pool->sp_stats.threads_timedout); - - if (signalled() || kthread_should_stop()) - return ERR_PTR(-EINTR); - return ERR_PTR(-EAGAIN); -out_found: - /* Normally we will wait up to 5 seconds for any required - * cache information to be provided. 
- */ - if (!test_bit(SP_CONGESTED, &pool->sp_flags)) - rqstp->rq_chandle.thread_wait = 5*HZ; - else - rqstp->rq_chandle.thread_wait = 1*HZ; - trace_svc_xprt_dequeue(rqstp); - return rqstp->rq_xprt; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) @@ -802,7 +760,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt svc_xprt_received(newxpt); } -static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) +static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) { struct svc_serv *serv = rqstp->rq_server; int len = 0; @@ -831,100 +789,119 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) module_put(xprt->xpt_class->xcl_owner); } svc_xprt_received(xprt); + } else if (test_bit(XPT_HANDSHAKE, &xprt->xpt_flags)) { + xprt->xpt_ops->xpo_handshake(xprt); + svc_xprt_received(xprt); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ - dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, rqstp->rq_pool->sp_id, xprt, - kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + if (len <= 0) + goto out; + + trace_svc_xdr_recvfrom(&rqstp->rq_arg); + + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived); + rqstp->rq_stime = ktime_get(); + svc_process(rqstp); } else svc_xprt_received(xprt); out: - return len; + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); } -/* - * Receive the next request on any transport. 
This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. +static void svc_thread_wake_next(struct svc_rqst *rqstp) +{ + if (!svc_thread_should_sleep(rqstp)) + /* More work pending after I dequeued some, + * wake another worker + */ + svc_pool_wake_idle_thread(rqstp->rq_pool); +} + +/** + * svc_recv - Receive and process the next request on any transport + * @rqstp: an idle RPC service thread + * + * This code is carefully organised not to touch any cachelines in + * the shared svc_serv structure, only cachelines in the local + * svc_pool. */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +void svc_recv(struct svc_rqst *rqstp) { - struct svc_xprt *xprt = NULL; - struct svc_serv *serv = rqstp->rq_server; - int len, err; + struct svc_pool *pool = rqstp->rq_pool; - err = svc_alloc_arg(rqstp); - if (err) - goto out; + if (!svc_alloc_arg(rqstp)) + return; - try_to_freeze(); - cond_resched(); - err = -EINTR; - if (signalled() || kthread_should_stop()) - goto out; + svc_thread_wait_for_work(rqstp); - xprt = svc_get_next_xprt(rqstp, timeout); - if (IS_ERR(xprt)) { - err = PTR_ERR(xprt); - goto out; + clear_bit(SP_TASK_PENDING, &pool->sp_flags); + + if (svc_thread_should_stop(rqstp)) { + svc_thread_wake_next(rqstp); + return; } - len = svc_handle_xprt(rqstp, xprt); + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) { + struct svc_xprt *xprt = rqstp->rq_xprt; - /* No data, incomplete (TCP) read, or accept() */ - err = -EAGAIN; - if (len <= 0) - goto out_release; - trace_svc_xdr_recvfrom(&rqstp->rq_arg); + svc_thread_wake_next(rqstp); + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. When there are no + * idle threads, we reduce the wait time. 
+ */ + if (pool->sp_idle_threads.first) + rqstp->rq_chandle.thread_wait = 5 * HZ; + else + rqstp->rq_chandle.thread_wait = 1 * HZ; - clear_bit(XPT_OLD, &xprt->xpt_flags); + trace_svc_xprt_dequeue(rqstp); + svc_handle_xprt(rqstp, xprt); + } - xprt->xpt_ops->xpo_secure_port(rqstp); - rqstp->rq_chandle.defer = svc_defer; - rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (svc_is_backchannel(rqstp)) { + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; - if (serv->sv_stats) - serv->sv_stats->netcnt++; - return len; -out_release: - rqstp->rq_res.len = 0; - svc_xprt_release(rqstp); -out: - return err; + req = lwq_dequeue(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + if (req) { + svc_thread_wake_next(rqstp); + svc_process_bc(req, rqstp); + } + } +#endif } EXPORT_SYMBOL_GPL(svc_recv); -/* - * Drop request - */ -void svc_drop(struct svc_rqst *rqstp) -{ - trace_svc_drop(rqstp); - svc_xprt_release(rqstp); -} -EXPORT_SYMBOL_GPL(svc_drop); - -/* - * Return reply to client. 
+/** + * svc_send - Return reply to client + * @rqstp: RPC transaction context + * */ -int svc_send(struct svc_rqst *rqstp) +void svc_send(struct svc_rqst *rqstp) { struct svc_xprt *xprt; - int len = -EFAULT; struct xdr_buf *xb; + int status; xprt = rqstp->rq_xprt; - if (!xprt) - goto out; /* calculate over-all length */ xb = &rqstp->rq_res; @@ -934,15 +911,9 @@ int svc_send(struct svc_rqst *rqstp) trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); - len = xprt->xpt_ops->xpo_sendto(rqstp); + status = xprt->xpt_ops->xpo_sendto(rqstp); - trace_svc_send(rqstp, len); - svc_xprt_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - len = 0; -out: - return len; + trace_svc_send(rqstp, status); } /* @@ -951,7 +922,7 @@ out: */ static void svc_age_temp_xprts(struct timer_list *t) { - struct svc_serv *serv = from_timer(serv, t, sv_temptimer); + struct svc_serv *serv = timer_container_of(serv, t, sv_temptimer); struct svc_xprt *xprt; struct list_head *le, *next; @@ -1043,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. 
+ */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; @@ -1053,13 +1037,13 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); list_del_init(&xprt->xpt_list); - WARN_ON_ONCE(!list_empty(&xprt->xpt_ready)); - if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + if (test_bit(XPT_TEMP, &xprt->xpt_flags) && + !test_bit(XPT_PEER_VALID, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); while ((dr = svc_deferred_dequeue(xprt)) != NULL) - kfree(dr); + free_deferred(xprt, dr); call_xpt_users(xprt); svc_xprt_put(xprt); @@ -1104,36 +1088,26 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st return ret; } -static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net) +static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) { - struct svc_pool *pool; struct svc_xprt *xprt; - struct svc_xprt *tmp; int i; for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; - - spin_lock_bh(&pool->sp_lock); - list_for_each_entry_safe(xprt, tmp, &pool->sp_sockets, xpt_ready) { - if (xprt->xpt_net != net) - continue; - list_del_init(&xprt->xpt_ready); - spin_unlock_bh(&pool->sp_lock); - return xprt; + struct svc_pool *pool = &serv->sv_pools[i]; + struct llist_node *q, **t1, *t2; + + q = lwq_dequeue_all(&pool->sp_xprts); + lwq_for_each_safe(xprt, t1, t2, &q, xpt_ready) { + if (xprt->xpt_net == net) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_delete_xprt(xprt); + xprt = NULL; + } } - spin_unlock_bh(&pool->sp_lock); - } - return NULL; -} - -static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) -{ - struct 
svc_xprt *xprt; - while ((xprt = svc_dequeue_net(serv, net))) { - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_delete_xprt(xprt); + if (q) + lwq_enqueue_batch(q, &pool->sp_xprts); } } @@ -1141,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1153,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1163,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); @@ -1181,8 +1160,8 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { spin_unlock(&xprt->xpt_lock); trace_svc_defer_drop(dr); + free_deferred(xprt, dr); svc_xprt_put(xprt); - kfree(dr); return; } dr->xprt = NULL; @@ -1227,14 +1206,14 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - dr->xprt_ctxt = rqstp->rq_xprt_ctxt; - rqstp->rq_xprt_ctxt = NULL; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } + dr->xprt_ctxt = rqstp->rq_xprt_ctxt; + 
rqstp->rq_xprt_ctxt = NULL; trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; @@ -1267,6 +1246,8 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; + + dr->xprt_ctxt = NULL; svc_xprt_received(rqstp->rq_xprt); return dr->argslen << 2; } @@ -1291,6 +1272,40 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) } /** + * svc_find_listener - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @net: owner net pointer + * @sa: sockaddr containing address + * + * Return the transport instance pointer for the endpoint accepting + * connections/peer traffic from the specified transport class, + * and matching sockaddr. + */ +struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name, + struct net *net, const struct sockaddr *sa) +{ + struct svc_xprt *xprt; + struct svc_xprt *found = NULL; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + if (xprt->xpt_net != net) + continue; + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) + continue; + if (!rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) + continue; + found = xprt; + svc_xprt_get(xprt); + break; + } + spin_unlock_bh(&serv->sv_lock); + return found; +} +EXPORT_SYMBOL_GPL(svc_find_listener); + +/** * svc_find_xprt - find an RPC transport instance * @serv: pointer to svc_serv to search * @xcl_name: C string containing transport's class name @@ -1393,29 +1408,36 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen) } EXPORT_SYMBOL_GPL(svc_xprt_names); - /*----------------------------------------------------------------------------*/ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) { unsigned int pidx = (unsigned int)*pos; - struct svc_serv *serv = m->private; 
+ struct svc_info *si = m->private; dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + mutex_lock(si->mutex); + if (!pidx) return SEQ_START_TOKEN; - return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); + if (!si->serv) + return NULL; + return pidx > si->serv->sv_nrpools ? NULL + : &si->serv->sv_pools[pidx - 1]; } static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) { struct svc_pool *pool = p; - struct svc_serv *serv = m->private; + struct svc_info *si = m->private; + struct svc_serv *serv = si->serv; dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); - if (p == SEQ_START_TOKEN) { + if (!serv) { + pool = NULL; + } else if (p == SEQ_START_TOKEN) { pool = &serv->sv_pools[0]; } else { unsigned int pidx = (pool - &serv->sv_pools[0]); @@ -1430,6 +1452,9 @@ static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) static void svc_pool_stats_stop(struct seq_file *m, void *p) { + struct svc_info *si = m->private; + + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) @@ -1441,12 +1466,11 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) return 0; } - seq_printf(m, "%u %lu %lu %lu %lu\n", - pool->sp_id, - (unsigned long)atomic_long_read(&pool->sp_stats.packets), - pool->sp_stats.sockets_queued, - (unsigned long)atomic_long_read(&pool->sp_stats.threads_woken), - (unsigned long)atomic_long_read(&pool->sp_stats.threads_timedout)); + seq_printf(m, "%u %llu %llu %llu 0\n", + pool->sp_id, + percpu_counter_sum_positive(&pool->sp_messages_arrived), + percpu_counter_sum_positive(&pool->sp_sockets_queued), + percpu_counter_sum_positive(&pool->sp_threads_woken)); return 0; } @@ -1458,14 +1482,18 @@ static const struct seq_operations svc_pool_stats_seq_ops = { .show = svc_pool_stats_show, }; -int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +int svc_pool_stats_open(struct svc_info *info, struct file *file) { + struct seq_file *seq; int err; err = seq_open(file, 
&svc_pool_stats_seq_ops); - if (!err) - ((struct seq_file *) file->private_data)->private = serv; - return err; + if (err) + return err; + seq = file->private_data; + seq->private = info; + + return 0; } EXPORT_SYMBOL(svc_pool_stats_open); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index e72ba2f13f6c..55b4d2874188 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -18,6 +18,7 @@ #include <linux/sunrpc/svcauth.h> #include <linux/err.h> #include <linux/hash.h> +#include <linux/user_namespace.h> #include <trace/events/sunrpc.h> @@ -60,17 +61,31 @@ svc_put_auth_ops(struct auth_ops *aops) module_put(aops->owner); } -int -svc_authenticate(struct svc_rqst *rqstp) +/** + * svc_authenticate - Initialize an outgoing credential + * @rqstp: RPC execution context + * + * Return values: + * %SVC_OK: XDR encoding of the result can begin + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or verifier + * %SVC_COMPLETE: GSS context lifetime event; no further action + * %SVC_DROP: Drop this request; no further action + * %SVC_CLOSE: Like drop, but also close transport connection + */ +enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp) { - rpc_authflavor_t flavor; - struct auth_ops *aops; + struct auth_ops *aops; + u32 flavor; rqstp->rq_auth_stat = rpc_auth_ok; - flavor = svc_getnl(&rqstp->rq_arg.head[0]); - - dprintk("svc: svc_authenticate (%d)\n", flavor); + /* + * Decode the Call credential's flavor field. The credential's + * body field is decoded in the chosen ->accept method below. 
+ */ + if (xdr_stream_decode_u32(&rqstp->rq_arg_stream, &flavor) < 0) + return SVC_GARBAGE; aops = svc_get_auth_ops(flavor); if (aops == NULL) { @@ -84,18 +99,29 @@ svc_authenticate(struct svc_rqst *rqstp) rqstp->rq_authop = aops; return aops->accept(rqstp); } -EXPORT_SYMBOL_GPL(svc_authenticate); -int svc_set_client(struct svc_rqst *rqstp) +/** + * svc_set_client - Assign an appropriate 'auth_domain' as the client + * @rqstp: RPC execution context + * + * Return values: + * %SVC_OK: Client was found and assigned + * %SVC_DENIED: Client was explicitly denied + * %SVC_DROP: Ignore this request + * %SVC_CLOSE: Ignore this request and close the connection + */ +enum svc_auth_status svc_set_client(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); -/* A request, which was authenticated, has now executed. - * Time to finalise the credentials and verifier - * and release and resources +/** + * svc_authorise - Finalize credentials/verifier and release resources + * @rqstp: RPC execution context + * + * Returns zero on success, or a negative errno. */ int svc_authorise(struct svc_rqst *rqstp) { @@ -134,6 +160,49 @@ svc_auth_unregister(rpc_authflavor_t flavor) } EXPORT_SYMBOL_GPL(svc_auth_unregister); +/** + * svc_auth_flavor - return RPC transaction's RPC_AUTH flavor + * @rqstp: RPC transaction context + * + * Returns an RPC flavor or GSS pseudoflavor. + */ +rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + + if (!aops->pseudoflavor) + return aops->flavour; + return aops->pseudoflavor(rqstp); +} +EXPORT_SYMBOL_GPL(svc_auth_flavor); + +/** + * svcauth_map_clnt_to_svc_cred_local - maps a generic cred + * to a svc_cred suitable for use in nfsd. 
+ * @clnt: rpc_clnt associated with nfs client + * @cred: generic cred associated with nfs client + * @svc: returned svc_cred that is suitable for use in nfsd + */ +void svcauth_map_clnt_to_svc_cred_local(struct rpc_clnt *clnt, + const struct cred *cred, + struct svc_cred *svc) +{ + struct user_namespace *userns = clnt->cl_cred ? + clnt->cl_cred->user_ns : &init_user_ns; + + memset(svc, 0, sizeof(struct svc_cred)); + + svc->cr_uid = KUIDT_INIT(from_kuid_munged(userns, cred->fsuid)); + svc->cr_gid = KGIDT_INIT(from_kgid_munged(userns, cred->fsgid)); + svc->cr_flavor = clnt->cl_auth->au_flavor; + if (cred->group_info) + svc->cr_group_info = get_group_info(cred->group_info); + /* These aren't relevant for local (network is bypassed) */ + svc->cr_principal = NULL; + svc->cr_gss_mech = NULL; +} +EXPORT_SYMBOL_GPL(svcauth_map_clnt_to_svc_cred_local); + /************************************************** * 'auth_domains' are stored in a hash table indexed by name. * When the last reference to an 'auth_domain' is dropped, diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index b1efc34db6ed..8ca98b146ec8 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -17,8 +17,9 @@ #include <net/ipv6.h> #include <linux/kernel.h> #include <linux/user_namespace.h> -#define RPCDBG_FACILITY RPCDBG_AUTH +#include <trace/events/sunrpc.h> +#define RPCDBG_FACILITY RPCDBG_AUTH #include "netns.h" @@ -225,9 +226,9 @@ static int ip_map_parse(struct cache_detail *cd, return -EINVAL; } - expiry = get_expiry(&mesg); - if (expiry ==0) - return -EINVAL; + err = get_expiry(&mesg, &expiry); + if (err) + return err; /* domainname, or empty for NEGATIVE */ len = qword_get(&mesg, buf, mlen); @@ -416,14 +417,23 @@ static int unix_gid_hash(kuid_t uid) return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS); } -static void unix_gid_put(struct kref *kref) +static void unix_gid_free(struct rcu_head *rcu) { - struct cache_head *item = container_of(kref, struct 
cache_head, ref); - struct unix_gid *ug = container_of(item, struct unix_gid, h); + struct unix_gid *ug = container_of(rcu, struct unix_gid, rcu); + struct cache_head *item = &ug->h; + if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) put_group_info(ug->gi); - kfree_rcu(ug, rcu); + kfree(ug); +} + +static void unix_gid_put(struct kref *kref) +{ + struct cache_head *item = container_of(kref, struct cache_head, ref); + struct unix_gid *ug = container_of(item, struct unix_gid, h); + + call_rcu(&ug->rcu, unix_gid_free); } static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) @@ -497,9 +507,9 @@ static int unix_gid_parse(struct cache_detail *cd, uid = make_kuid(current_user_ns(), id); ug.uid = uid; - expiry = get_expiry(&mesg); - if (expiry == 0) - return -EINVAL; + err = get_expiry(&mesg, &expiry); + if (err) + return err; rv = get_int(&mesg, &gids); if (rv || gids < 0 || gids > 8192) @@ -655,7 +665,7 @@ static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp) } } -int +enum svc_auth_status svcauth_unix_set_client(struct svc_rqst *rqstp) { struct sockaddr_in *sin; @@ -687,7 +697,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) rqstp->rq_auth_stat = rpc_autherr_badcred; ipm = ip_map_cached_get(xprt); if (ipm == NULL) - ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, + ipm = __ip_map_lookup(sn->ip_map_cache, + rqstp->rq_server->sv_programs->pg_class, &sin6->sin6_addr); if (ipm == NULL) @@ -726,26 +737,40 @@ out: rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } - EXPORT_SYMBOL_GPL(svcauth_unix_set_client); -static int +/** + * svcauth_null_accept - Decode and validate incoming RPC_AUTH_NULL credential + * @rqstp: RPC transaction + * + * Return values: + * %SVC_OK: Both credential and verifier are valid + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or verifier + * %SVC_CLOSE: Temporary failure + * + * 
rqstp->rq_auth_stat is set as mandated by RFC 5531. + */ +static enum svc_auth_status svcauth_null_accept(struct svc_rqst *rqstp) { - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct svc_cred *cred = &rqstp->rq_cred; + u32 flavor, len; + void *body; - if (argv->iov_len < 3*4) + /* Length of Call's credential body field: */ + if (xdr_stream_decode_u32(xdr, &len) < 0) return SVC_GARBAGE; - - if (svc_getu32(argv) != 0) { - dprintk("svc: bad null cred\n"); + if (len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } - if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { - dprintk("svc: bad null verf\n"); + + /* Call's verf field: */ + if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) + return SVC_GARBAGE; + if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -757,9 +782,11 @@ svcauth_null_accept(struct svc_rqst *rqstp) if (cred->cr_group_info == NULL) return SVC_CLOSE; /* kmalloc failure - client must retry */ - /* Put NULL verifier */ - svc_putnl(resv, RPC_AUTH_NULL); - svc_putnl(resv, 0); + if (xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, + RPC_AUTH_NULL, NULL, 0) < 0) + return SVC_CLOSE; + if (!svcxdr_set_accept_stat(rqstp)) + return SVC_CLOSE; rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL; return SVC_OK; @@ -783,31 +810,46 @@ struct auth_ops svcauth_null = { .name = "null", .owner = THIS_MODULE, .flavour = RPC_AUTH_NULL, - .accept = svcauth_null_accept, + .accept = svcauth_null_accept, .release = svcauth_null_release, .set_client = svcauth_unix_set_client, }; -static int +/** + * svcauth_tls_accept - Decode and validate incoming RPC_AUTH_TLS credential + * @rqstp: RPC transaction + * + * Return values: + * %SVC_OK: Both credential and verifier are valid + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or 
verifier + * %SVC_CLOSE: Temporary failure + * + * rqstp->rq_auth_stat is set as mandated by RFC 5531. + */ +static enum svc_auth_status svcauth_tls_accept(struct svc_rqst *rqstp) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct svc_cred *cred = &rqstp->rq_cred; - struct kvec *argv = rqstp->rq_arg.head; - struct kvec *resv = rqstp->rq_res.head; + struct svc_xprt *xprt = rqstp->rq_xprt; + u32 flavor, len; + void *body; + __be32 *p; - if (argv->iov_len < XDR_UNIT * 3) + /* Length of Call's credential body field: */ + if (xdr_stream_decode_u32(xdr, &len) < 0) return SVC_GARBAGE; - - /* Call's cred length */ - if (svc_getu32(argv) != xdr_zero) { + if (len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } - /* Call's verifier flavor and its length */ - if (svc_getu32(argv) != rpc_auth_null || - svc_getu32(argv) != xdr_zero) { + /* Call's verf field: */ + if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) + return SVC_GARBAGE; + if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -818,21 +860,32 @@ svcauth_tls_accept(struct svc_rqst *rqstp) return SVC_DENIED; } - /* Mapping to nobody uid/gid is required */ + /* Signal that mapping to nobody uid/gid is required */ cred->cr_uid = INVALID_UID; cred->cr_gid = INVALID_GID; cred->cr_group_info = groups_alloc(0); if (cred->cr_group_info == NULL) - return SVC_CLOSE; /* kmalloc failure - client must retry */ + return SVC_CLOSE; - /* Reply's verifier */ - svc_putnl(resv, RPC_AUTH_NULL); - if (rqstp->rq_xprt->xpt_ops->xpo_start_tls) { - svc_putnl(resv, 8); - memcpy(resv->iov_base + resv->iov_len, "STARTTLS", 8); - resv->iov_len += 8; - } else - svc_putnl(resv, 0); + if (xprt->xpt_ops->xpo_handshake) { + p = xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2 + 8); + if (!p) + return SVC_CLOSE; + trace_svc_tls_start(xprt); + *p++ = rpc_auth_null; + *p++ = cpu_to_be32(8); + memcpy(p, "STARTTLS", 8); + + set_bit(XPT_HANDSHAKE, 
&xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } else { + trace_svc_tls_unavailable(xprt); + if (xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, + RPC_AUTH_NULL, NULL, 0) < 0) + return SVC_CLOSE; + } + if (!svcxdr_set_accept_stat(rqstp)) + return SVC_CLOSE; rqstp->rq_cred.cr_flavor = RPC_AUTH_TLS; return SVC_OK; @@ -842,32 +895,48 @@ struct auth_ops svcauth_tls = { .name = "tls", .owner = THIS_MODULE, .flavour = RPC_AUTH_TLS, - .accept = svcauth_tls_accept, + .accept = svcauth_tls_accept, .release = svcauth_null_release, .set_client = svcauth_unix_set_client, }; -static int +/** + * svcauth_unix_accept - Decode and validate incoming RPC_AUTH_SYS credential + * @rqstp: RPC transaction + * + * Return values: + * %SVC_OK: Both credential and verifier are valid + * %SVC_DENIED: Credential or verifier is not valid + * %SVC_GARBAGE: Failed to decode credential or verifier + * %SVC_CLOSE: Temporary failure + * + * rqstp->rq_auth_stat is set as mandated by RFC 5531. + */ +static enum svc_auth_status svcauth_unix_accept(struct svc_rqst *rqstp) { - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct svc_cred *cred = &rqstp->rq_cred; struct user_namespace *userns; - u32 slen, i; - int len = argv->iov_len; + u32 flavor, len, i; + void *body; + __be32 *p; - if ((len -= 3*4) < 0) + /* + * This implementation ignores the length of the Call's + * credential body field and the timestamp and machinename + * fields. 
+ */ + p = xdr_inline_decode(xdr, XDR_UNIT * 3); + if (!p) + return SVC_GARBAGE; + len = be32_to_cpup(p + 2); + if (len > RPC_MAX_MACHINENAME) + return SVC_GARBAGE; + if (!xdr_inline_decode(xdr, len)) return SVC_GARBAGE; - svc_getu32(argv); /* length */ - svc_getu32(argv); /* time stamp */ - slen = XDR_QUADLEN(svc_getnl(argv)); /* machname length */ - if (slen > 64 || (len -= (slen + 3)*4) < 0) - goto badcred; - argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */ - argv->iov_len -= slen*4; /* * Note: we skip uid_valid()/gid_valid() checks here for * backwards compatibility with clients that use -1 id's. @@ -877,27 +946,42 @@ svcauth_unix_accept(struct svc_rqst *rqstp) */ userns = (rqstp->rq_xprt && rqstp->rq_xprt->xpt_cred) ? rqstp->rq_xprt->xpt_cred->user_ns : &init_user_ns; - cred->cr_uid = make_kuid(userns, svc_getnl(argv)); /* uid */ - cred->cr_gid = make_kgid(userns, svc_getnl(argv)); /* gid */ - slen = svc_getnl(argv); /* gids length */ - if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0) + if (xdr_stream_decode_u32(xdr, &i) < 0) + return SVC_GARBAGE; + cred->cr_uid = make_kuid(userns, i); + if (xdr_stream_decode_u32(xdr, &i) < 0) + return SVC_GARBAGE; + cred->cr_gid = make_kgid(userns, i); + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return SVC_GARBAGE; + if (len > UNX_NGROUPS) goto badcred; - cred->cr_group_info = groups_alloc(slen); + p = xdr_inline_decode(xdr, XDR_UNIT * len); + if (!p) + return SVC_GARBAGE; + cred->cr_group_info = groups_alloc(len); if (cred->cr_group_info == NULL) return SVC_CLOSE; - for (i = 0; i < slen; i++) { - kgid_t kgid = make_kgid(userns, svc_getnl(argv)); + for (i = 0; i < len; i++) { + kgid_t kgid = make_kgid(userns, be32_to_cpup(p++)); cred->cr_group_info->gid[i] = kgid; } groups_sort(cred->cr_group_info); - if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { + + /* Call's verf field: */ + if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) + return SVC_GARBAGE; 
+ if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } - /* Put NULL verifier */ - svc_putnl(resv, RPC_AUTH_NULL); - svc_putnl(resv, 0); + if (xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, + RPC_AUTH_NULL, NULL, 0) < 0) + return SVC_CLOSE; + if (!svcxdr_set_accept_stat(rqstp)) + return SVC_CLOSE; rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX; return SVC_OK; @@ -927,7 +1011,7 @@ struct auth_ops svcauth_unix = { .name = "unix", .owner = THIS_MODULE, .flavour = RPC_AUTH_UNIX, - .accept = svcauth_unix_accept, + .accept = svcauth_unix_accept, .release = svcauth_unix_release, .domain_release = svcauth_unix_domain_release, .set_client = svcauth_unix_set_client, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 815baf308236..d61cd9b40491 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -36,6 +36,8 @@ #include <linux/skbuff.h> #include <linux/file.h> #include <linux/freezer.h> +#include <linux/bvec.h> + #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -43,9 +45,12 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/tcp_states.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/uaccess.h> #include <linux/highmem.h> #include <asm/ioctls.h> +#include <linux/key.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/clnt.h> @@ -55,6 +60,7 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" @@ -62,6 +68,23 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + +/* To-do: to avoid tying up an nfsd thread while waiting for a + * handshake request, the request could instead be deferred. 
+ */ +enum { + SVC_HANDSHAKE_TO = 5U * HZ +}; static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); @@ -111,27 +134,27 @@ static void svc_reclassify_socket(struct socket *sock) #endif /** - * svc_tcp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_tcp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_tcp_release_rqst(struct svc_rqst *rqstp) +static void svc_tcp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { } /** - * svc_udp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_udp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_udp_release_rqst(struct svc_rqst *rqstp) +static void svc_udp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; + struct sk_buff *skb = ctxt; - if (skb) { - rqstp->rq_xprt_ctxt = NULL; + if (skb) consume_skb(skb); - } } union svc_pktinfo_u { @@ -215,6 +238,80 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +static int +svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, + struct cmsghdr *cmsg, int ret) +{ + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? 
+ -ENOTCONN : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; + } + return ret; +} + +static int +svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags) +{ + union { + struct cmsghdr cmsg; + u8 buf[CMSG_SPACE(sizeof(u8))]; + } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; + int ret; + + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN); + } + return ret; +} + +static int +svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg) +{ + int ret; + struct socket *sock = svsk->sk_sock; + + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags); + } + return ret; +} + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) { @@ -252,11 +349,8 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { - bvec[i].bv_page = rqstp->rq_pages[i]; - bvec[i].bv_len = PAGE_SIZE; - bvec[i].bv_offset = 0; - } + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) + bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); rqstp->rq_respages = &rqstp->rq_pages[i]; rqstp->rq_next_page = rqstp->rq_respages + 1; @@ -265,7 +359,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; } - len = 
sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len > 0) svc_flush_bvec(bvec, len, seek); @@ -310,11 +404,15 @@ static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); svsk->sk_odata(sk); trace_svcsock_data_ready(&svsk->sk_xprt, 0); + if (test_bit(XPT_HANDSHAKE, &svsk->sk_xprt.xpt_flags)) + return; if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); } @@ -352,6 +450,88 @@ static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) sock_no_linger(svsk->sk_sock->sk); } +/** + * svc_tcp_handshake_done - Handshake completion handler + * @data: address of xprt to wake + * @status: status of handshake + * @peerid: serial number of key containing the remote peer's identity + * + * If a security policy is specified as an export option, we don't + * have a specific export here to check. So we set a "TLS session + * is present" flag on the xprt and let an upper layer enforce local + * security policy. 
+ */ +static void svc_tcp_handshake_done(void *data, int status, key_serial_t peerid) +{ + struct svc_xprt *xprt = data; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + if (!status) { + if (peerid != TLS_NO_PEERID) + set_bit(XPT_PEER_AUTH, &xprt->xpt_flags); + set_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + } + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + complete_all(&svsk->sk_handshake_done); +} + +/** + * svc_tcp_handshake - Perform a transport-layer security handshake + * @xprt: connected transport endpoint + * + */ +static void svc_tcp_handshake(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sock->sk; + struct tls_handshake_args args = { + .ta_sock = svsk->sk_sock, + .ta_done = svc_tcp_handshake_done, + .ta_data = xprt, + }; + int ret; + + trace_svc_tls_upcall(xprt); + + clear_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + init_completion(&svsk->sk_handshake_done); + + ret = tls_server_hello_x509(&args, GFP_KERNEL); + if (ret) { + trace_svc_tls_not_started(xprt); + goto out_failed; + } + + ret = wait_for_completion_interruptible_timeout(&svsk->sk_handshake_done, + SVC_HANDSHAKE_TO); + if (ret <= 0) { + if (tls_handshake_cancel(sk)) { + trace_svc_tls_timed_out(xprt); + goto out_close; + } + } + + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) { + trace_svc_tls_unavailable(xprt); + goto out_close; + } + + /* Mark the transport ready in case the remote sent RPC + * traffic before the kernel received the handshake + * completion downcall. 
+ */ + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + return; + +out_close: + set_bit(XPT_CLOSE, &xprt->xpt_flags); +out_failed: + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -508,6 +688,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->netudpcnt++; + svc_sock_secure_port(rqstp); svc_xprt_received(rqstp->rq_xprt); return len; @@ -554,12 +735,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) .msg_name = &rqstp->rq_addr, .msg_namelen = rqstp->rq_addrlen, .msg_control = cmh, + .msg_flags = MSG_SPLICE_PAGES, .msg_controllen = sizeof(buffer), }; - unsigned int sent; + unsigned int count; int err; - svc_udp_release_rqst(rqstp); + svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; svc_set_cmsg_data(rqstp, cmh); @@ -568,22 +751,22 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (err < 0) - goto out_unlock; + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. 
*/ - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); } - xdr_free_bvec(xdr); + trace_svcsock_udp_send(xprt, err); -out_unlock: + mutex_unlock(&xprt->xpt_mutex); - if (err < 0) - return err; - return sent; + return err; out_notconn: mutex_unlock(&xprt->xpt_mutex); @@ -631,12 +814,11 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_udp_release_rqst, + .xpo_release_ctxt = svc_udp_release_ctxt, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, - .xpo_secure_port = svc_sock_secure_port, .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt, }; @@ -665,6 +847,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -687,11 +870,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } + trace_sk_data_ready(sk); /* * This callback may called twice when a new connection @@ -701,13 +880,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. 
*/ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } } @@ -748,15 +932,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { - if (err == -ENOMEM) - printk(KERN_WARNING "%s: no more sockets!\n", - serv->sv_name); - else if (err != -EAGAIN) - net_warn_ratelimited("%s: accept failed (err %d)!\n", - serv->sv_name, -err); - trace_svcsock_accept_err(xprt, serv->sv_name, err); + if (err != -EAGAIN) + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } + if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL))) + return NULL; + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin); @@ -797,7 +979,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) return &newsvsk->sk_xprt; failed: - sock_release(newsock); + sockfd_put(newsock); return NULL; } @@ -875,7 +1057,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); - len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len < 0) return len; svsk->sk_tcplen += len; @@ -891,9 +1073,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, return svc_sock_reclen(svsk); err_too_large: - net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", - __func__, svsk->sk_xprt.xpt_server->sv_name, - svc_sock_reclen(svsk)); + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + 
svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); svc_xprt_deferred_close(&svsk->sk_xprt); err_short: return -EAGAIN; @@ -905,18 +1088,14 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) struct rpc_rqst *req = NULL; struct kvec *src, *dst; __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - __be32 xid; - __be32 calldir; - - xid = *p++; - calldir = *p; + __be32 xid = *p; if (!bc_xprt) return -EAGAIN; spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) - goto unlock_notfound; + goto unlock_eagain; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); /* @@ -933,12 +1112,6 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) rqstp->rq_arg.len = 0; spin_unlock(&bc_xprt->queue_lock); return 0; -unlock_notfound: - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, ntohl(xid)); unlock_eagain: spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; @@ -1028,6 +1201,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->nettcpcnt++; + svc_sock_secure_port(rqstp); svc_xprt_received(rqstp->rq_xprt); return rqstp->rq_arg.len; @@ -1057,87 +1231,39 @@ err_noclose: return 0; /* record not complete */ } -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, - int flags) -{ - return kernel_sendpage(sock, virt_to_page(vec->iov_base), - offset_in_page(vec->iov_base), - vec->iov_len, flags); -} - /* - * kernel_sendpage() is used exclusively to reduce the number of + * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. - * - * In addition, the logic assumes that * .bv_len is never larger - * than PAGE_SIZE. 
*/ -static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, - rpc_fraghdr marker, unsigned int *sentp) +static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, + rpc_fraghdr marker) { - const struct kvec *head = xdr->head; - const struct kvec *tail = xdr->tail; - struct kvec rm = { - .iov_base = &marker, - .iov_len = sizeof(marker), - }; struct msghdr msg = { - .msg_flags = 0, + .msg_flags = MSG_SPLICE_PAGES, }; + unsigned int count; + void *buf; int ret; - *sentp = 0; - ret = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (ret < 0) - return ret; - - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != rm.iov_len) - return -EAGAIN; - - ret = svc_tcp_send_kvec(sock, head, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != head->iov_len) - goto out; - - if (xdr->page_len) { - unsigned int offset, len, remaining; - struct bio_vec *bvec; - - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); - offset = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining > 0) { - len = min(remaining, bvec->bv_len - offset); - ret = kernel_sendpage(sock, bvec->bv_page, - bvec->bv_offset + offset, - len, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != len) - goto out; - remaining -= len; - offset = 0; - bvec++; - } - } - - if (tail->iov_len) { - ret = svc_tcp_send_kvec(sock, tail, 0); - if (ret < 0) - return ret; - *sentp += ret; - } - -out: - return 0; + /* The stream record marker is copied into a temporary page + * fragment buffer so that it can be included in sk_bvec. 
+ */ + buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &marker, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); + + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, + &rqstp->rq_res); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + 1 + count, sizeof(marker) + rqstp->rq_res.len); + ret = sock_sendmsg(svsk->sk_sock, &msg); + page_frag_free(buf); + return ret; } /** @@ -1156,37 +1282,30 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - unsigned int sent; - int err; + int sent; - svc_tcp_release_rqst(rqstp); + svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; - atomic_inc(&svsk->sk_sendqlen); mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - tcp_sock_set_cork(svsk->sk_sk, true); - err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); - xdr_free_bvec(xdr); - trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); - if (err < 0 || sent != (xdr->len + sizeof(marker))) + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; - if (atomic_dec_and_test(&svsk->sk_sendqlen)) - tcp_sock_set_cork(svsk->sk_sk, false); mutex_unlock(&xprt->xpt_mutex); return sent; out_notconn: - atomic_dec(&svsk->sk_sendqlen); mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: - pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, - (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len); + (sent < 0) ? 
"got error" : "sent", + sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); - atomic_dec(&svsk->sk_sendqlen); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; } @@ -1204,13 +1323,13 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_tcp_release_rqst, + .xpo_release_ctxt = svc_tcp_release_ctxt, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, - .xpo_secure_port = svc_sock_secure_port, .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, + .xpo_handshake = svc_tcp_handshake, }; static struct svc_xprt_class svc_tcp_class = { @@ -1244,6 +1363,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { @@ -1254,7 +1374,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; - memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + memset(&svsk->sk_pages[0], 0, + svsk->sk_maxpages * sizeof(struct page *)); tcp_sock_set_nodelay(sk); @@ -1282,7 +1403,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); spin_unlock_bh(&serv->sv_lock); } -EXPORT_SYMBOL_GPL(svc_sock_update_bufs); + +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} /* * Initialize 
socket for RPC use and create svc_sock struct @@ -1294,23 +1428,41 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int err = 0; + int sendpages; + unsigned long pages; + + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + pages = svc_serv_maxpages(serv); + svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + + svsk->sk_maxpages = pages; + inet = sock->sk; - /* Register socket with portmapper */ - if (pmap_register) + if (pmap_register) { + int err; + err = svc_register(serv, sock_net(sock->sk), inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); - - if (err < 0) { - kfree(svsk); - return ERR_PTR(err); + if (err < 0) { + kfree(svsk->sk_bvec); + kfree(svsk); + return ERR_PTR(err); + } } svsk->sk_sock = sock; @@ -1320,7 +1472,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_owspace = inet->sk_write_space; /* * This barrier is necessary in order to prevent race condition - * with svc_data_ready(), svc_listen_data_ready() and others + * with svc_data_ready(), svc_tcp_listen_data_ready(), and others * when calling callbacks above. 
*/ wmb(); @@ -1332,29 +1484,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - trace_svcsock_new_socket(sock); + trace_svcsock_new(svsk, sock); return svsk; } -bool svc_alien_sock(struct net *net, int fd) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - bool ret = false; - - if (!sock) - goto out; - if (sock_net(sock->sk) != net) - ret = true; - sockfd_put(sock); -out: - return ret; -} -EXPORT_SYMBOL_GPL(svc_alien_sock); - /** * svc_addsock - add a listener socket to an RPC service * @serv: pointer to RPC service to which to add a new listener + * @net: caller's network namespace * @fd: file descriptor of the new listener * @name_return: pointer to buffer to fill in with name of listener * @len: size of the buffer @@ -1364,8 +1501,8 @@ EXPORT_SYMBOL_GPL(svc_alien_sock); * Name is terminated with '\n'. On error, returns a negative errno * value. */ -int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, - const size_t len, const struct cred *cred) +int svc_addsock(struct svc_serv *serv, struct net *net, const int fd, + char *name_return, const size_t len, const struct cred *cred) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); @@ -1376,6 +1513,9 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, if (!so) return err; + err = -EINVAL; + if (sock_net(so->sk) != net) + goto out; err = -EAFNOSUPPORT; if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6)) goto out; @@ -1458,7 +1598,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; @@ -1468,7 +1608,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - if 
((error = kernel_listen(sock, 64)) < 0) + sk_net_refcnt_upgrade(sock->sk); + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } @@ -1509,6 +1650,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + tls_handshake_close(svsk->sk_sock); + svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { @@ -1523,10 +1666,17 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct socket *sock = svsk->sk_sock; - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); + trace_svcsock_free(svsk, sock); + + tls_handshake_cancel(sock->sk); + if (sock->file) + sockfd_put(sock); else - sock_release(svsk->sk_sock); + sock_release(sock); + + page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 3aad6ef18504..bdb587a72422 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -40,26 +40,7 @@ EXPORT_SYMBOL_GPL(nlm_debug); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static struct ctl_table_header *sunrpc_table_header; -static struct ctl_table sunrpc_table[]; - -void -rpc_register_sysctl(void) -{ - if (!sunrpc_table_header) - sunrpc_table_header = register_sysctl_table(sunrpc_table); -} - -void -rpc_unregister_sysctl(void) -{ - if (sunrpc_table_header) { - unregister_sysctl_table(sunrpc_table_header); - sunrpc_table_header = NULL; - } -} - -static int proc_do_xprt(struct ctl_table *table, int write, +static int proc_do_xprt(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; @@ -81,7 +62,7 @@ static int proc_do_xprt(struct ctl_table *table, int write, } static int -proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, +proc_dodebug(const struct ctl_table *table, int write, void *buffer, 
size_t *lenp, loff_t *ppos) { char tmpbuf[20], *s = NULL; @@ -142,6 +123,7 @@ done: return 0; } +static struct ctl_table_header *sunrpc_table_header; static struct ctl_table debug_table[] = { { @@ -178,16 +160,21 @@ static struct ctl_table debug_table[] = { .mode = 0444, .proc_handler = proc_do_xprt, }, - { } }; -static struct ctl_table sunrpc_table[] = { - { - .procname = "sunrpc", - .mode = 0555, - .child = debug_table - }, - { } -}; +void +rpc_register_sysctl(void) +{ + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl("sunrpc", debug_table); +} +void +rpc_unregister_sysctl(void) +{ + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +} #endif diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 1e05a2d723f4..8b01b7ae2690 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -36,7 +36,7 @@ rpc_sysfs_object_child_ns_type(const struct kobject *kobj) return &net_ns_type_operations; } -static struct kobj_type rpc_sysfs_object_type = { +static const struct kobj_type rpc_sysfs_object_type = { .release = rpc_sysfs_object_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = rpc_sysfs_object_child_ns_type, @@ -59,6 +59,16 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, return NULL; } +static inline struct rpc_clnt * +rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj) +{ + struct rpc_sysfs_client *c = container_of(kobj, + struct rpc_sysfs_client, kobject); + struct rpc_clnt *ret = c->clnt; + + return refcount_inc_not_zero(&ret->cl_count) ? 
ret : NULL; +} + static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { @@ -86,6 +96,51 @@ rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) return xprt_switch_get(x->xprt_switch); } +static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u", clnt->cl_vers); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%s", clnt->cl_program->name); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u\n", clnt->cl_max_connect); + refcount_dec(&clnt->cl_count); + return ret; +} + static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -129,6 +184,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, return ret; } +static const char *xprtsec_strings[] = { + [RPC_XPRTSEC_NONE] = "none", + [RPC_XPRTSEC_TLS_ANON] = "tls-anon", + [RPC_XPRTSEC_TLS_X509] = "tls-x509", +}; + +static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) { + ret = sprintf(buf, "<closed>\n"); + goto out; + } + + ret = sprintf(buf, "%s\n", xprtsec_strings[xprt->xprtsec.policy]); + xprt_put(xprt); +out: + return ret; + 
+} + static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -206,6 +286,14 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# delete this xprt\n"); +} + + static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -225,6 +313,55 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# add one xprt to this xprt_switch\n"); +} + +static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + struct xprt_create xprt_create_args; + struct rpc_xprt *xprt, *new; + + if (!xprt_switch) + return 0; + + xprt = rpc_xprt_switch_get_main_xprt(xprt_switch); + if (!xprt) + goto out; + + xprt_create_args.ident = xprt->xprt_class->ident; + xprt_create_args.net = xprt->xprt_net; + xprt_create_args.dstaddr = (struct sockaddr *)&xprt->addr; + xprt_create_args.addrlen = xprt->addrlen; + xprt_create_args.servername = xprt->servername; + xprt_create_args.bc_xprt = xprt->bc_xprt; + xprt_create_args.xprtsec = xprt->xprtsec; + xprt_create_args.connect_timeout = xprt->connect_timeout; + xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; + + new = xprt_create_transport(&xprt_create_args); + if (IS_ERR_OR_NULL(new)) { + count = PTR_ERR(new); + goto out_put_xprt; + } + + rpc_xprt_switch_add_xprt(xprt_switch, new); + xprt_put(new); + +out_put_xprt: + xprt_put(xprt); +out: + xprt_switch_put(xprt_switch); + return count; +} + static ssize_t rpc_sysfs_xprt_dstaddr_store(struct 
kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -239,6 +376,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, if (!xprt) return 0; if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || + xprt->xprt_class->ident == XPRT_TRANSPORT_TCP_TLS || xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { xprt_put(xprt); return -EOPNOTSUPP; @@ -251,7 +389,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); - /* buf_len is the len until the first occurence of either + /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); @@ -334,6 +472,40 @@ out_put: return count; } +static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt || !xps) { + count = 0; + goto out; + } + + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + + xprt_set_offline_locked(xprt, xps); + xprt_delete_locked(xprt, xps); + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); +out: + return count; +} + int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); @@ -397,23 +569,48 @@ static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) kobject)->xprt->xprt_net; } +static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version, + 0444, rpc_sysfs_clnt_version_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program, + 0444, rpc_sysfs_clnt_program_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect, + 0444, 
rpc_sysfs_clnt_max_connect_show, NULL); + +static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = { + &rpc_sysfs_clnt_version.attr, + &rpc_sysfs_clnt_program.attr, + &rpc_sysfs_clnt_max_connect.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt); + static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec, + 0644, rpc_sysfs_xprt_xprtsec_show, NULL); + static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); +static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt, + 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt); + static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, + &rpc_sysfs_xprt_xprtsec.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, + &rpc_sysfs_xprt_del.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); @@ -421,26 +618,32 @@ ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt = + __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show, + rpc_sysfs_xprt_switch_add_xprt_store); + static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, + &rpc_sysfs_xprt_switch_add_xprt.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); -static struct kobj_type rpc_sysfs_client_type = { +static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, + .default_groups = 
rpc_sysfs_rpc_clnt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; -static struct kobj_type rpc_sysfs_xprt_switch_type = { +static const struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; -static struct kobj_type rpc_sysfs_xprt_type = { +static const struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index 6620cebd1037..d2dd77a0a0e9 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -5,13 +5,6 @@ #ifndef __SUNRPC_SYSFS_H #define __SUNRPC_SYSFS_H -struct rpc_sysfs_client { - struct kobject kobject; - struct net *net; - struct rpc_clnt *clnt; - struct rpc_xprt_switch *xprt_switch; -}; - struct rpc_sysfs_xprt_switch { struct kobject kobject; struct net *net; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f7767bf22406..70efc727a9cd 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) } EXPORT_SYMBOL_GPL(xdr_encode_netobj); -__be32 * -xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) -{ - unsigned int len; - - if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) - return NULL; - obj->len = len; - obj->data = (u8 *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_netobj); - /** * xdr_encode_opaque_fixed - Encode fixed length opaque data * @p: pointer to current position in XDR buffer. 
@@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string) } EXPORT_SYMBOL_GPL(xdr_encode_string); -__be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, - unsigned int *lenp, unsigned int maxlen) -{ - u32 len; - - len = be32_to_cpu(*p++); - if (len > maxlen) - return NULL; - *lenp = len; - *sp = (char *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); - /** * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf * @buf: XDR buffer where string resides @@ -150,9 +122,8 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) if (!buf->bvec) return -ENOMEM; for (i = 0; i < n; i++) { - buf->bvec[i].bv_page = buf->pages[i]; - buf->bvec[i].bv_len = PAGE_SIZE; - buf->bvec[i].bv_offset = 0; + bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, + 0); } } return 0; @@ -166,6 +137,57 @@ xdr_free_bvec(struct xdr_buf *buf) } /** + * xdr_buf_to_bvec - Copy components of an xdr_buf into a bio_vec array + * @bvec: bio_vec array to populate + * @bvec_size: element count of @bvec + * @xdr: xdr_buf to be copied + * + * Returns the number of entries consumed in @bvec. 
+ */ +unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, + const struct xdr_buf *xdr) +{ + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + unsigned int count = 0; + + if (head->iov_len) { + bvec_set_virt(bvec++, head->iov_base, head->iov_len); + ++count; + } + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct page **pages = xdr->pages; + + offset = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining > 0) { + len = min_t(unsigned int, remaining, + PAGE_SIZE - offset); + bvec_set_page(bvec++, *pages++, len, offset); + remaining -= len; + offset = 0; + if (unlikely(++count > bvec_size)) + goto bvec_overflow; + } + } + + if (tail->iov_len) { + bvec_set_virt(bvec, tail->iov_base, tail->iov_len); + if (unlikely(++count > bvec_size)) + goto bvec_overflow; + } + + return count; + +bvec_overflow: + pr_warn_once("%s: bio_vec array overflow\n", __func__); + return count - 1; +} +EXPORT_SYMBOL_GPL(xdr_buf_to_bvec); + +/** * xdr_inline_pages - Prepare receive buffer for a large reply * @xdr: xdr_buf into which reply will be placed * @offset: expected offset where data payload will start, in bytes @@ -863,13 +885,6 @@ static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) return shift; } -void -xdr_shift_buf(struct xdr_buf *buf, size_t len) -{ - xdr_shrink_bufhead(buf, buf->head->iov_len - len); -} -EXPORT_SYMBOL_GPL(xdr_shift_buf); - /** * xdr_stream_pos - Return the current offset from the start of the xdr_stream * @xdr: pointer to struct xdr_stream @@ -950,21 +965,18 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer into which to encode data - * @pages: list of pages to decode into - * @rqst: pointer to controlling rpc_rqst, for debugging * */ -void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, - struct 
page **pages, struct rpc_rqst *rqst) +void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf) { xdr_reset_scratch_buffer(xdr); xdr->buf = buf; - xdr->page_ptr = pages; + xdr->page_ptr = buf->pages; xdr->iov = NULL; - xdr->p = page_address(*pages); + xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = rqst; + xdr->rqst = NULL; } EXPORT_SYMBOL_GPL(xdr_init_encode_pages); @@ -1055,6 +1067,12 @@ out_overflow: * Checks that we have enough buffer space to encode 'nbytes' more * bytes of data. If so, update the total xdr_buf length, and * adjust the length of the current kvec. + * + * The returned pointer is valid only until the next call to + * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current + * implementation of this API guarantees that space reserved for a + * four-byte data item remains valid until @xdr is destroyed, but + * that might not always be true in the future. */ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) { @@ -1078,22 +1096,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_reserve_space); - /** * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending * @xdr: pointer to xdr_stream - * @vec: pointer to a kvec array * @nbytes: number of bytes to reserve * - * Reserves enough buffer space to encode 'nbytes' of data and stores the - * pointers in 'vec'. The size argument passed to xdr_reserve_space() is - * determined based on the number of bytes remaining in the current page to - * avoid invalidating iov_base pointers when xdr_commit_encode() is called. + * The size argument passed to xdr_reserve_space() is determined based + * on the number of bytes remaining in the current page to avoid + * invalidating iov_base pointers when xdr_commit_encode() is called. 
+ * + * Return values: + * %0: success + * %-EMSGSIZE: not enough space is available in @xdr */ -int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes) +int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes) { - int thislen; - int v = 0; + size_t thislen; __be32 *p; /* @@ -1105,21 +1123,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte xdr->end = xdr->p; } + /* XXX: Let's find a way to make this more efficient */ while (nbytes) { thislen = xdr->buf->page_len % PAGE_SIZE; thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen); p = xdr_reserve_space(xdr, thislen); if (!p) - return -EIO; + return -EMSGSIZE; - vec[v].iov_base = p; - vec[v].iov_len = thislen; - v++; nbytes -= thislen; } - return v; + return 0; } EXPORT_SYMBOL_GPL(xdr_reserve_space_vec); @@ -1193,6 +1209,21 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) EXPORT_SYMBOL(xdr_truncate_encode); /** + * xdr_truncate_decode - Truncate a decoding stream + * @xdr: pointer to struct xdr_stream + * @len: Number of bytes to remove + * + */ +void xdr_truncate_decode(struct xdr_stream *xdr, size_t len) +{ + unsigned int nbytes = xdr_align_size(len); + + xdr->buf->len -= nbytes; + xdr->nwords -= XDR_QUADLEN(nbytes); +} +EXPORT_SYMBOL_GPL(xdr_truncate_decode); + +/** * xdr_restrict_buflen - decrease available buffer space * @xdr: pointer to xdr_stream * @newbuflen: new maximum number of bytes available @@ -1283,6 +1314,14 @@ static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, return xdr_set_iov(xdr, buf->tail, base, len); } +static void xdr_stream_unmap_current_page(struct xdr_stream *xdr) +{ + if (xdr->page_kaddr) { + kunmap_local(xdr->page_kaddr); + xdr->page_kaddr = NULL; + } +} + static unsigned int xdr_set_page_base(struct xdr_stream *xdr, unsigned int base, unsigned int len) { @@ -1300,12 +1339,18 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, if (len > maxlen) len = maxlen; + 
xdr_stream_unmap_current_page(xdr); xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; xdr->page_ptr = &xdr->buf->pages[pgnr]; - kaddr = page_address(*xdr->page_ptr); + + if (PageHighMem(*xdr->page_ptr)) { + xdr->page_kaddr = kmap_local_page(*xdr->page_ptr); + kaddr = xdr->page_kaddr; + } else + kaddr = page_address(*xdr->page_ptr); pgoff = base & ~PAGE_MASK; xdr->p = (__be32*)(kaddr + pgoff); @@ -1359,6 +1404,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; + xdr->page_kaddr = NULL; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && @@ -1391,6 +1437,16 @@ void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, } EXPORT_SYMBOL_GPL(xdr_init_decode_pages); +/** + * xdr_finish_decode - Clean up the xdr_stream after decoding data. + * @xdr: pointer to xdr_stream struct + */ +void xdr_finish_decode(struct xdr_stream *xdr) +{ + xdr_stream_unmap_current_page(xdr); +} +EXPORT_SYMBOL(xdr_finish_decode); + static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { unsigned int nwords = XDR_QUADLEN(nbytes); @@ -2161,116 +2217,91 @@ out: EXPORT_SYMBOL_GPL(xdr_process_buf); /** - * xdr_stream_decode_opaque - Decode variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store opaque data - * @size: size of storage buffer @ptr - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @ptr - */ -ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret <= 0) - return ret; - memcpy(ptr, p, ret); - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); - -/** - * xdr_stream_decode_opaque_dup - Decode and duplicate variable 
length opaque + * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream - * @ptr: location to store pointer to opaque data - * @maxlen: maximum acceptable object size + * @str: location to store pointer to string + * @maxlen: maximum acceptable string length * @gfp_flags: GFP mask to use * * Return values: - * On success, returns size of object stored in *@ptr + * On success, returns length of NUL-terminated string stored in *@str * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE if the size of the object would exceed @maxlen + * %-EMSGSIZE if the size of the string would exceed @maxlen * %-ENOMEM on memory allocation failure */ -ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, +ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags) { - ssize_t ret; void *p; + ssize_t ret; ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - *ptr = kmemdup(p, ret, gfp_flags); - if (*ptr != NULL) - return ret; + char *s = kmemdup_nul(p, ret, gfp_flags); + if (s != NULL) { + *str = s; + return strlen(s); + } ret = -ENOMEM; } - *ptr = NULL; + *str = NULL; return ret; } -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); +EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup); /** - * xdr_stream_decode_string - Decode variable length string + * xdr_stream_decode_opaque_auth - Decode struct opaque_auth (RFC5531 S8.2) * @xdr: pointer to xdr_stream - * @str: location to store string - * @size: size of storage buffer @str + * @flavor: location to store decoded flavor + * @body: location to store decoded body + * @body_len: location to store length of decoded body * * Return values: - * On success, returns length of NUL-terminated string stored in *@str + * On success, returns the number of buffer bytes consumed * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @str + * %-EMSGSIZE if the decoded size of the body field exceeds 
400 octets */ -ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) +ssize_t xdr_stream_decode_opaque_auth(struct xdr_stream *xdr, u32 *flavor, + void **body, unsigned int *body_len) { - ssize_t ret; - void *p; + ssize_t ret, len; - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret > 0) { - memcpy(str, p, ret); - str[ret] = '\0'; - return strlen(str); - } - *str = '\0'; - return ret; + len = xdr_stream_decode_u32(xdr, flavor); + if (unlikely(len < 0)) + return len; + ret = xdr_stream_decode_opaque_inline(xdr, body, RPC_MAX_AUTH_SIZE); + if (unlikely(ret < 0)) + return ret; + *body_len = ret; + return len + ret; } -EXPORT_SYMBOL_GPL(xdr_stream_decode_string); +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_auth); /** - * xdr_stream_decode_string_dup - Decode and duplicate variable length string + * xdr_stream_encode_opaque_auth - Encode struct opaque_auth (RFC5531 S8.2) * @xdr: pointer to xdr_stream - * @str: location to store pointer to string - * @maxlen: maximum acceptable string length - * @gfp_flags: GFP mask to use + * @flavor: verifier flavor to encode + * @body: content of body to encode + * @body_len: length of body to encode * * Return values: - * On success, returns length of NUL-terminated string stored in *@ptr + * On success, returns length in bytes of XDR buffer consumed * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE if the size of the string would exceed @maxlen - * %-ENOMEM on memory allocation failure + * %-EMSGSIZE if the size of @body exceeds 400 octets */ -ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, - size_t maxlen, gfp_t gfp_flags) -{ - void *p; - ssize_t ret; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); - if (ret > 0) { - char *s = kmemdup_nul(p, ret, gfp_flags); - if (s != NULL) { - *str = s; - return strlen(s); - } - ret = -ENOMEM; - } - *str = NULL; - return ret; +ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, + void *body, 
unsigned int body_len) +{ + ssize_t ret, len; + + if (unlikely(body_len > RPC_MAX_AUTH_SIZE)) + return -EMSGSIZE; + len = xdr_stream_encode_u32(xdr, flavor); + if (unlikely(len < 0)) + return len; + ret = xdr_stream_encode_opaque(xdr, body, body_len); + if (unlikely(ret < 0)) + return ret; + return len + ret; } -EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup); +EXPORT_SYMBOL_GPL(xdr_stream_encode_opaque_auth); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ab453ede54f0..1023361845f9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -283,7 +283,7 @@ out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; - if (RPC_IS_SOFT(task)) + if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else @@ -349,7 +349,7 @@ out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; - if (RPC_IS_SOFT(task)) + if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else @@ -651,9 +651,9 @@ static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime) jiffies + nsecs_to_jiffies(-delta); } -static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req) +static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req, + const struct rpc_timeout *to) { - const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; unsigned long majortimeo = req->rq_timeout; if (to->to_exponential) @@ -665,9 +665,10 @@ static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req) return majortimeo; } -static void xprt_reset_majortimeo(struct rpc_rqst *req) +static void xprt_reset_majortimeo(struct rpc_rqst *req, + const struct rpc_timeout *to) { - req->rq_majortimeo += xprt_calc_majortimeo(req); + req->rq_majortimeo += xprt_calc_majortimeo(req, to); } static void xprt_reset_minortimeo(struct rpc_rqst *req) @@ -675,7 +676,8 @@ static void xprt_reset_minortimeo(struct rpc_rqst *req) req->rq_minortimeo += 
req->rq_timeout; } -static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) +static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req, + const struct rpc_timeout *to) { unsigned long time_init; struct rpc_xprt *xprt = req->rq_xprt; @@ -684,8 +686,9 @@ static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) time_init = jiffies; else time_init = xprt_abs_ktime_to_jiffies(task->tk_start); - req->rq_timeout = task->tk_client->cl_timeout->to_initval; - req->rq_majortimeo = time_init + xprt_calc_majortimeo(req); + + req->rq_timeout = to->to_initval; + req->rq_majortimeo = time_init + xprt_calc_majortimeo(req, to); req->rq_minortimeo = time_init + req->rq_timeout; } @@ -713,7 +716,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) } else { req->rq_timeout = to->to_initval; req->rq_retries = 0; - xprt_reset_majortimeo(req); + xprt_reset_majortimeo(req, to); /* Reset the RTT counters == "slow start" */ spin_lock(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); @@ -851,7 +854,7 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt) static void xprt_init_autodisconnect(struct timer_list *t) { - struct rpc_xprt *xprt = from_timer(xprt, t, timer); + struct rpc_xprt *xprt = timer_container_of(xprt, t, timer); if (!RB_EMPTY_ROOT(&xprt->recv_queue)) return; @@ -1164,7 +1167,7 @@ xprt_request_enqueue_receive(struct rpc_task *task) spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); return 0; } @@ -1362,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (!req->rq_seqno) { + } else if (req->rq_seqno_count == 0) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; @@ -1395,6 +1398,12 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task) if 
(!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) return; if (!list_empty(&req->rq_xmit)) { + struct rpc_xprt *xprt = req->rq_xprt; + + if (list_is_first(&req->rq_xmit, &xprt->xmit_queue) && + xprt->ops->abort_send_request) + xprt->ops->abort_send_request(req); + list_del(&req->rq_xmit); if (!list_empty(&req->rq_xmit2)) { struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, @@ -1538,6 +1547,9 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) int is_retrans = RPC_WAS_SENT(task); int status; + if (test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + return -ENOTCONN; + if (!req->rq_bytes_sent) { if (xprt_request_data_received(task)) { status = 0; @@ -1886,7 +1898,8 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; - xprt_init_majortimeo(task, req); + req->rq_seqno_count = 0; + xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); } @@ -1983,7 +1996,8 @@ void xprt_release(struct rpc_task *task) #ifdef CONFIG_SUNRPC_BACKCHANNEL void -xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, + const struct rpc_timeout *to) { struct xdr_buf *xbufp = &req->rq_snd_buf; @@ -1996,6 +2010,13 @@ xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) */ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + xbufp->tail[0].iov_len; + /* + * Backchannel Replies are sent with !RPC_TASK_SOFT and + * RPC_TASK_NO_RETRANS_TIMEOUT. The major timeout setting + * affects only how long each Reply waits to be sent when + * a transport connection cannot be established. + */ + xprt_init_majortimeo(task, req, to); } #endif @@ -2118,7 +2139,7 @@ static void xprt_destroy(struct rpc_xprt *xprt) * can only run *before* del_time_sync(), never after. 
*/ spin_lock(&xprt->transport_lock); - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); spin_unlock(&xprt->transport_lock); /* diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 701250b305db..4c5e08b0aa64 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -92,6 +92,27 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, xprt_put(xprt); } +/** + * rpc_xprt_switch_get_main_xprt - Get the 'main' xprt for an xprt switch. + * @xps: pointer to struct rpc_xprt_switch. + */ +struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps) +{ + struct rpc_xprt_iter xpi; + struct rpc_xprt *xprt; + + xprt_iter_init_listall(&xpi, xps); + + xprt = xprt_iter_get_next(&xpi); + while (xprt && !xprt->main) { + xprt_put(xprt); + xprt = xprt_iter_get_next(&xpi); + } + + xprt_iter_destroy(&xpi); + return xprt; +} + static DEFINE_IDA(rpc_xprtswitch_ids); void xprt_multipath_cleanup_ids(void) @@ -284,7 +305,7 @@ struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head, if (cur == pos) found = true; if (found && ((find_active && xprt_is_active(pos)) || - (!find_active && xprt_is_active(pos)))) + (!find_active && !xprt_is_active(pos)))) return pos; } return NULL; @@ -336,8 +357,9 @@ struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi) xprt_switch_find_current_entry_offline); } -bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, - const struct sockaddr *sap) +static +bool __rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) { struct list_head *head; struct rpc_xprt *pos; @@ -356,6 +378,18 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, return false; } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + bool res; + + rcu_read_lock(); + res = __rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + + return res; +} + static struct rpc_xprt 
*xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur, bool check_active) @@ -590,23 +624,6 @@ struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, } /** - * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor - * @xpi: pointer to rpc_xprt_iter - * - * Returns a reference to the struct rpc_xprt that is currently - * pointed to by the cursor. - */ -struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) -{ - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); - rcu_read_unlock(); - return xprt; -} - -/** * xprt_iter_get_next - Returns the next rpc_xprt following the cursor * @xpi: pointer to rpc_xprt_iter * diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 55b21bae866d..3232aa23cdb4 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o ib_client.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ svc_rdma_pcl.o module.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index e4d84a13c566..8c817e755262 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -263,11 +263,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; xprt_get(xprt); - spin_lock(&bc_serv->sv_cb_lock); - list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); - spin_unlock(&bc_serv->sv_cb_lock); + lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list); - wake_up(&bc_serv->sv_cb_waitq); + svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); r_xprt->rx_stats.bcall_count++; return; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c 
b/net/sunrpc/xprtrdma/frwr_ops.c index ffbf99894970..31434aeb8e29 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -54,7 +54,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, cid->ci_completion_id = mr->mr_ibmr->res.id; } -static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +static void frwr_mr_unmap(struct rpcrdma_mr *mr) { if (mr->mr_device) { trace_xprtrdma_mr_unmap(mr); @@ -73,7 +73,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); rc = ib_dereg_mr(mr->mr_ibmr); if (rc) @@ -84,7 +84,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) static void frwr_mr_put(struct rpcrdma_mr *mr) { - frwr_mr_unmap(mr->mr_xprt, mr); + frwr_mr_unmap(mr); /* The MR is returned to the req's MR free list instead * of to the xprt's MR free list. No spinlock is needed. @@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr) rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } -/* frwr_reset - Place MRs back on the free list +/** + * frwr_reset - Place MRs back on @req's free list * @req: request to reset * * Used after a failed marshal. For FRWR, this means the MRs diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c new file mode 100644 index 000000000000..28c68b5f6823 --- /dev/null +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2024 Oracle. All rights reserved. 
+ */ + +/* #include <linux/module.h> +#include <linux/slab.h> */ +#include <linux/xarray.h> +#include <linux/types.h> +#include <linux/kref.h> +#include <linux/completion.h> + +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rdma_rn.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/* Per-ib_device private data for rpcrdma */ +struct rpcrdma_device { + struct kref rd_kref; + unsigned long rd_flags; + struct ib_device *rd_device; + struct xarray rd_xa; + struct completion rd_done; +}; + +#define RPCRDMA_RD_F_REMOVING (0) + +static struct ib_client rpcrdma_ib_client; + +/* + * Listeners have no associated device, so we never register them. + * Note that ib_get_client_data() does not check if @device is + * NULL for us. + */ +static struct rpcrdma_device *rpcrdma_get_client_data(struct ib_device *device) +{ + if (!device) + return NULL; + return ib_get_client_data(device, &rpcrdma_ib_client); +} + +/** + * rpcrdma_rn_register - register to get device removal notifications + * @device: device to monitor + * @rn: notification object that wishes to be notified + * @done: callback to notify caller of device removal + * + * Returns zero on success. The callback in rn_done is guaranteed + * to be invoked when the device is removed, unless this notification + * is unregistered first. + * + * On failure, a negative errno is returned. 
+ */ +int rpcrdma_rn_register(struct ib_device *device, + struct rpcrdma_notification *rn, + void (*done)(struct rpcrdma_notification *rn)) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags)) + return -ENETUNREACH; + + if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0) + return -ENOMEM; + kref_get(&rd->rd_kref); + rn->rn_done = done; + trace_rpcrdma_client_register(device, rn); + return 0; +} + +static void rpcrdma_rn_release(struct kref *kref) +{ + struct rpcrdma_device *rd = container_of(kref, struct rpcrdma_device, + rd_kref); + + trace_rpcrdma_client_completion(rd->rd_device); + complete(&rd->rd_done); +} + +/** + * rpcrdma_rn_unregister - stop device removal notifications + * @device: monitored device + * @rn: notification object that no longer wishes to be notified + */ +void rpcrdma_rn_unregister(struct ib_device *device, + struct rpcrdma_notification *rn) +{ + struct rpcrdma_device *rd = rpcrdma_get_client_data(device); + + if (!rd) + return; + + trace_rpcrdma_client_unregister(device, rn); + xa_erase(&rd->rd_xa, rn->rn_index); + kref_put(&rd->rd_kref, rpcrdma_rn_release); +} + +/** + * rpcrdma_add_one - ib_client device insertion callback + * @device: device about to be inserted + * + * Returns zero on success. xprtrdma private data has been allocated + * for this device. On failure, a negative errno is returned. 
+ */ +static int rpcrdma_add_one(struct ib_device *device) +{ + struct rpcrdma_device *rd; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + kref_init(&rd->rd_kref); + xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC); + rd->rd_device = device; + init_completion(&rd->rd_done); + ib_set_client_data(device, &rpcrdma_ib_client, rd); + + trace_rpcrdma_client_add_one(device); + return 0; +} + +/** + * rpcrdma_remove_one - ib_client device removal callback + * @device: device about to be removed + * @client_data: this module's private per-device data + * + * Upon return, all transports associated with @device have divested + * themselves from IB hardware resources. + */ +static void rpcrdma_remove_one(struct ib_device *device, + void *client_data) +{ + struct rpcrdma_device *rd = client_data; + struct rpcrdma_notification *rn; + unsigned long index; + + trace_rpcrdma_client_remove_one(device); + + set_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags); + xa_for_each(&rd->rd_xa, index, rn) + rn->rn_done(rn); + + /* + * Wait only if there are still outstanding notification + * registrants for this device. + */ + if (!refcount_dec_and_test(&rd->rd_kref.refcount)) { + trace_rpcrdma_client_wait_on(device); + wait_for_completion(&rd->rd_done); + } + + trace_rpcrdma_client_remove_one_done(device); + xa_destroy(&rd->rd_xa); + kfree(rd); +} + +static struct ib_client rpcrdma_ib_client = { + .name = "rpcrdma", + .add = rpcrdma_add_one, + .remove = rpcrdma_remove_one, +}; + +/** + * rpcrdma_ib_client_unregister - unregister ib_client for xprtrdma + * + * cel: watch for orphaned rpcrdma_device objects on module unload + */ +void rpcrdma_ib_client_unregister(void) +{ + ib_unregister_client(&rpcrdma_ib_client); +} + +/** + * rpcrdma_ib_client_register - register ib_client for rpcrdma + * + * Returns zero on success, or a negative errno. 
+ */ +int rpcrdma_ib_client_register(void) +{ + return ib_register_client(&rpcrdma_ib_client); +} diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 45c5b41ac8dc..697f571d4c01 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/rdma_rn.h> #include <asm/swab.h> @@ -30,21 +31,32 @@ static void __exit rpc_rdma_cleanup(void) { xprt_rdma_cleanup(); svc_rdma_cleanup(); + rpcrdma_ib_client_unregister(); } static int __init rpc_rdma_init(void) { int rc; + rc = rpcrdma_ib_client_register(); + if (rc) + goto out_rc; + rc = svc_rdma_init(); if (rc) - goto out; + goto out_ib_client; rc = xprt_rdma_init(); if (rc) - svc_rdma_cleanup(); + goto out_svc_rdma; -out: + return 0; + +out_svc_rdma: + svc_rdma_cleanup(); +out_ib_client: + rpcrdma_ib_client_unregister(); +out_rc: return rc; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 190a4de239c8..3aac1456e23e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); while (len > 0) { if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppages = alloc_page(GFP_NOWAIT); if (!*ppages) return -ENOBUFS; ppages++; @@ -1471,8 +1471,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), - false); + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 5bc20e9d09cd..415c0310101f 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -74,7 +74,7 @@ enum { SVCRDMA_COUNTER_BUFSIZ = sizeof(unsigned long long), }; -static int svcrdma_counter_handler(struct ctl_table *table, int write, +static int svcrdma_counter_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct percpu_counter *stat = (struct percpu_counter *)table->data; @@ -209,25 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = { .extra1 = &zero, .extra2 = &zero, }, - { }, -}; - -static struct ctl_table svcrdma_table[] = { - { - .procname = "svc_rdma", - .mode = 0555, - .child = svcrdma_parm_table - }, - { }, -}; - -static struct ctl_table svcrdma_root_table[] = { - { - .procname = "sunrpc", - .mode = 0555, - .child = svcrdma_table - }, - { }, }; static void svc_rdma_proc_cleanup(void) @@ -252,49 +233,75 @@ static int svc_rdma_proc_init(void) rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err; rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_read; rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_recv; rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_sq; + + svcrdma_table_header = register_sysctl("sunrpc/svc_rdma", + svcrdma_parm_table); + if (!svcrdma_table_header) + goto err_write; - svcrdma_table_header = register_sysctl_table(svcrdma_root_table); return 0; -out_err: +err_write: + rc = -ENOMEM; + percpu_counter_destroy(&svcrdma_stat_write); +err_sq: percpu_counter_destroy(&svcrdma_stat_sq_starve); +err_recv: percpu_counter_destroy(&svcrdma_stat_recv); +err_read: percpu_counter_destroy(&svcrdma_stat_read); +err: return rc; } +struct workqueue_struct *svcrdma_wq; + void svc_rdma_cleanup(void) { - dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); svc_unreg_xprt_class(&svc_rdma_class); svc_rdma_proc_cleanup(); + 
if (svcrdma_wq) { + struct workqueue_struct *wq = svcrdma_wq; + + svcrdma_wq = NULL; + destroy_workqueue(wq); + } + + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); } int svc_rdma_init(void) { + struct workqueue_struct *wq; int rc; - dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); - dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); - dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; rc = svc_rdma_proc_init(); - if (rc) + if (rc) { + destroy_workqueue(wq); return rc; + } - /* Register RDMA with the SVC transport switch */ + svcrdma_wq = wq; svc_reg_xprt_class(&svc_rdma_class); + + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index aa2227a7e552..e5a78b761012 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -76,15 +76,12 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, struct svc_rdma_send_ctxt *sctxt) { - struct svc_rdma_recv_ctxt *rctxt; + struct svc_rdma_pcl empty_pcl; int ret; - rctxt = svc_rdma_recv_ctxt_get(rdma); - if (!rctxt) - return -EIO; - - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); - svc_rdma_recv_ctxt_put(rdma, rctxt); + pcl_init(&empty_pcl); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &empty_pcl, &empty_pcl, + &rqst->rq_snd_buf); if (ret < 0) return -EIO; @@ -93,13 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ 
get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_post_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 5242ad121450..e7e4a39ca6c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -94,7 +94,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -115,24 +115,22 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } -static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_rq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_recv_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages), + GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + ctxt->rc_maxpages = pages; + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, @@ -155,7 +153,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; - ctxt->rc_temp = false; + svc_rdma_cc_init(rdma, &ctxt->rc_cc); 
return ctxt; fail2: @@ -204,18 +202,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) node = llist_del_first(&rdma->sc_recv_ctxts); if (!node) - goto out_empty; - ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); + return NULL; -out: + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); ctxt->rc_page_count = 0; return ctxt; - -out_empty: - ctxt = svc_rdma_recv_ctxt_alloc(rdma); - if (!ctxt) - return NULL; - goto out; } /** @@ -227,39 +218,42 @@ out_empty: void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { + svc_rdma_cc_release(rdma, &ctxt->rc_cc, DMA_FROM_DEVICE); + + /* @rc_page_count is normally zero here, but error flows + * can leave pages in @rc_pages. + */ + release_pages(ctxt->rc_pages, ctxt->rc_page_count); + pcl_free(&ctxt->rc_call_pcl); pcl_free(&ctxt->rc_read_pcl); pcl_free(&ctxt->rc_write_pcl); pcl_free(&ctxt->rc_reply_pcl); - if (!ctxt->rc_temp) - llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); - else - svc_rdma_recv_ctxt_destroy(rdma, ctxt); + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); } /** - * svc_rdma_release_rqst - Release transport-specific per-rqst resources - * @rqstp: svc_rqst being released + * svc_rdma_release_ctxt - Release transport-specific per-rqst resources + * @xprt: the transport which owned the context + * @vctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * * Ensure that the recv_ctxt is released whether or not a Reply * was sent. For example, the client could close the connection, * or svc_process could drop an RPC, before the Reply is sent. 
*/ -void svc_rdma_release_rqst(struct svc_rqst *rqstp) +void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt) { - struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; - struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_rdma_recv_ctxt *ctxt = vctxt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - rqstp->rq_xprt_ctxt = NULL; if (ctxt) svc_rdma_recv_ctxt_put(rdma, ctxt); } static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, - unsigned int wanted, bool temp) + unsigned int wanted) { const struct ib_recv_wr *bad_wr = NULL; struct svc_rdma_recv_ctxt *ctxt; @@ -275,14 +269,13 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, if (!ctxt) break; - trace_svcrdma_post_recv(ctxt); - ctxt->rc_temp = temp; + trace_svcrdma_post_recv(&ctxt->rc_cid); ctxt->rc_recv_wr.next = recv_chain; recv_chain = &ctxt->rc_recv_wr; rdma->sc_pending_recvs++; } if (!recv_chain) - return false; + return true; ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) @@ -306,11 +299,28 @@ err_free: * svc_rdma_post_recvs - Post initial set of Recv WRs * @rdma: fresh svcxprt_rdma * - * Returns true if successful, otherwise false. + * Return values: + * %true: Receive Queue initialization successful + * %false: memory allocation or DMA error */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); + unsigned int total; + + /* For each credit, allocate enough recv_ctxts for one + * posted Receive and one RPC in process. + */ + total = (rdma->sc_max_requests * 2) + rdma->sc_recv_batch; + while (total--) { + struct svc_rdma_recv_ctxt *ctxt; + + ctxt = svc_rdma_recv_ctxt_alloc(rdma); + if (!ctxt) + return false; + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + } + + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests); } /** @@ -344,7 +354,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) * client reconnects. 
*/ if (rdma->sc_pending_recvs < rdma->sc_max_requests) - if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch)) goto dropped; /* All wc fields are now known to be valid */ @@ -378,6 +388,10 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); svc_rdma_recv_ctxt_put(rdma, ctxt); @@ -483,7 +497,13 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) return false; - /* A bogus segcount causes this buffer overflow check to fail. */ + /* Before trusting the segcount value enough to use it in + * a computation, perform a simple range check. This is an + * arbitrary but sensible limit (ie, not architectural). + */ + if (unlikely(segcount > rctxt->rc_maxpages)) + return false; + p = xdr_inline_decode(&rctxt->rc_stream, segcount * rpcrdma_segment_maxsz * sizeof(*p)); return p != NULL; @@ -759,6 +779,122 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, return true; } +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with a single Read chunk (only the upper layer data payload + * was conveyed via RDMA Read). + */ +static void svc_rdma_read_complete_one(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct svc_rdma_chunk *chunk = pcl_first_chunk(&ctxt->rc_read_pcl); + struct xdr_buf *buf = &rqstp->rq_arg; + unsigned int length; + + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. 
+ */ + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; + + /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). + * + * If the client already rounded up the chunk length, the + * length does not change. Otherwise, the length of the page + * list is increased to include XDR round-up. + * + * Currently these chunks always start at page offset 0, + * thus the rounded-up length never crosses a page boundary. + */ + buf->pages = &rqstp->rq_pages[0]; + length = xdr_align_size(chunk->ch_length); + buf->page_len = length; + buf->len += length; + buf->buflen += length; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with payload in multiple Read chunks and no PZRC. + */ +static void svc_rdma_read_complete_multiple(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_NOMSG type message + * (the RPC message body was conveyed via RDMA Read). 
+ */ +static void svc_rdma_read_complete_pzrc(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + +static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + unsigned int i; + + /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing + * the rq_pages that were already allocated for this rqstp. + */ + release_pages(rqstp->rq_respages, ctxt->rc_page_count); + for (i = 0; i < ctxt->rc_page_count; i++) + rqstp->rq_pages[i] = ctxt->rc_pages[i]; + + /* Update @rqstp's result send buffer to start after the + * last page in the RDMA Read payload. + */ + rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Prevent svc_rdma_recv_ctxt_put() from releasing the + * pages in ctxt::rc_pages a second time. + */ + ctxt->rc_page_count = 0; + + /* Finish constructing the RPC Call message. The exact + * procedure for that depends on what kind of RPC/RDMA + * chunks were provided by the client. + */ + rqstp->rq_arg = ctxt->rc_saved_arg; + if (pcl_is_empty(&ctxt->rc_call_pcl)) { + if (ctxt->rc_read_pcl.cl_count == 1) + svc_rdma_read_complete_one(rqstp, ctxt); + else + svc_rdma_read_complete_multiple(rqstp, ctxt); + } else { + svc_rdma_read_complete_pzrc(rqstp, ctxt); + } + + trace_svcrdma_read_finished(&ctxt->rc_cid); +} + /** * svc_rdma_recvfrom - Receive an RPC call * @rqstp: request structure into which to receive an RPC Call @@ -776,9 +912,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * * The next ctxt is removed from the "receive" lists. 
* - * - If the ctxt completes a Read, then finish assembling the Call - * message and return the number of bytes in the message. - * * - If the ctxt completes a Receive, then construct the Call * message from the contents of the Receive buffer. * @@ -787,7 +920,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * in the message. * * - If there are Read chunks in this message, post Read WRs to - * pull that payload and return 0. + * pull that payload. When the Read WRs complete, build the + * full message and return the number of bytes in it. */ int svc_rdma_recvfrom(struct svc_rqst *rqstp) { @@ -797,10 +931,23 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; + /* Prevent svc_xprt_release() from releasing pages in rq_pages + * when returning 0 or an error. + */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_xprt_ctxt = NULL; - ctxt = NULL; spin_lock(&rdma_xprt->sc_rq_dto_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + svc_xprt_received(xprt); + svc_rdma_read_complete(rqstp, ctxt); + goto complete; + } ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); if (ctxt) list_del(&ctxt->rc_list); @@ -820,12 +967,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); - /* Prevent svc_xprt_release from releasing pages in rq_pages - * if we return 0 or an error. 
- */ - rqstp->rq_respages = rqstp->rq_pages; - rqstp->rq_next_page = rqstp->rq_respages; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; @@ -838,15 +979,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_get_inv_rkey(rdma_xprt, ctxt); if (!pcl_is_empty(&ctxt->rc_read_pcl) || - !pcl_is_empty(&ctxt->rc_call_pcl)) { - ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); - if (ret < 0) - goto out_readfail; - } + !pcl_is_empty(&ctxt->rc_call_pcl)) + goto out_readlist; +complete: rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); + set_bit(RQ_SECURE, &rqstp->rq_flags); return rqstp->rq_arg.len; out_err: @@ -854,11 +994,23 @@ out_err: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_readfail: - if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, ctxt, ret); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - return ret; +out_readlist: + /* This @rqstp is about to be recycled. Save the work + * already done constructing the Call message in rq_arg + * so it can be restored when the RDMA Reads have + * completed. 
+ */ + ctxt->rc_saved_arg = rqstp->rq_arg; + + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) { + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, ctxt, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + svc_xprt_deferred_close(xprt); + return ret; + } + return 0; out_backchannel: svc_rdma_handle_bc_reply(rqstp, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 11cf7c646644..661b3fe2779f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -39,6 +39,7 @@ struct svc_rdma_rw_ctxt { struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; + unsigned int rw_first_sgl_nents; struct sg_table rw_sg_table; struct scatterlist rw_first_sgl[]; }; @@ -53,6 +54,8 @@ svc_rdma_next_ctxt(struct list_head *list) static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { + struct ib_device *dev = rdma->sc_cm_id->device; + unsigned int first_sgl_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -62,40 +65,40 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), - GFP_KERNEL); + ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); + ctxt->rw_first_sgl_nents = first_sgl_nents; } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) + first_sgl_nents)) goto out_free; return ctxt; out_free: kfree(ctxt); out_noctx: - trace_svcrdma_no_rwctx_err(rdma, sges); + trace_svcrdma_rwctx_empty(rdma, sges); return NULL; } -static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt, +static void 
__svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); llist_add(&ctxt->rw_node, list); } static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt) { - __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); + __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts); } /** @@ -136,61 +139,46 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, ctxt->rw_sg_table.sgl, ctxt->rw_nents, 0, offset, handle, direction); if (unlikely(ret < 0)) { + trace_svcrdma_dma_map_rw_err(rdma, offset, handle, + ctxt->rw_nents, ret); svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); } return ret; } -/* A chunk context tracks all I/O for moving one Read or Write - * chunk. This is a set of rdma_rw's that handle data movement - * for all segments of one chunk. - * - * These are small, acquired with a single allocator call, and - * no more than one is needed per chunk. They are allocated on - * demand, and not cached. 
+/** + * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be initialized */ -struct svc_rdma_chunk_ctxt { - struct rpc_rdma_cid cc_cid; - struct ib_cqe cc_cqe; - struct svcxprt_rdma *cc_rdma; - struct list_head cc_rwctxts; - ktime_t cc_posttime; - int cc_sqecount; - enum ib_wc_status cc_status; - struct completion cc_done; -}; - -static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) +void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} + struct rpc_rdma_cid *cid = &cc->cc_cid; -static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc) -{ - svc_rdma_cc_cid_init(rdma, &cc->cc_cid); - cc->cc_rdma = rdma; + if (unlikely(!cid->ci_completion_id)) + svc_rdma_send_cid_init(rdma, cid); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; } -/* - * The consumed rw_ctx's are cleaned and placed on a local llist so - * that only one atomic llist operation is needed to put them all - * back on the free list. 
+/** + * svc_rdma_cc_release - Release resources held by a svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be released + * @dir: DMA direction */ -static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, - enum dma_data_direction dir) +void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); + trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); @@ -198,7 +186,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + __svc_rdma_put_rw_ctxt(ctxt, &free); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -209,49 +197,82 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, llist_add_batch(first, last, &rdma->sc_rw_ctxts); } -/* State for sending a Write or Reply chunk. 
- * - Tracks progress of writing one chunk over all its segments - * - Stores arguments for the SGL constructor functions - */ -struct svc_rdma_write_info { - const struct svc_rdma_chunk *wi_chunk; - - /* write state of this chunk */ - unsigned int wi_seg_off; - unsigned int wi_seg_no; - - /* SGL constructor arguments */ - const struct xdr_buf *wi_xdr; - unsigned char *wi_base; - unsigned int wi_next_off; - - struct svc_rdma_chunk_ctxt wi_cc; -}; - static struct svc_rdma_write_info * svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk) { struct svc_rdma_write_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_node(sizeof(*info), GFP_KERNEL, + ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; + info->wi_rdma = rdma; info->wi_chunk = chunk; - info->wi_seg_off = 0; - info->wi_seg_no = 0; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; } -static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +static void svc_rdma_write_info_free_async(struct work_struct *work) { - svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); + struct svc_rdma_write_info *info; + + info = container_of(work, struct svc_rdma_write_info, wi_work); + svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE); kfree(info); } +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + INIT_WORK(&info->wi_work, svc_rdma_write_info_free_async); + queue_work(svcrdma_wq, &info->wi_work); +} + +/** + * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc; + + if (!cc->cc_sqecount) + return; + svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); +} + +/** + * svc_rdma_reply_done - Reply chunk Write 
completion handler + * @cq: controlling Completion Queue + * @wc: Work Completion report + * + * Pages under I/O are released by a subsequent Send completion. + */ +static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cq->cq_context; + + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_reply(&cc->cc_cid); + return; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); + } + + svc_xprt_deferred_close(&rdma->sc_xprt); +} + /** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue @@ -261,16 +282,16 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) */ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: - trace_svcrdma_wc_write(wc, &cc->cc_cid); + trace_svcrdma_wc_write(&cc->cc_cid); break; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); @@ -287,38 +308,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_write_info_free(info); } -/* State for pulling a Read chunk. 
- */ -struct svc_rdma_read_info { - struct svc_rqst *ri_rqst; - struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_pageno; - unsigned int ri_pageoff; - unsigned int ri_totalbytes; - - struct svc_rdma_chunk_ctxt ri_cc; -}; - -static struct svc_rdma_read_info * -svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_read_info *info; - - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return info; - - svc_rdma_cc_init(rdma, &info->ri_cc); - info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; - return info; -} - -static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) -{ - svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); - kfree(info); -} - /** * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx * @cq: controlling Completion Queue @@ -327,17 +316,27 @@ static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) */ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_read_info *info; + struct svc_rdma_recv_ctxt *ctxt; + + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc); switch (wc->status) { case IB_WC_SUCCESS: - info = container_of(cc, struct svc_rdma_read_info, ri_cc); - trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes, cc->cc_posttime); - break; + + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q); + /* the unlock pairs with the smp_rmb in svc_xprt_ready */ + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + spin_unlock(&rdma->sc_rq_dto_lock); + svc_xprt_enqueue(&rdma->sc_xprt); + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); break; @@ -345,28 +344,32 @@ static void svc_rdma_wc_read_done(struct 
ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); - cc->cc_status = wc->status; - complete(&cc->cc_done); - return; + /* The RDMA Read has flushed, so the incoming RPC message + * cannot be constructed and must be dropped. Signal the + * loss to the client by closing the connection. + */ + svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); + svc_rdma_recv_ctxt_put(rdma, ctxt); + svc_xprt_deferred_close(&rdma->sc_xprt); } -/* This function sleeps when the transport's Send Queue is congested. - * +/* * Assumptions: * - If ib_post_send() succeeds, only one completion is expected, * even if one or more WRs are flushed. This is true when posting * an rdma_rw_ctx or when posting a single signaled WR. */ -static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct ib_send_wr *first_wr; const struct ib_send_wr *bad_wr; struct list_head *tmp; struct ib_cqe *cqe; int ret; + might_sleep(); + if (cc->cc_sqecount > rdma->sc_sq_depth) return -EINVAL; @@ -392,14 +395,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) } percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); + trace_svcrdma_sq_full(rdma, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma); + trace_svcrdma_sq_retry(rdma, &cc->cc_cid); } while (1); - trace_svcrdma_sq_post_err(rdma, ret); + trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); /* If even one was posted, there will be a completion. 
*/ @@ -469,7 +472,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, unsigned int remaining) { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svcxprt_rdma *rdma = info->wi_rdma; const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; int ret; @@ -512,7 +515,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + trace_svcrdma_small_wrch_err(&cc->cc_cid, remaining, info->wi_seg_no, info->wi_chunk->ch_segcount); return -E2BIG; } @@ -598,41 +601,33 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/** - * svc_rdma_send_write_chunk - Write all segments in a Write chunk - * @rdma: controlling RDMA transport - * @chunk: Write chunk provided by the client - * @xdr: xdr_buf containing the data payload - * - * Returns a non-negative number of bytes the chunk consumed, or - * %-E2BIG if the payload was larger than the Write chunk, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 
- */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct xdr_buf payload; int ret; + if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, + chunk->ch_payload_length)) + return -EMSGSIZE; + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; cc = &info->wi_cc; - ret = svc_rdma_xb_write(xdr, info); - if (ret != xdr->len) + ret = svc_rdma_xb_write(&payload, info); + if (ret != payload.len) goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; - return xdr->len; + return 0; out_err: svc_rdma_write_info_free(info); @@ -640,9 +635,37 @@ out_err: } /** - * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * svc_rdma_send_write_list - Send all chunks on the Write list + * @rdma: controlling RDMA transport + * @rctxt: Write list provisioned by the client + * @xdr: xdr_buf containing an RPC Reply message + * + * Returns zero on success, or a negative errno if one or more + * Write chunks could not be sent. 
+ */ +int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_chunk *chunk; + int ret; + + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + if (!chunk->ch_payload_length) + break; + ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + if (ret < 0) + return ret; + } + return 0; +} + +/** + * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks from client + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -652,44 +675,51 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - struct svc_rdma_chunk *chunk; + struct svc_rdma_write_info *info = &sctxt->sc_reply_info; + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; int ret; - if (pcl_is_empty(&rctxt->rc_reply_pcl)) - return 0; - - chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); - info = svc_rdma_write_info_alloc(rdma, chunk); - if (!info) - return -ENOMEM; - cc = &info->wi_cc; + info->wi_rdma = rdma; + info->wi_chunk = pcl_first_chunk(reply_pcl); + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_cc.cc_cqe.done = svc_rdma_reply_done; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + 
ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) - goto out_err; + return ret; - trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); - if (ret < 0) - goto out_err; + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; - return xdr->len; + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; -out_err: - svc_rdma_write_info_free(info); - return ret; + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + return xdr->len; } /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @segment: co-ordinates of remote memory to be read * * Returns: @@ -698,20 +728,20 @@ out_err: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_segment *segment) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; - struct svc_rqst *rqstp = info->ri_rqst; + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); if (!ctxt) return -ENOMEM; 
ctxt->rw_nents = sge_no; @@ -719,29 +749,27 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, sg = ctxt->rw_sg_table.sgl; for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { seg_len = min_t(unsigned int, len, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], - seg_len, info->ri_pageoff); + sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); sg = sg_next(sg); - info->ri_pageoff += seg_len; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; + head->rc_pageoff += seg_len; + if (head->rc_pageoff == PAGE_SIZE) { + head->rc_curpage++; + head->rc_pageoff = 0; } len -= seg_len; - /* Safety check */ - if (len && - &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + if (len && ((head->rc_curpage + 1) > rqstp->rq_maxpages)) goto out_overrun; } - ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; @@ -752,13 +780,14 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, return 0; out_overrun: - trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); + trace_svcrdma_page_overrun_err(&cc->cc_cid, head->rc_curpage); return -EINVAL; } /** * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: Read chunk to pull * * Return values: @@ -767,7 +796,8 @@ out_overrun: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk 
*chunk) { const struct svc_rdma_segment *segment; @@ -775,56 +805,56 @@ static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, ret = -EINVAL; pcl_for_each_segment(segment, chunk) { - ret = svc_rdma_build_read_segment(info, segment); + ret = svc_rdma_build_read_segment(rqstp, head, segment); if (ret < 0) break; - info->ri_totalbytes += segment->rs_length; + head->rc_readbytes += segment->rs_length; } return ret; } /** * svc_rdma_copy_inline_range - Copy part of the inline content into pages - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @offset: offset into the Receive buffer of region to copy * @remaining: length of region to copy * * Take a page at a time from rqstp->rq_pages and copy the inline * content from the Receive buffer into that page. Update - * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * head->rc_curpage and head->rc_pageoff so that the next RDMA Read * result will land contiguously with the copied content. 
* * Return values: * %0: Inline content was successfully copied * %-EINVAL: offset or length was incorrect */ -static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, +static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, unsigned int offset, unsigned int remaining) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; unsigned char *dst, *src = head->rc_recv_buf; - struct svc_rqst *rqstp = info->ri_rqst; unsigned int page_no, numpages; - numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT; for (page_no = 0; page_no < numpages; page_no++) { unsigned int page_len; page_len = min_t(unsigned int, remaining, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; - dst = page_address(rqstp->rq_pages[info->ri_pageno]); - memcpy(dst + info->ri_pageno, src + offset, page_len); + dst = page_address(rqstp->rq_pages[head->rc_curpage]); + memcpy(dst + head->rc_curpage, src + offset, page_len); - info->ri_totalbytes += page_len; - info->ri_pageoff += page_len; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; + head->rc_readbytes += page_len; + head->rc_pageoff += page_len; + if (head->rc_pageoff == PAGE_SIZE) { + head->rc_curpage++; + head->rc_pageoff = 0; } remaining -= page_len; offset += page_len; @@ -835,7 +865,8 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, * like an incoming TCP call. 
@@ -847,11 +878,11 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +static noinline int +svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; struct svc_rdma_chunk *chunk, *next; unsigned int start, length; int ret; @@ -859,12 +890,12 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_copy_inline_range(info, start, length); + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -873,31 +904,21 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf break; start += length; - length = next->ch_position - info->ri_totalbytes; - ret = svc_rdma_copy_inline_range(info, start, length); + length = next->ch_position - head->rc_readbytes; + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; } start += length; length = head->rc_byte_len - start; - ret = svc_rdma_copy_inline_range(info, start, length); - if (ret < 0) - return ret; - - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; - - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; - return 0; + return 
svc_rdma_copy_inline_range(rqstp, head, start, length); } /** * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in the page list of rqstp->rq_arg.pages. * @@ -912,50 +933,17 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) +static int svc_rdma_read_data_item(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; - struct svc_rdma_chunk *chunk; - unsigned int length; - int ret; - - chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(info, chunk); - if (ret < 0) - goto out; - - /* Split the Receive buffer between the head and tail - * buffers at Read chunk's position. XDR roundup of the - * chunk is not included in either the pagelist or in - * the tail. - */ - buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; - buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; - buf->head[0].iov_len = chunk->ch_position; - - /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). - * - * If the client already rounded up the chunk length, the - * length does not change. Otherwise, the length of the page - * list is increased to include XDR round-up. - * - * Currently these chunks always start at page offset 0, - * thus the rounded-up length never crosses a page boundary. 
- */ - buf->pages = &info->ri_rqst->rq_pages[0]; - length = xdr_align_size(chunk->ch_length); - buf->page_len = length; - buf->len += length; - buf->buflen += length; - -out: - return ret; + return svc_rdma_build_read_chunk(rqstp, head, + pcl_first_chunk(&head->rc_read_pcl)); } /** - * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk - * @info: context for RDMA Reads + * svc_rdma_read_chunk_range - Build RDMA Read WRs for portion of a chunk + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: parsed Call chunk to pull * @offset: offset of region to pull * @length: length of region to pull @@ -967,7 +955,8 @@ out: * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, +static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk *chunk, unsigned int offset, unsigned int length) { @@ -987,11 +976,11 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; dummy.rs_offset = segment->rs_offset + offset; - ret = svc_rdma_build_read_segment(info, &dummy); + ret = svc_rdma_build_read_segment(rqstp, head, &dummy); if (ret < 0) break; - info->ri_totalbytes += dummy.rs_length; + head->rc_readbytes += dummy.rs_length; length -= dummy.rs_length; offset = 0; } @@ -1000,7 +989,8 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * Return values: * %0: RDMA Read WQEs were successfully built @@ -1009,9 +999,9 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw 
initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) +static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_chunk *call_chunk = pcl_first_chunk(&head->rc_call_pcl); const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; @@ -1020,17 +1010,18 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) int ret; if (pcl_is_empty(pcl)) - return svc_rdma_build_read_chunk(info, call_chunk); + return svc_rdma_build_read_chunk(rqstp, head, call_chunk); start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, + start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -1039,8 +1030,8 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) break; start += length; - length = next->ch_position - info->ri_totalbytes; - ret = svc_rdma_read_chunk_range(info, call_chunk, + length = next->ch_position - head->rc_readbytes; + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, start, length); if (ret < 0) return ret; @@ -1048,12 +1039,14 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) start += length; length = call_chunk->ch_length - start; - return svc_rdma_read_chunk_range(info, call_chunk, start, length); + return svc_rdma_read_chunk_range(rqstp, head, call_chunk, + start, length); } /** * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The start of the data lands in the first page just after the * Transport header, and the rest lands in 
rqstp->rq_arg.pages. @@ -1069,25 +1062,31 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) +static noinline int svc_rdma_read_special(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct xdr_buf *buf = &info->ri_rqst->rq_arg; - int ret; - - ret = svc_rdma_read_call_chunk(info); - if (ret < 0) - goto out; - - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; + return svc_rdma_read_call_chunk(rqstp, head); +} - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; +/* Pages under I/O have been copied to head->rc_pages. Ensure that + * svc_xprt_release() does not put them when svc_rdma_recvfrom() + * returns. This has to be done after all Read WRs are constructed + * to properly handle a page that happens to be part of I/O on behalf + * of two different RDMA segments. + * + * Note: if the subsequent post_send fails, these pages have already + * been moved to head->rc_pages and thus will be cleaned up by + * svc_rdma_recv_ctxt_put(). 
+ */ +static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) +{ + unsigned int i; -out: - return ret; + for (i = 0; i < head->rc_page_count; i++) { + head->rc_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } } /** @@ -1117,49 +1116,27 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_read_info *info; - struct svc_rdma_chunk_ctxt *cc; + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; int ret; - info = svc_rdma_read_info_alloc(rdma); - if (!info) - return -ENOMEM; - cc = &info->ri_cc; - info->ri_rqst = rqstp; - info->ri_readctxt = head; - info->ri_pageno = 0; - info->ri_pageoff = 0; - info->ri_totalbytes = 0; + cc->cc_cqe.done = svc_rdma_wc_read_done; + cc->cc_sqecount = 0; + head->rc_pageoff = 0; + head->rc_curpage = 0; + head->rc_readbytes = 0; if (pcl_is_empty(&head->rc_call_pcl)) { if (head->rc_read_pcl.cl_count == 1) - ret = svc_rdma_read_data_item(info); + ret = svc_rdma_read_data_item(rqstp, head); else - ret = svc_rdma_read_multiple_chunks(info); + ret = svc_rdma_read_multiple_chunks(rqstp, head); } else - ret = svc_rdma_read_special(info); + ret = svc_rdma_read_special(rqstp, head); + svc_rdma_clear_rqst_pages(rqstp, head); if (ret < 0) - goto out_err; + return ret; trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); - init_completion(&cc->cc_done); - ret = svc_rdma_post_chunk_ctxt(cc); - if (ret < 0) - goto out_err; - - ret = 1; - wait_for_completion(&cc->cc_done); - if (cc->cc_status != IB_WC_SUCCESS) - ret = -EIO; - - /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ - head->rc_page_count = 0; - -out_err: - svc_rdma_read_info_free(info); - return ret; + ret = svc_rdma_post_chunk_ctxt(rdma, cc); + return ret < 0 ? 
ret : 1; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 22a871e6fe4d..914cd263c2f1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -100,7 +100,7 @@ */ #include <linux/spinlock.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -113,42 +113,41 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_send_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; - size_t size; int i; - size = sizeof(*ctxt); - size += rdma->sc_max_send_sges * sizeof(struct ib_sge); - ctxt = kmalloc(size, GFP_KERNEL); + ctxt = kzalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), + GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); - if (!buffer) + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt->sc_pages = kcalloc_node(pages, sizeof(struct page *), + GFP_KERNEL, node); + if (!ctxt->sc_pages) goto fail1; + ctxt->sc_maxpages = pages; + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); + if (!buffer) + goto fail2; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, rdma->sc_max_req_size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) - goto fail2; + goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_rdma = rdma; ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; - init_completion(&ctxt->sc_done); 
ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, @@ -159,8 +158,10 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; -fail2: +fail3: kfree(buffer); +fail2: + kfree(ctxt->sc_pages); fail1: kfree(ctxt); fail0: @@ -184,6 +185,7 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); + kfree(ctxt->sc_pages); kfree(ctxt); } } @@ -202,55 +204,82 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) spin_lock(&rdma->sc_send_lock); node = llist_del_first(&rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); if (!node) goto out_empty; + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - spin_unlock(&rdma->sc_send_lock); out: rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, NULL); + svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc); ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; + ctxt->sc_page_count = 0; + ctxt->sc_wr_chain = &ctxt->sc_send_wr; + ctxt->sc_sqecount = 1; + return ctxt; out_empty: - spin_unlock(&rdma->sc_send_lock); ctxt = svc_rdma_send_ctxt_alloc(rdma); if (!ctxt) return NULL; goto out; } -/** - * svc_rdma_send_ctxt_put - Return send_ctxt to free list - * @rdma: controlling svcxprt_rdma - * @ctxt: object to return to the free list - */ -void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt) +static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_reply_chunk_release(rdma, ctxt); + + if (ctxt->sc_page_count) + release_pages(ctxt->sc_pages, ctxt->sc_page_count); + /* The first SGE contains the transport header, which * remains mapped until @ctxt is destroyed. 
*/ for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) { + trace_svcrdma_dma_unmap_page(&ctxt->sc_cid, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length); ib_dma_unmap_page(device, ctxt->sc_sges[i].addr, ctxt->sc_sges[i].length, DMA_TO_DEVICE); - trace_svcrdma_dma_unmap_page(rdma, - ctxt->sc_sges[i].addr, - ctxt->sc_sges[i].length); } llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); } +static void svc_rdma_send_ctxt_put_async(struct work_struct *work) +{ + struct svc_rdma_send_ctxt *ctxt; + + ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work); + svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt); +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async); + queue_work(svcrdma_wq, &ctxt->sc_work); +} + /** * svc_rdma_wake_send_waiters - manage Send Queue accounting * @rdma: controlling transport @@ -280,13 +309,13 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_wake_send_waiters(rdma, 1); - complete(&ctxt->sc_done); + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; - trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + trace_svcrdma_wc_send(&ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); return; flushed: @@ -294,55 +323,81 @@ flushed: trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); else trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); svc_xprt_deferred_close(&rdma->sc_xprt); } /** - * svc_rdma_send - Post a single Send WR - * @rdma: transport on which to post the WR - * @ctxt: send ctxt with a Send WR ready to post + * svc_rdma_post_send - Post a WR 
chain to the Send Queue + * @rdma: transport context + * @ctxt: WR chain to post + * + * Copy fields in @ctxt to stack variables in order to guarantee + * that these values remain available after the ib_post_send() call. + * In some error flow cases, svc_rdma_wc_send() releases @ctxt. + * + * Note there is potential for starvation when the Send Queue is + * full because there is no order to when waiting threads are + * awoken. The transport is typically provisioned with a deep + * enough Send Queue that SQ exhaustion should be a rare event. * - * Returns zero if the Send WR was posted successfully. Otherwise, a - * negative errno is returned. + * Return values: + * %0: @ctxt's WR chain was posted successfully + * %-ENOTCONN: The connection was lost */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) +int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - struct ib_send_wr *wr = &ctxt->sc_send_wr; - int ret; + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; + const struct ib_send_wr *bad_wr = first_wr; + struct rpc_rdma_cid cid = ctxt->sc_cid; + int ret, sqecount = ctxt->sc_sqecount; - reinit_completion(&ctxt->sc_done); + might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, - wr->sg_list[0].addr, - wr->sg_list[0].length, + send_wr->sg_list[0].addr, + send_wr->sg_list[0].length, DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + svc_rdma_wake_send_waiters(rdma, sqecount); + + /* When the transport is torn down, assume + * ib_drain_sq() will trigger enough Send + * completions to wake us. The XPT_CLOSE test + * above should then cause the while loop to + * exit. 
+ */ percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); - atomic_inc(&rdma->sc_sq_avail); + trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 1); - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return -ENOTCONN; - trace_svcrdma_sq_retry(rdma); + atomic_read(&rdma->sc_sq_avail) > 0); + trace_svcrdma_sq_retry(rdma, &cid); continue; } trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, wr, NULL); - if (ret) - break; + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) { + trace_svcrdma_sq_post_err(rdma, &cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, there will be a + * Send completion that bumps sc_sq_avail. + */ + if (bad_wr == first_wr) { + svc_rdma_wake_send_waiters(rdma, sqecount); + break; + } + } return 0; } - - trace_svcrdma_sq_post_err(rdma, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - return ret; + return -ENOTCONN; } /** @@ -529,14 +584,14 @@ static int svc_rdma_page_dma_map(void *data, struct page *page, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - trace_svcrdma_dma_map_page(rdma, dma_addr, len); + trace_svcrdma_dma_map_page(&ctxt->sc_cid, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: - trace_svcrdma_dma_map_err(rdma, dma_addr, len); + trace_svcrdma_dma_map_err(&ctxt->sc_cid, dma_addr, len); return -EIO; } @@ -648,7 +703,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: xdr_buf containing RPC message to transmit * * Returns: @@ -657,7 +712,7 @@ static int svc_rdma_xb_count_sges(const 
struct xdr_buf *xdr, */ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, const struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { /* Resources needed for the transport header */ @@ -667,7 +722,7 @@ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_count_sges, &args); if (ret < 0) return false; @@ -723,7 +778,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: prepared xdr_buf containing RPC message * * The device is not capable of sending the reply directly. 
@@ -738,7 +793,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, */ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { struct svc_rdma_pullup_data args = { @@ -746,7 +801,7 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_linearize, &args); if (ret < 0) return ret; @@ -759,7 +814,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, /* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client * @xdr: prepared xdr_buf containing RPC message * * Returns: @@ -771,7 +827,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr) { struct svc_rdma_map_data args = { @@ -784,36 +841,49 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; /* If there is a Reply chunk, nothing follows the transport - * header, and we're done here. + * header, so there is nothing to map. */ - if (!pcl_is_empty(&rctxt->rc_reply_pcl)) + if (!pcl_is_empty(reply_pcl)) return 0; /* For pull-up, svc_rdma_send() will sync the transport header. * No additional DMA mapping is necessary. 
*/ - if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) - return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + if (svc_rdma_pull_up_needed(rdma, sctxt, write_pcl, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, write_pcl, xdr); - return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + return pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_dma_map, &args); } +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_send_ctxt *ctxt) +{ + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + + ctxt->sc_page_count += pages; + for (i = 0; i < pages; i++) { + ctxt->sc_pages[i] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; + } + + /* Prevent svc_xprt_release from releasing pages in rq_pages */ + rqstp->rq_next_page = rqstp->rq_respages; +} + /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, - * the server may send all, a portion of, or none of the xdr_buf. + * the server may Send all, a portion of, or none of the xdr_buf. * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * - * RDMA Send is the last step of transmitting an RPC reply. Pages - * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the sctxt's page array. These pages are - * DMA unmapped by each Write completion, but the subsequent Send - * completion finally releases these pages. - * * Assumptions: * - The Reply's transport header will never be larger than a page. 
*/ @@ -822,26 +892,27 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp) { + struct ib_send_wr *send_wr = &sctxt->sc_send_wr; int ret; - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, &rqstp->rq_res); if (ret < 0) return ret; + /* Transfer pages involved in RDMA Writes to the sctxt's + * page array. Completion handling releases these pages. + */ + svc_rdma_save_io_pages(rqstp, sctxt); + if (rctxt->rc_inv_rkey) { - sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey; } else { - sctxt->sc_send_wr.opcode = IB_WR_SEND; + send_wr->opcode = IB_WR_SEND; } - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_post_send(rdma, sctxt); } /** @@ -905,10 +976,9 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, sctxt)) + if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; - - wait_for_completion_killable(&sctxt->sc_done); + return; put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); @@ -953,10 +1023,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); if (ret < 0) - goto reply_chunk; - rc_size = ret; + goto put_ctxt; + + rc_size = 0; + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { + ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, sctxt, + &rqstp->rq_res); + if (ret < 0) + goto 
reply_chunk; + rc_size = ret; + } *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -976,17 +1055,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) goto put_ctxt; - - /* Prevent svc_xprt_release() from releasing the page backing - * rq_res.head[0].iov_base. It's no longer being accessed by - * the I/O device. */ - rqstp->rq_respages++; return 0; reply_chunk: if (ret != -E2BIG && ret != -EINVAL) goto put_ctxt; + /* Send completion releases payload pages that were part + * of previously posted RDMA Writes. + */ + svc_rdma_save_io_pages(rqstp, sctxt); svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; @@ -1000,45 +1078,33 @@ drop_connection: /** * svc_rdma_result_payload - special processing for a result payload - * @rqstp: svc_rqst to operate on - * @offset: payload's byte offset in @xdr + * @rqstp: RPC transaction context + * @offset: payload's byte offset in @rqstp->rq_res * @length: size of payload, in bytes * + * Assign the passed-in result payload to the current Write chunk, + * and advance to cur_result_payload to the next Write chunk, if + * there is one. 
+ * * Return values: * %0 if successful or nothing needed to be done - * %-EMSGSIZE on XDR buffer overflow * %-E2BIG if the payload was larger than the Write chunk - * %-EINVAL if client provided too many segments - * %-ENOMEM if rdma_rw context pool was exhausted - * %-ENOTCONN if posting failed (connection is lost) - * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; struct svc_rdma_chunk *chunk; - struct svcxprt_rdma *rdma; - struct xdr_buf subbuf; - int ret; chunk = rctxt->rc_cur_result_payload; if (!length || !chunk) return 0; rctxt->rc_cur_result_payload = pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) return -E2BIG; - chunk->ch_position = offset; chunk->ch_payload_length = length; - - if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) - return -EMSGSIZE; - - rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); - if (ret < 0) - return ret; return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 94b20fb47135..b7b318ad25c4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -64,7 +64,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net); + struct net *net, int node); +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -73,7 +75,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt 
*xprt); -static void svc_rdma_secure_port(struct svc_rqst *); static void svc_rdma_kill_temp_xprt(struct svc_xprt *); static const struct svc_xprt_ops svc_rdma_ops = { @@ -81,12 +82,11 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, .xpo_result_payload = svc_rdma_result_payload, - .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_release_ctxt = svc_rdma_release_ctxt, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, - .xpo_secure_port = svc_rdma_secure_port, .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, }; @@ -124,18 +124,57 @@ static void qp_event_handler(struct ib_event *event, void *context) } } +static struct rdma_cm_id * +svc_rdma_create_listen_id(struct net *net, struct sockaddr *sap, + void *context) +{ + struct rdma_cm_id *listen_id; + int ret; + + listen_id = rdma_create_id(net, svc_rdma_listen_handler, context, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(listen_id)) + return listen_id; + + /* Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. 
+ */ +#if IS_ENABLED(CONFIG_IPV6) + ret = rdma_set_afonly(listen_id, 1); + if (ret) + goto out_destroy; +#endif + ret = rdma_bind_addr(listen_id, sap); + if (ret) + goto out_destroy; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) + goto out_destroy; + + return listen_id; + +out_destroy: + rdma_destroy_id(listen_id); + return ERR_PTR(ret); +} + static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net) + struct net *net, int node) { - struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + static struct lock_class_key svcrdma_rwctx_lock; + static struct lock_class_key svcrdma_sctx_lock; + static struct lock_class_key svcrdma_dto_lock; + struct svcxprt_rdma *cma_xprt; - if (!cma_xprt) { - dprintk("svcrdma: failed to create new transport\n"); + cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node); + if (!cma_xprt) return NULL; - } + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); @@ -143,8 +182,11 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + lockdep_set_class(&cma_xprt->sc_rq_dto_lock, &svcrdma_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); + lockdep_set_class(&cma_xprt->sc_send_lock, &svcrdma_sctx_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + lockdep_set_class(&cma_xprt->sc_rw_ctxt_lock, &svcrdma_rwctx_lock); /* * Note that this implies that the underlying transport support @@ -195,9 +237,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, struct svcxprt_rdma *newxprt; struct sockaddr *sa; - /* Create a new transport */ newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, - 
listen_xprt->sc_xprt.xpt_net); + listen_xprt->sc_xprt.xpt_net, + ibdev_to_node(new_cma_id->device)); if (!newxprt) return; newxprt->sc_cm_id = new_cma_id; @@ -242,17 +284,31 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, * * Return values: * %0: Do not destroy @cma_id - * %1: Destroy @cma_id (never returned here) + * %1: Destroy @cma_id * * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. */ static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; + struct svcxprt_rdma *cma_xprt = cma_id->context; + struct svc_xprt *cma_rdma = &cma_xprt->sc_xprt; + struct rdma_cm_id *listen_id; + switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: handle_connect_req(cma_id, &event->param.conn); break; + case RDMA_CM_EVENT_ADDR_CHANGE: + listen_id = svc_rdma_create_listen_id(cma_rdma->xpt_net, + sap, cma_xprt); + if (IS_ERR(listen_id)) { + pr_err("Listener dead, address change failed for device %s\n", + cma_id->device->name); + } else + cma_xprt->sc_cm_id = listen_id; + return 1; default: break; } @@ -283,7 +339,6 @@ static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - case RDMA_CM_EVENT_DEVICE_REMOVAL: svc_xprt_deferred_close(xprt); break; default: @@ -302,40 +357,22 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, { struct rdma_cm_id *listen_id; struct svcxprt_rdma *cma_xprt; - int ret; if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); - cma_xprt = svc_rdma_create_xprt(serv, net); + cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE); if (!cma_xprt) return ERR_PTR(-ENOMEM); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, - RDMA_PS_TCP, IB_QPT_RC); + listen_id = 
svc_rdma_create_listen_id(net, sa, cma_xprt); if (IS_ERR(listen_id)) { - ret = PTR_ERR(listen_id); - goto err0; + kfree(cma_xprt); + return ERR_CAST(listen_id); } - - /* Allow both IPv4 and IPv6 sockets to bind a single port - * at the same time. - */ -#if IS_ENABLED(CONFIG_IPV6) - ret = rdma_set_afonly(listen_id, 1); - if (ret) - goto err1; -#endif - ret = rdma_bind_addr(listen_id, sa); - if (ret) - goto err1; cma_xprt->sc_cm_id = listen_id; - ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); - if (ret) - goto err1; - /* * We need to use the address from the cm_id in case the * caller specified 0 for the port number. @@ -344,12 +381,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); return &cma_xprt->sc_xprt; +} - err1: - rdma_destroy_id(listen_id); - err0: - kfree(cma_xprt); - return ERR_PTR(ret); +static void svc_rdma_xprt_done(struct rpcrdma_notification *rn) +{ + struct svcxprt_rdma *rdma = container_of(rn, struct svcxprt_rdma, + sc_rn); + struct rdma_cm_id *id = rdma->sc_cm_id; + + trace_svcrdma_device_removal(id); + svc_xprt_close(&rdma->sc_xprt); } /* @@ -365,12 +406,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, */ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) { + unsigned int ctxts, rq_depth, maxpayload; struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; - unsigned int ctxts, rq_depth; struct ib_device *dev; int ret = 0; RPC_IFDEBUG(struct sockaddr *sap); @@ -393,37 +434,45 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; - /* Qualify the transport resource defaults with the - * capabilities of this particular device */ + if (rpcrdma_rn_register(dev, &newxprt->sc_rn, svc_rdma_xprt_done)) + goto errout; + + 
newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + + /* Qualify the transport's resource defaults with the + * capabilities of this particular device. + */ + /* Transport header, head iovec, tail iovec */ newxprt->sc_max_send_sges = 3; /* Add one SGE per page list entry */ newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1; if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = svcrdma_max_requests; - newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + - newxprt->sc_recv_batch; + newxprt->sc_recv_batch + 1 /* drain */; if (rq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing receive depth to %d\n", - dev->attrs.max_qp_wr); rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); - ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); - ctxts *= newxprt->sc_max_requests; + + /* Arbitrary estimate of the needed number of rdma_rw contexts. 
+ */ + maxpayload = min(xprt->xpt_server->sv_max_payload, + RPCSVC_MAXPAYLOAD_RDMA); + ctxts = newxprt->sc_max_requests * 3 * + rdma_rw_mr_factor(dev, newxprt->sc_port_num, + maxpayload >> PAGE_SHIFT); + newxprt->sc_sq_depth = rq_depth + ctxts; - if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing send depth to %d\n", - dev->attrs.max_qp_wr); + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; - } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); newxprt->sc_pd = ib_alloc_pd(dev, 0); @@ -453,18 +502,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; qp_attr.recv_cq = newxprt->sc_rq_cq; - dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", - newxprt->sc_cm_id, newxprt->sc_pd); dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); - + dprintk(" send CQ depth = %u, recv CQ depth = %u\n", + newxprt->sc_sq_depth, rq_depth); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { trace_svcrdma_qp_err(newxprt, ret); goto errout; } + newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge; newxprt->sc_qp = newxprt->sc_cm_id->qp; if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) @@ -508,7 +557,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection %p accepted:\n", newxprt); + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; @@ -528,6 +577,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if 
(newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); rdma_destroy_id(newxprt->sc_cm_id); + rpcrdma_rn_unregister(dev, &newxprt->sc_rn); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; @@ -541,14 +591,22 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_disconnect(rdma->sc_cm_id); } -static void __svc_rdma_free(struct work_struct *work) +/** + * svc_rdma_free - Release class-specific transport resources + * @xprt: Generic svc transport object + */ +static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = - container_of(work, struct svcxprt_rdma, sc_work); + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct ib_device *device = rdma->sc_cm_id->device; + + might_sleep(); /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); + flush_workqueue(svcrdma_wq); svc_rdma_flush_recv_queues(rdma); @@ -572,18 +630,11 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); + if (!test_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags)) + rpcrdma_rn_unregister(device, &rdma->sc_rn); kfree(rdma); } -static void svc_rdma_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); -} - static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = @@ -600,11 +651,6 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } -static void svc_rdma_secure_port(struct svc_rqst *rqstp) -{ - set_bit(RQ_SECURE, &rqstp->rq_flags); -} - static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) { } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 10bb2b929c6d..9a8ce5df83ca 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -137,16 +137,6 
@@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, -}; - -static struct ctl_table sunrpc_table[] = { - { - .procname = "sunrpc", - .mode = 0555, - .child = xr_tunables_table - }, - { }, }; #endif @@ -799,7 +789,7 @@ int xprt_rdma_init(void) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) - sunrpc_table_header = register_sysctl_table(sunrpc_table); + sunrpc_table_header = register_sysctl("sunrpc", xr_tunables_table); #endif return 0; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b098fde373ab..63262ef0c2e3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -49,14 +49,14 @@ * o buffer memory */ +#include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> #include <linux/log2.h> -#include <asm-generic/barrier.h> -#include <asm/bitops.h> +#include <asm/barrier.h> #include <rdma/ib_cm.h> @@ -69,13 +69,15 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node); +static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -222,7 +224,6 @@ static void rpcrdma_update_cm_private(struct 
rpcrdma_ep *ep, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_ep *ep = id->context; might_sleep(); @@ -241,10 +242,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_async_rc = -ENETUNREACH; complete(&ep->re_done); return 0; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - pr_info("rpcrdma: removing device %s for %pISpc\n", - ep->re_id->device->name, sap); - fallthrough; case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; goto disconnected; @@ -280,6 +277,14 @@ disconnected: return 0; } +static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) +{ + struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn); + + trace_xprtrdma_device_removal(ep->re_id); + xprt_force_disconnect(ep->re_xprt); +} + static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep) { @@ -319,6 +324,10 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (rc) goto out; + rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done); + if (rc) + goto out; + return id; out: @@ -346,6 +355,8 @@ static void rpcrdma_ep_destroy(struct kref *kref) ib_dealloc_pd(ep->re_pd); ep->re_pd = NULL; + rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn); + kfree(ep); module_put(THIS_MODULE); } @@ -501,7 +512,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) * outstanding Receives. 
*/ rpcrdma_ep_get(ep); - rpcrdma_post_recvs(r_xprt, 1, true); + rpcrdma_post_recvs(r_xprt, 1); rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) @@ -893,6 +904,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) static void rpcrdma_req_reset(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + /* Credits are valid for only one connection */ req->rl_slot.rq_cong = 0; @@ -902,7 +915,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); - frwr_reset(req); + /* The verbs consumer can't know the state of an MR on the + * req->rl_registered list unless a successful completion + * has occurred, so they cannot be re-used. + */ + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_mr_release(mr); + } } /* ASSUMPTION: the rb_allreqs list is stable for the duration, @@ -920,24 +945,23 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) } static noinline -struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, - bool temp) +struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; struct rpcrdma_rep *rep; rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, - DMA_FROM_DEVICE); + rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv, + DMA_FROM_DEVICE, + ibdev_to_node(device)); if (!rep->rr_rdmabuf) goto out_free; - if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) - goto out_free_regbuf; - rep->rr_cid.ci_completion_id = atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); @@ -949,15 +973,12 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct 
rpcrdma_xprt *r_xprt, rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - rep->rr_temp = temp; spin_lock(&buf->rb_lock); list_add(&rep->rr_all, &buf->rb_all_reps); spin_unlock(&buf->rb_lock); return rep; -out_free_regbuf: - rpcrdma_regbuf_free(rep->rr_rdmabuf); out_free: kfree(rep); out: @@ -970,17 +991,6 @@ static void rpcrdma_rep_free(struct rpcrdma_rep *rep) kfree(rep); } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; - - spin_lock(&buf->rb_lock); - list_del(&rep->rr_all); - spin_unlock(&buf->rb_lock); - - rpcrdma_rep_free(rep); -} - static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1012,10 +1022,8 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { + list_for_each_entry(rep, &buf->rb_all_reps, rr_all) rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; /* Mark this rep for destruction */ - } } static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) @@ -1232,14 +1240,15 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) * or Replies they may be registered externally via frwr_map. 
*/ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, + int node) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); + rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node); if (!rb) return NULL; - rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); + rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node); if (!rb->rg_data) { kfree(rb); return NULL; @@ -1251,6 +1260,12 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) return rb; } +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) +{ + return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE); +} + /** * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer * @rb: regbuf to reallocate @@ -1328,10 +1343,9 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance * @needed: current credit grant - * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; @@ -1345,8 +1359,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; - if (!temp) - needed += RPCRDMA_MAX_RECV_BATCH; + needed += RPCRDMA_MAX_RECV_BATCH; if (atomic_inc_return(&ep->re_receiving) > 1) goto out; @@ -1355,17 +1368,17 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) wr = NULL; while (needed) { rep = rpcrdma_rep_get_locked(buf); - if (rep && rep->rr_temp) { - rpcrdma_rep_destroy(rep); - continue; - } if (!rep) - rep = rpcrdma_rep_create(r_xprt, temp); + rep = 
rpcrdma_rep_create(r_xprt); if (!rep) break; + if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { + rpcrdma_rep_put(buf, rep); + break; + } rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; - trace_xprtrdma_post_recv(rep); + trace_xprtrdma_post_recv(&rep->rr_cid); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; --needed; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5e5ff6784ef5..8147d2b41494 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -56,6 +56,7 @@ #include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#include <linux/sunrpc/rdma_rn.h> /* removal notifications */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -92,6 +93,7 @@ struct rpcrdma_ep { struct rpcrdma_connect_private re_cm_private; struct rdma_conn_param re_remote_cma; + struct rpcrdma_notification re_rn; int re_receive_count; unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ @@ -198,7 +200,6 @@ struct rpcrdma_rep { __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; - bool rr_temp; struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct rpc_rqst *rr_rqst; @@ -466,7 +467,7 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed); /* * Buffer calls - xprtrdma/verbs.c @@ -593,7 +594,6 @@ void xprt_rdma_cleanup(void); int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *); 
-int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index aaa5b2741b79..2e1fe6013361 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,17 +47,22 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> +#include <net/tls_prot.h> +#include <net/handshake.h> + #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uio.h> #include <linux/sched/mm.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); +static void xs_reset_srcport(struct sock_xprt *transport); static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -77,7 +82,7 @@ static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; /* * We can register our own files under /proc/sys/sunrpc by - * calling register_sysctl_table() again. The files in that + * calling register_sysctl() again. The files in that * directory become the union of all files registered there. 
* * We simply need to make sure that we don't collide with @@ -95,6 +100,7 @@ static struct ctl_table_header *sunrpc_table_header; static struct xprt_class xs_local_transport; static struct xprt_class xs_udp_transport; static struct xprt_class xs_tcp_transport; +static struct xprt_class xs_tcp_tls_transport; static struct xprt_class xs_bc_tcp_transport; /* @@ -154,16 +160,6 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, -}; - -static struct ctl_table sunrpc_table[] = { - { - .procname = "sunrpc", - .mode = 0555, - .child = xs_tunables_table - }, - { }, }; /* @@ -195,6 +191,11 @@ static struct ctl_table sunrpc_table[] = { */ #define XS_IDLE_DISC_TO (5U * 60 * HZ) +/* + * TLS handshake timeout. + */ +#define XS_TLS_HANDSHAKE_TO (10U * HZ) + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -261,7 +262,12 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); - strscpy(buf, sun->sun_path, sizeof(buf)); + if (sun->sun_path[0]) { + strscpy(buf, sun->sun_path, sizeof(buf)); + } else { + buf[0] = '@'; + strscpy(buf+1, sun->sun_path+1, sizeof(buf)-1); + } xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; @@ -350,6 +356,66 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) return want; } +static int +xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, + unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) +{ + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. 
+ */ + *msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? + -EACCES : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; + } + return ret; +} + +static int +xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) +{ + union { + struct cmsghdr cmsg; + u8 buf[CMSG_SPACE(sizeof(u8))]; + } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; + int ret; + + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, flags); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); + ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, + -EAGAIN); + } + return ret; +} + static ssize_t xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) { @@ -357,6 +423,12 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); ret = sock_recvmsg(sock, msg, flags); + /* Handle TLS inband control message lazily */ + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); + } return ret > 0 ? 
ret + seek : ret; } @@ -382,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); - return sock_recvmsg(sock, msg, flags); + return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE @@ -703,6 +775,8 @@ static void xs_poll_check_readable(struct sock_xprt *transport) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) + return; if (!xs_poll_socket_readable(transport)) return; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) @@ -726,6 +800,8 @@ static void xs_stream_data_receive(struct sock_xprt *transport) } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); + else if (ret == -EACCES) + xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: @@ -827,6 +903,17 @@ static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } +static void xs_stream_abort_send_request(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + + if (transport->xmit.offset != 0 && + !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + xprt_force_disconnect(xprt); +} + /* * Determine if the previous message in the stream was aborted before it * could complete transmission. 
@@ -1125,11 +1212,13 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + transport->xprt_err = 0; clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); + clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) @@ -1199,6 +1288,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); + tls_handshake_cancel(sk); + kernel_sock_shutdown(sock, SHUT_RDWR); mutex_lock(&transport->recv_mutex); @@ -1208,6 +1299,7 @@ static void xs_reset_transport(struct sock_xprt *transport) transport->file = NULL; sk->sk_user_data = NULL; + sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); @@ -1239,6 +1331,8 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); + if (transport->sock) + tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } @@ -1378,6 +1472,8 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; + trace_sk_data_ready(sk); + xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, @@ -1386,6 +1482,10 @@ static void xs_data_ready(struct sock *sk) trace_xs_data_ready(xprt); transport->old_data_ready(sk); + + if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) + return; + /* Any data means we had a useful conversation, so * then we don't need to delay the next reconnect */ @@ -1498,8 +1598,10 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE: if (test_and_clear_bit(XPRT_SOCK_CONNECTING, - 
&transport->sock_state)) + &transport->sock_state)) { + xs_reset_srcport(transport); xprt_clear_connecting(xprt); + } clear_bit(XPRT_CLOSING, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); @@ -1655,6 +1757,11 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) xs_update_peer_port(xprt); } +static void xs_reset_srcport(struct sock_xprt *transport) +{ + transport->srcport = 0; +} + static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) { if (transport->srcport == 0 && transport->xprt.reuseport) @@ -1738,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); - err = kernel_bind(sock, (struct sockaddr *)&myaddr, - transport->xprt.addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr, + transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; @@ -1854,6 +1961,9 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } + if (protocol == IPPROTO_TCP) + sk_net_refcnt_upgrade(sock->sk); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) return ERR_CAST(filp); @@ -1895,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_stream_start_connect(transport); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0); } /** @@ -2155,6 +2265,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) switch (skst) { case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: + case TCP_LAST_ACK: break; case TCP_ESTABLISHED: case TCP_CLOSE_WAIT: @@ -2170,9 +2281,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct net *net = 
sock_net(sock->sk); + unsigned long connect_timeout; + unsigned long syn_retries; unsigned int keepidle; unsigned int keepcnt; unsigned int timeo; + unsigned long t; spin_lock(&xprt->transport_lock); keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); @@ -2190,6 +2305,35 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); + + /* Connect timeout */ + connect_timeout = max_t(unsigned long, + DIV_ROUND_UP(xprt->connect_timeout, HZ), 1); + syn_retries = max_t(unsigned long, + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries), 1); + for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout; t++) + ; + if (t <= syn_retries) + tcp_sock_set_syncnt(sock->sk, t - 1); +} + +static void xs_tcp_do_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct rpc_timeout to; + unsigned long initval; + + memcpy(&to, xprt->timeout, sizeof(to)); + /* Arbitrary lower limit */ + initval = max_t(unsigned long, connect_timeout, XS_TCP_INIT_REEST_TO); + to.to_initval = initval; + to.to_maxval = initval; + to.to_retries = 0; + memcpy(&transport->tcp_timeout, &to, sizeof(transport->tcp_timeout)); + xprt->timeout = &transport->tcp_timeout; + xprt->connect_timeout = connect_timeout; } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2197,25 +2341,12 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, unsigned long reconnect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct rpc_timeout to; - unsigned long initval; spin_lock(&xprt->transport_lock); if (reconnect_timeout < xprt->max_reconnect_timeout) xprt->max_reconnect_timeout = reconnect_timeout; - if (connect_timeout < xprt->connect_timeout) { - memcpy(&to, xprt->timeout, sizeof(to)); - initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1); - /* Arbitrary lower limit 
*/ - if (initval < XS_TCP_INIT_REEST_TO << 1) - initval = XS_TCP_INIT_REEST_TO << 1; - to.to_initval = initval; - to.to_maxval = initval; - memcpy(&transport->tcp_timeout, &to, - sizeof(transport->tcp_timeout)); - xprt->timeout = &transport->tcp_timeout; - xprt->connect_timeout = connect_timeout; - } + if (connect_timeout < xprt->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, connect_timeout); set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); } @@ -2274,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), + xprt->addrlen, O_NONBLOCK); } /** @@ -2335,6 +2467,13 @@ static void xs_tcp_setup_socket(struct work_struct *work) transport->srcport = 0; status = -EAGAIN; break; + case -EPERM: + /* Happens, for instance, if a BPF program is preventing + * the connect. Remap the error so upper layers can better + * deal with it. + */ + status = -ECONNREFUSED; + fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. @@ -2346,6 +2485,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: + case -ENOTCONN: break; default: printk("%s: connect returned unhandled error %d\n", @@ -2365,6 +2505,275 @@ out_unlock: current_restore_flags(pflags, PF_MEMALLOC); } +/* + * Transfer the connected socket to @upper_transport, then mark that + * xprt CONNECTED. 
+ */ +static int xs_tcp_tls_finish_connecting(struct rpc_xprt *lower_xprt, + struct sock_xprt *upper_transport) +{ + struct sock_xprt *lower_transport = + container_of(lower_xprt, struct sock_xprt, xprt); + struct rpc_xprt *upper_xprt = &upper_transport->xprt; + + if (!upper_transport->inet) { + struct socket *sock = lower_transport->sock; + struct sock *sk = sock->sk; + + /* Avoid temporary address, they are bad for long-lived + * connections such as NFS mounts. + * RFC4941, section 3.6 suggests that: + * Individual applications, which have specific + * knowledge about the normal duration of connections, + * MAY override this as appropriate. + */ + if (xs_addr(upper_xprt)->sa_family == PF_INET6) + ip6_sock_set_addr_preferences(sk, IPV6_PREFER_SRC_PUBLIC); + + xs_tcp_set_socket_timeouts(upper_xprt, sock); + tcp_sock_set_nodelay(sk); + + lock_sock(sk); + + /* @sk is already connected, so it now has the RPC callbacks. + * Reach into @lower_transport to save the original ones. + */ + upper_transport->old_data_ready = lower_transport->old_data_ready; + upper_transport->old_state_change = lower_transport->old_state_change; + upper_transport->old_write_space = lower_transport->old_write_space; + upper_transport->old_error_report = lower_transport->old_error_report; + sk->sk_user_data = upper_xprt; + + /* socket options */ + sock_reset_flag(sk, SOCK_LINGER); + + xprt_clear_connected(upper_xprt); + + upper_transport->sock = sock; + upper_transport->inet = sk; + upper_transport->file = lower_transport->file; + + release_sock(sk); + + /* Reset lower_transport before shutting down its clnt */ + mutex_lock(&lower_transport->recv_mutex); + lower_transport->inet = NULL; + lower_transport->sock = NULL; + lower_transport->file = NULL; + + xprt_clear_connected(lower_xprt); + xs_sock_reset_connection_flags(lower_xprt); + xs_stream_reset_connect(lower_transport); + mutex_unlock(&lower_transport->recv_mutex); + } + + if (!xprt_bound(upper_xprt)) + return -ENOTCONN; + + 
xs_set_memalloc(upper_xprt); + + if (!xprt_test_and_set_connected(upper_xprt)) { + upper_xprt->connect_cookie++; + clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); + xprt_clear_connecting(upper_xprt); + + upper_xprt->stat.connect_count++; + upper_xprt->stat.connect_time += (long)jiffies - + upper_xprt->stat.connect_start; + xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); + } + return 0; +} + +/** + * xs_tls_handshake_done - TLS handshake completion handler + * @data: address of xprt to wake + * @status: status of handshake + * @peerid: serial number of key containing the remote's identity + * + */ +static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) +{ + struct rpc_xprt *lower_xprt = data; + struct sock_xprt *lower_transport = + container_of(lower_xprt, struct sock_xprt, xprt); + + switch (status) { + case 0: + case -EACCES: + case -ETIMEDOUT: + lower_transport->xprt_err = status; + break; + default: + lower_transport->xprt_err = -EACCES; + } + complete(&lower_transport->handshake_done); + xprt_put(lower_xprt); +} + +static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_parms *xprtsec) +{ + struct sock_xprt *lower_transport = + container_of(lower_xprt, struct sock_xprt, xprt); + struct tls_handshake_args args = { + .ta_sock = lower_transport->sock, + .ta_done = xs_tls_handshake_done, + .ta_data = xprt_get(lower_xprt), + .ta_peername = lower_xprt->servername, + }; + struct sock *sk = lower_transport->inet; + int rc; + + init_completion(&lower_transport->handshake_done); + set_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); + lower_transport->xprt_err = -ETIMEDOUT; + switch (xprtsec->policy) { + case RPC_XPRTSEC_TLS_ANON: + rc = tls_client_hello_anon(&args, GFP_KERNEL); + if (rc) + goto out_put_xprt; + break; + case RPC_XPRTSEC_TLS_X509: + args.ta_my_cert = xprtsec->cert_serial; + args.ta_my_privkey = xprtsec->privkey_serial; + rc = tls_client_hello_x509(&args, GFP_KERNEL); + if 
(rc) + goto out_put_xprt; + break; + default: + rc = -EACCES; + goto out_put_xprt; + } + + rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, + XS_TLS_HANDSHAKE_TO); + if (rc <= 0) { + tls_handshake_cancel(sk); + if (rc == 0) + rc = -ETIMEDOUT; + goto out_put_xprt; + } + + rc = lower_transport->xprt_err; + +out: + xs_stream_reset_connect(lower_transport); + clear_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); + return rc; + +out_put_xprt: + xprt_put(lower_xprt); + goto out; +} + +/** + * xs_tcp_tls_setup_socket - establish a TLS session on a TCP socket + * @work: queued work item + * + * Invoked by a work queue tasklet. + * + * For RPC-with-TLS, there is a two-stage connection process. + * + * The "upper-layer xprt" is visible to the RPC consumer. Once it has + * been marked connected, the consumer knows that a TCP connection and + * a TLS session have been established. + * + * A "lower-layer xprt", created in this function, handles the mechanics + * of connecting the TCP socket, performing the RPC_AUTH_TLS probe, and + * then driving the TLS handshake. Once all that is complete, the upper + * layer xprt is marked connected. 
+ */ +static void xs_tcp_tls_setup_socket(struct work_struct *work) +{ + struct sock_xprt *upper_transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_clnt *upper_clnt = upper_transport->clnt; + struct rpc_xprt *upper_xprt = &upper_transport->xprt; + struct rpc_create_args args = { + .net = upper_xprt->xprt_net, + .protocol = upper_xprt->prot, + .address = (struct sockaddr *)&upper_xprt->addr, + .addrsize = upper_xprt->addrlen, + .timeout = upper_clnt->cl_timeout, + .servername = upper_xprt->servername, + .program = upper_clnt->cl_program, + .prognumber = upper_clnt->cl_prog, + .version = upper_clnt->cl_vers, + .authflavor = RPC_AUTH_TLS, + .cred = upper_clnt->cl_cred, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + }, + .stats = upper_clnt->cl_stats, + }; + unsigned int pflags = current->flags; + struct rpc_clnt *lower_clnt; + struct rpc_xprt *lower_xprt; + int status; + + if (atomic_read(&upper_xprt->swapper)) + current->flags |= PF_MEMALLOC; + + xs_stream_start_connect(upper_transport); + + /* This implicitly sends an RPC_AUTH_TLS probe */ + lower_clnt = rpc_create(&args); + if (IS_ERR(lower_clnt)) { + trace_rpc_tls_unavailable(upper_clnt, upper_xprt); + clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); + xprt_clear_connecting(upper_xprt); + xprt_wake_pending_tasks(upper_xprt, PTR_ERR(lower_clnt)); + xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); + goto out_unlock; + } + + /* RPC_AUTH_TLS probe was successful. Try a TLS handshake on + * the lower xprt. 
+ */ + rcu_read_lock(); + lower_xprt = rcu_dereference(lower_clnt->cl_xprt); + rcu_read_unlock(); + + if (wait_on_bit_lock(&lower_xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + goto out_unlock; + + status = xs_tls_handshake_sync(lower_xprt, &upper_xprt->xprtsec); + if (status) { + trace_rpc_tls_not_started(upper_clnt, upper_xprt); + goto out_close; + } + + status = xs_tcp_tls_finish_connecting(lower_xprt, upper_transport); + if (status) + goto out_close; + xprt_release_write(lower_xprt, NULL); + trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); + rpc_shutdown_client(lower_clnt); + + /* Check for ingress data that arrived before the socket's + * ->data_ready callback was set up. + */ + xs_poll_check_readable(upper_transport); + +out_unlock: + current_restore_flags(pflags, PF_MEMALLOC); + upper_transport->clnt = NULL; + xprt_unlock_connect(upper_xprt, upper_transport); + return; + +out_close: + xprt_release_write(lower_xprt, NULL); + rpc_shutdown_client(lower_clnt); + + /* xprt_force_disconnect() wakes tasks with a fixed tk_status code. + * Wake them first here to ensure they get our tk_status code. 
+ */ + xprt_wake_pending_tasks(upper_xprt, status); + xs_tcp_force_close(upper_xprt); + xprt_clear_connecting(upper_xprt); + goto out_unlock; +} + /** * xs_connect - connect a socket to a remote endpoint * @xprt: pointer to transport structure @@ -2396,6 +2805,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) } else dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + transport->clnt = task->tk_client; queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, delay); @@ -2417,18 +2827,13 @@ static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) - return; - mutex_lock(&transport->recv_mutex); - if (transport->sock == NULL) - goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) - goto out; + return; sockerr = xchg(&transport->xprt_err, 0); - if (sockerr < 0) + if (sockerr < 0) { xprt_wake_pending_tasks(&transport->xprt, sockerr); -out: - mutex_unlock(&transport->recv_mutex); + xs_tcp_force_close(&transport->xprt); + } } static void xs_wake_pending(struct sock_xprt *transport) @@ -2636,20 +3041,11 @@ static int bc_send_request(struct rpc_rqst *req) return len; } -/* - * The close routine. Since this is client initiated, we do nothing - */ - static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } -/* - * The xprt destroy routine. 
Again, because this connection is client - * initiated, we do nothing - */ - static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); @@ -2670,6 +3066,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, + .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, @@ -2717,6 +3114,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, + .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, @@ -2863,7 +3261,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) switch (sun->sun_family) { case AF_LOCAL: - if (sun->sun_path[0] != '/') { + if (sun->sun_path[0] != '/' && sun->sun_path[0] != '\0') { dprintk("RPC: bad AF_LOCAL address: %s\n", sun->sun_path); ret = ERR_PTR(-EINVAL); @@ -3006,8 +3404,13 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + if (args->reconnect_timeout) + xprt->max_reconnect_timeout = args->reconnect_timeout; + xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); + if (args->connect_timeout) + xs_tcp_do_set_connect_timeout(xprt, args->connect_timeout); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); @@ -3050,6 +3453,94 @@ out_err: } /** + * xs_setup_tcp_tls - Set up transport to use a TCP with TLS + * @args: rpc transport creation arguments + * + */ +static struct rpc_xprt *xs_setup_tcp_tls(struct xprt_create *args) +{ + struct sockaddr *addr = args->dstaddr; + struct rpc_xprt 
*xprt; + struct sock_xprt *transport; + struct rpc_xprt *ret; + unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; + + if (args->flags & XPRT_CREATE_INFINITE_SLOTS) + max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; + + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, + max_slot_table_size); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xs_tcp_transport; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_tcp_ops; + xprt->timeout = &xs_tcp_default_timeout; + + xprt->max_reconnect_timeout = xprt->timeout->to_maxval; + xprt->connect_timeout = xprt->timeout->to_initval * + (xprt->timeout->to_retries + 1); + + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_WORK(&transport->error_worker, xs_error_handle); + + switch (args->xprtsec.policy) { + case RPC_XPRTSEC_TLS_ANON: + case RPC_XPRTSEC_TLS_X509: + xprt->xprtsec = args->xprtsec; + INIT_DELAYED_WORK(&transport->connect_worker, + xs_tcp_tls_setup_socket); + break; + default: + ret = ERR_PTR(-EACCES); + goto out_err; + } + + switch (addr->sa_family) { + case AF_INET: + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + xprt_set_bound(xprt); + + xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); + break; + case AF_INET6: + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + xprt_set_bound(xprt); + + xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + if (xprt_bound(xprt)) + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); + else + dprintk("RPC: set up xprt to %s (autobind) via %s\n", + 
xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PROTO]); + + if (try_module_get(THIS_MODULE)) + return xprt; + ret = ERR_PTR(-EINVAL); +out_err: + xs_xprt_free(xprt); + return ret; +} + +/** * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket * @args: rpc transport creation arguments * @@ -3158,6 +3649,15 @@ static struct xprt_class xs_tcp_transport = { .netid = { "tcp", "tcp6", "" }, }; +static struct xprt_class xs_tcp_tls_transport = { + .list = LIST_HEAD_INIT(xs_tcp_tls_transport.list), + .name = "tcp-with-tls", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_TCP_TLS, + .setup = xs_setup_tcp_tls, + .netid = { "tcp", "tcp6", "" }, +}; + static struct xprt_class xs_bc_tcp_transport = { .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list), .name = "tcp NFSv4.1 backchannel", @@ -3174,11 +3674,12 @@ static struct xprt_class xs_bc_tcp_transport = { int init_socket_xprt(void) { if (!sunrpc_table_header) - sunrpc_table_header = register_sysctl_table(sunrpc_table); + sunrpc_table_header = register_sysctl("sunrpc", xs_tunables_table); xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); xprt_register_transport(&xs_tcp_transport); + xprt_register_transport(&xs_tcp_tls_transport); xprt_register_transport(&xs_bc_tcp_transport); return 0; @@ -3198,6 +3699,7 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); xprt_unregister_transport(&xs_tcp_transport); + xprt_unregister_transport(&xs_tcp_tls_transport); xprt_unregister_transport(&xs_bc_tcp_transport); } |
