Diffstat (limited to 'net/sunrpc/svcsock.c')
-rw-r--r--   net/sunrpc/svcsock.c   1053
1 file changed, 608 insertions(+), 445 deletions(-)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a6a060925e5d..d61cd9b40491 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/svcsock.c * @@ -35,6 +36,8 @@ #include <linux/skbuff.h> #include <linux/file.h> #include <linux/freezer.h> +#include <linux/bvec.h> + #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -42,9 +45,12 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/tcp_states.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/uaccess.h> +#include <linux/highmem.h> #include <asm/ioctls.h> -#include <trace/events/skb.h> +#include <linux/key.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/clnt.h> @@ -54,10 +60,31 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sock.h> +#include <trace/events/sunrpc.h> + +#include "socklib.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + +/* To-do: to avoid tying up an nfsd thread while waiting for a + * handshake request, the request could instead be deferred. + */ +enum { + SVC_HANDSHAKE_TO = 5U * HZ +}; static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); @@ -106,33 +133,28 @@ static void svc_reclassify_socket(struct socket *sock) } #endif -/* - * Release an skbuff after use +/** + * svc_tcp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt + * */ -static void svc_release_skb(struct svc_rqst *rqstp) +static void svc_tcp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; - - if (skb) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - rqstp->rq_xprt_ctxt = NULL; - - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram_locked(svsk->sk_sk, skb); - } } -static void svc_release_udp_skb(struct svc_rqst *rqstp) +/** + * svc_udp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt + * + */ +static void svc_udp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; - - if (skb) { - rqstp->rq_xprt_ctxt = NULL; + struct sk_buff *skb = ctxt; - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + if (skb) consume_skb(skb); - } } union svc_pktinfo_u { @@ -173,109 +195,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -/* - * send routine intended to be shared by the fore- and back-channel - */ -int svc_send_common(struct socket *sock, struct xdr_buf *xdr, - struct page *headpage, unsigned long headoffset, - struct page *tailpage, unsigned long tailoffset) -{ - int result; - int size; - struct page **ppage = xdr->pages; - size_t base = xdr->page_base; - unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int slen; - int len = 0; - - slen = xdr->len; - - /* send head */ - if (slen == xdr->head[0].iov_len) - flags = 0; - len = kernel_sendpage(sock, headpage, headoffset, - xdr->head[0].iov_len, flags); - if (len != xdr->head[0].iov_len) 
- goto out; - slen -= xdr->head[0].iov_len; - if (slen == 0) - goto out; - - /* send page data */ - size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; - while (pglen > 0) { - if (slen == size) - flags = 0; - result = kernel_sendpage(sock, *ppage, base, size, flags); - if (result > 0) - len += result; - if (result != size) - goto out; - slen -= size; - pglen -= size; - size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; - base = 0; - ppage++; - } - - /* send tail */ - if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, tailpage, tailoffset, - xdr->tail[0].iov_len, 0); - if (result > 0) - len += result; - } - -out: - return len; -} - - -/* - * Generic sendto routine - */ -static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct socket *sock = svsk->sk_sock; - union { - struct cmsghdr hdr; - long all[SVC_PKTINFO_SPACE / sizeof(long)]; - } buffer; - struct cmsghdr *cmh = &buffer.hdr; - int len = 0; - unsigned long tailoff; - unsigned long headoff; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - if (rqstp->rq_prot == IPPROTO_UDP) { - struct msghdr msg = { - .msg_name = &rqstp->rq_addr, - .msg_namelen = rqstp->rq_addrlen, - .msg_control = cmh, - .msg_controllen = sizeof(buffer), - .msg_flags = MSG_MORE, - }; - - svc_set_cmsg_data(rqstp, cmh); - - if (sock_sendmsg(sock, &msg) < 0) - goto out; - } - - tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1); - headoff = 0; - len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff, - rqstp->rq_respages[0], tailoff); - -out: - dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n", - svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, - xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); - - return len; + return 0; } /* @@ -315,46 +238,153 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +static int +svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, + struct cmsghdr *cmsg, int ret) +{ + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? 
+ -ENOTCONN : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; + } + return ret; +} + +static int +svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags) +{ + union { + struct cmsghdr cmsg; + u8 buf[CMSG_SPACE(sizeof(u8))]; + } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; + int ret; + + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN); + } + return ret; +} + +static int +svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg) +{ + int ret; + struct socket *sock = svsk->sk_sock; + + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags); + } + return ret; +} + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) +{ + struct bvec_iter bi = { + .bi_size = size + seek, + }; + struct bio_vec bv; + + bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); + for_each_bvec(bv, bvec, bi, bi) + flush_dcache_page(bv.bv_page); +} +#else +static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size, + size_t seek) +{ +} +#endif + /* - * Generic recvfrom routine. + * Read from @rqstp's transport socket. The incoming message fills whole + * pages in @rqstp's rq_pages array until the last page of the message + * has been received into a partial page. */ -static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, - unsigned int nr, size_t buflen, unsigned int base) +static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, + size_t seek) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct bio_vec *bvec = rqstp->rq_bvec; struct msghdr msg = { NULL }; + unsigned int i; ssize_t len; - - rqstp->rq_xprt_hlen = 0; + size_t t; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); - if (base != 0) { - iov_iter_advance(&msg.msg_iter, base); - buflen -= base; + + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) + bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); + rqstp->rq_respages = &rqstp->rq_pages[i]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); + if (seek) { + iov_iter_advance(&msg.msg_iter, seek); + buflen -= seek; } - len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + len = svc_tcp_sock_recvmsg(svsk, &msg); + if (len > 0) + svc_flush_bvec(bvec, len, seek); + /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) 
*/ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", - svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } /* * Set socket snd and rcv buffer lengths */ -static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, - unsigned int rcv) +static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs) { + unsigned int max_mesg = svsk->sk_xprt.xpt_server->sv_max_mesg; + struct socket *sock = svsk->sk_sock; + + nreqs = min(nreqs, INT_MAX / 2 / max_mesg); + lock_sock(sock->sk); - sock->sk->sk_sndbuf = snd * 2; - sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_sndbuf = nreqs * max_mesg * 2; + sock->sk->sk_rcvbuf = nreqs * max_mesg * 2; sock->sk->sk_write_space(sock->sk); release_sock(sock->sk); } @@ -374,14 +404,15 @@ static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - if (svsk) { - dprintk("svc: socket %p(inet %p), busy=%d\n", - svsk, sk, - test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); svsk->sk_odata(sk); + trace_svcsock_data_ready(&svsk->sk_xprt, 0); + if (test_bit(XPT_HANDSHAKE, &svsk->sk_xprt.xpt_flags)) + return; if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); } @@ -395,11 +426,9 @@ static void svc_write_space(struct sock *sk) struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); if (svsk) { - dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - /* Refer to svc_setup_socket() for details. */ rmb(); + trace_svcsock_write_space(&svsk->sk_xprt, 0); svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } @@ -416,17 +445,91 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + sock_no_linger(svsk->sk_sock->sk); +} + +/** + * svc_tcp_handshake_done - Handshake completion handler + * @data: address of xprt to wake + * @status: status of handshake + * @peerid: serial number of key containing the remote peer's identity + * + * If a security policy is specified as an export option, we don't + * have a specific export here to check. So we set a "TLS session + * is present" flag on the xprt and let an upper layer enforce local + * security policy. 
+ */ +static void svc_tcp_handshake_done(void *data, int status, key_serial_t peerid) +{ + struct svc_xprt *xprt = data; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + if (!status) { + if (peerid != TLS_NO_PEERID) + set_bit(XPT_PEER_AUTH, &xprt->xpt_flags); + set_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + } + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + complete_all(&svsk->sk_handshake_done); +} + +/** + * svc_tcp_handshake - Perform a transport-layer security handshake + * @xprt: connected transport endpoint + * + */ +static void svc_tcp_handshake(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sock->sk; + struct tls_handshake_args args = { + .ta_sock = svsk->sk_sock, + .ta_done = svc_tcp_handshake_done, + .ta_data = xprt, }; + int ret; + + trace_svc_tls_upcall(xprt); + + clear_bit(XPT_TLS_SESSION, &xprt->xpt_flags); + init_completion(&svsk->sk_handshake_done); + + ret = tls_server_hello_x509(&args, GFP_KERNEL); + if (ret) { + trace_svc_tls_not_started(xprt); + goto out_failed; + } + + ret = wait_for_completion_interruptible_timeout(&svsk->sk_handshake_done, + SVC_HANDSHAKE_TO); + if (ret <= 0) { + if (tls_handshake_cancel(sk)) { + trace_svc_tls_timed_out(xprt); + goto out_close; + } + } - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) { + trace_svc_tls_unavailable(xprt); + goto out_close; + } + + /* Mark the transport ready in case the remote sent RPC + * traffic before the kernel received the handshake + * completion downcall. + */ + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + return; + +out_close: + set_bit(XPT_CLOSE, &xprt->xpt_flags); +out_failed: + clear_bit(XPT_HANDSHAKE, &xprt->xpt_flags); + set_bit(XPT_DATA, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); } /* @@ -484,8 +587,15 @@ static int svc_udp_get_dest_address(struct svc_rqst *rqstp, return 0; } -/* - * Receive a datagram from a UDP socket. +/** + * svc_udp_recvfrom - Receive a datagram from a UDP socket. + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return */ static int svc_udp_recvfrom(struct svc_rqst *rqstp) { @@ -516,25 +626,17 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) * provides an upper bound on the number of threads * which will access the socket. 
*/ - svc_sock_setbufsize(svsk->sk_sock, - (serv->sv_nrthreads+3) * serv->sv_max_mesg, - (serv->sv_nrthreads+3) * serv->sv_max_mesg); + svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3); clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); - if (err >= 0) - skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err); - - if (skb == NULL) { - if (err != -EAGAIN) { - /* possibly an icmp error */ - dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - } - return 0; - } + if (err < 0) + goto out_recv_err; + skb = skb_recv_udp(svsk->sk_sk, MSG_DONTWAIT, &err); + if (!skb) + goto out_recv_err; + len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; if (skb->tstamp == 0) { @@ -545,26 +647,21 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ - len = skb->len; + len = skb->len; rqstp->rq_arg.len = len; + trace_svcsock_udp_recv(&svsk->sk_xprt, len); rqstp->rq_prot = IPPROTO_UDP; - if (!svc_udp_get_dest_address(rqstp, cmh)) { - net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", - cmh->cmsg_level, cmh->cmsg_type); - goto out_free; - } + if (!svc_udp_get_dest_address(rqstp, cmh)) + goto out_cmsg_err; rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); if (skb_is_nonlinear(skb)) { /* we have to copy */ local_bh_disable(); - if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { - local_bh_enable(); - /* checksum error */ - goto out_free; - } + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) + goto out_bh_enable; local_bh_enable(); consume_skb(skb); } else { @@ -591,23 +688,89 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->netudpcnt++; + svc_sock_secure_port(rqstp); + svc_xprt_received(rqstp->rq_xprt); return len; + +out_recv_err: + if (err != -EAGAIN) { + /* possibly an icmp error */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } + trace_svcsock_udp_recv_err(&svsk->sk_xprt, err); + goto out_clear_busy; +out_cmsg_err: + net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", + cmh->cmsg_level, cmh->cmsg_type); + goto out_free; +out_bh_enable: + local_bh_enable(); out_free: kfree_skb(skb); +out_clear_busy: + svc_xprt_received(rqstp->rq_xprt); return 0; } -static int -svc_udp_sendto(struct svc_rqst *rqstp) +/** + * svc_udp_sendto - Send out a reply on a UDP socket + * @rqstp: completed svc_rqst + * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * + * Returns the number of bytes sent, or a negative errno. 
+ */ +static int svc_udp_sendto(struct svc_rqst *rqstp) { - int error; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + struct msghdr msg = { + .msg_name = &rqstp->rq_addr, + .msg_namelen = rqstp->rq_addrlen, + .msg_control = cmh, + .msg_flags = MSG_SPLICE_PAGES, + .msg_controllen = sizeof(buffer), + }; + unsigned int count; + int err; + + svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; + + svc_set_cmsg_data(rqstp, cmh); + + mutex_lock(&xprt->xpt_mutex); + + if (svc_xprt_is_dead(xprt)) + goto out_notconn; - error = svc_sendto(rqstp, &rqstp->rq_res); - if (error == -ECONNREFUSED) + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - error = svc_sendto(rqstp, &rqstp->rq_res); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + } + + trace_svcsock_udp_send(xprt, err); + + mutex_unlock(&xprt->xpt_mutex); + return err; - return error; +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; } static int svc_udp_has_wspace(struct svc_xprt *xprt) @@ -650,12 +813,12 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_release_rqst = svc_release_udp_skb, + .xpo_result_payload = svc_sock_result_payload, + .xpo_release_ctxt = svc_udp_release_ctxt, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, - .xpo_secure_port = svc_sock_secure_port, .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt, }; @@ -669,8 +832,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -681,30 +842,24 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) * receive and respond to one request. 
* svc_udp_recvfrom will re-adjust if necessary */ - svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); + svc_sock_setbufsize(svsk, 3); /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; + ip_sock_set_pktinfo(svsk->sk_sock->sk); break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* @@ -715,14 +870,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (listen) state change %d\n", - sk, sk->sk_state); - - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } + trace_sk_data_ready(sk); /* * This callback may called twice when a new connection @@ -732,14 +880,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. */ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } else - printk("svc: socket %p: no user data\n", sk); + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } } @@ -750,19 +902,13 @@ static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", - sk, sk->sk_state, sk->sk_user_data); - - if (!svsk) - printk("svc: socket %p: no user data\n", sk); - else { + if (svsk) { /* Refer to svc_setup_socket() for details. 
*/ rmb(); svsk->sk_ostate(sk); - if (sk->sk_state != TCP_ESTABLISHED) { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } + trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock); + if (sk->sk_state != TCP_ESTABLISHED) + svc_xprt_deferred_close(&svsk->sk_xprt); } } @@ -779,45 +925,29 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) struct socket *newsock; struct svc_sock *newsvsk; int err, slen; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) return NULL; clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { - if (err == -ENOMEM) - printk(KERN_WARNING "%s: no more sockets!\n", - serv->sv_name); - else if (err != -EAGAIN) - net_warn_ratelimited("%s: accept failed (err %d)!\n", - serv->sv_name, -err); + if (err != -EAGAIN) + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } + if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL))) + return NULL; + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin); if (err < 0) { - net_warn_ratelimited("%s: peername failed (err %d)!\n", - serv->sv_name, -err); + trace_svcsock_getpeername_err(xprt, serv->sv_name, err); goto failed; /* aborted connection or whatever */ } slen = err; - /* Ideally, we would want to reject connections from unauthorized - * hosts here, but when we get encryption, the IP of the host won't - * tell us anything. For now just warn about unpriv connections. - */ - if (!svc_port_is_privileged(sin)) { - dprintk("%s: connect from unprivileged port: %s\n", - serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - } - dprintk("%s: connect from %s\n", serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - /* Reset the inherited callbacks before calling svc_setup_socket */ newsock->sk->sk_state_change = svsk->sk_ostate; newsock->sk->sk_data_ready = svsk->sk_odata; @@ -835,10 +965,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin); slen = err; - if (unlikely(err < 0)) { - dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); + if (unlikely(err < 0)) slen = offsetof(struct sockaddr, sa_data); - } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); if (sock_is_loopback(newsock->sk)) @@ -851,17 +979,18 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) return &newsvsk->sk_xprt; failed: - sock_release(newsock); + sockfd_put(newsock); return NULL; } -static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +static size_t svc_tcp_restore_pages(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - unsigned int i, len, npages; + size_t len = svsk->sk_datalen; + unsigned int i, npages; - if (svsk->sk_datalen == 0) + if (!len) return 0; - len = svsk->sk_datalen; npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) @@ -910,47 +1039,46 @@ out: } /* - * Receive fragment record header. - * If we haven't gotten the record length yet, get the next four bytes. + * Receive fragment record header into sk_marker. 
*/ -static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) +static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - unsigned int want; - int len; + ssize_t want, len; + /* If we haven't gotten the record length yet, + * get the next four bytes. + */ if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { + struct msghdr msg = { NULL }; struct kvec iov; want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; - iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; - len = svc_recvfrom(rqstp, &iov, 1, want, 0); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len < 0) - goto error; + return len; svsk->sk_tcplen += len; - if (len < want) { - dprintk("svc: short recvfrom while reading record " - "length (%d of %d)\n", len, want); - return -EAGAIN; + /* call again to read the remaining bytes */ + goto err_short; } - - dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk)); + trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker); if (svc_sock_reclen(svsk) + svsk->sk_datalen > - serv->sv_max_mesg) { - net_notice_ratelimited("RPC: fragment too large: %d\n", - svc_sock_reclen(svsk)); - goto err_delete; - } + svsk->sk_xprt.xpt_server->sv_max_mesg) + goto err_too_large; } - return svc_sock_reclen(svsk); -error: - dprintk("RPC: TCP recv_record got %d\n", len); - return len; -err_delete: - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + +err_too_large: + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); + svc_xprt_deferred_close(&svsk->sk_xprt); +err_short: return -EAGAIN; } @@ -960,18 +1088,14 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) struct rpc_rqst *req = NULL; struct kvec *src, *dst; __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - __be32 xid; - __be32 calldir; - - xid = *p++; - calldir = *p; + __be32 xid = *p; if (!bc_xprt) return -EAGAIN; spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) - goto unlock_notfound; + goto unlock_eagain; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); /* @@ -988,98 +1112,63 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) rqstp->rq_arg.len = 0; spin_unlock(&bc_xprt->queue_lock); return 0; -unlock_notfound: - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, ntohl(xid)); unlock_eagain: spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; } -static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) -{ - int i = 0; - int t = 0; - - while (t < len) { - vec[i].iov_base = page_address(pages[i]); - vec[i].iov_len = PAGE_SIZE; - i++; - t += PAGE_SIZE; - } - return i; -} - static void svc_tcp_fragment_received(struct svc_sock *svsk) { /* If we have more data, signal svc_xprt_enqueue() to try again */ - dprintk("svc: TCP %s record (%d bytes)\n", - svc_sock_final_rec(svsk) ? "final" : "nonfinal", - svc_sock_reclen(svsk)); svsk->sk_tcplen = 0; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; } -/* - * Receive data from a TCP socket. 
+/** + * svc_tcp_recvfrom - Receive data from a TCP socket + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Read the 4-byte stream record marker, then use the record length + * in that marker to set up exactly the resources needed to receive + * the next RPC message into @rqstp. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return + * + * The zero return case handles partial receives and callback Replies. + * The state of a partial receive is preserved in the svc_sock for + * the next call to svc_tcp_recvfrom. */ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int len; - struct kvec *vec; - unsigned int want, base; + size_t want, base; + ssize_t len; __be32 *p; __be32 calldir; - int pnum; - - dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - len = svc_tcp_recv_record(svsk, rqstp); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + len = svc_tcp_read_marker(svsk, rqstp); if (len < 0) goto error; base = svc_tcp_restore_pages(svsk, rqstp); - want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); - - vec = rqstp->rq_vec; - - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); - - rqstp->rq_respages = &rqstp->rq_pages[pnum]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, base + want, base); + want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); + len = svc_tcp_read_msg(rqstp, base + want, base); if (len >= 0) { + trace_svcsock_tcp_recv(&svsk->sk_xprt, len); svsk->sk_tcplen += len; svsk->sk_datalen += len; } - if (len != want || !svc_sock_final_rec(svsk)) { - svc_tcp_save_pages(svsk, rqstp); - if (len < 0 && len != -EAGAIN) - goto err_delete; - if (len == want) - svc_tcp_fragment_received(svsk); - else - dprintk("svc: incomplete TCP record (%d of %d)\n", - (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)), - svc_sock_reclen(svsk)); - goto err_noclose; - } - - if (svsk->sk_datalen < 8) { - svsk->sk_datalen = 0; - goto err_delete; /* client is nuts. 
*/ - } + if (len != want || !svc_sock_final_rec(svsk)) + goto err_incomplete; + if (svsk->sk_datalen < 8) + goto err_nuts; rqstp->rq_arg.len = svsk->sk_datalen; rqstp->rq_arg.page_base = 0; @@ -1112,50 +1201,113 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (serv->sv_stats) serv->sv_stats->nettcpcnt++; + svc_sock_secure_port(rqstp); + svc_xprt_received(rqstp->rq_xprt); return rqstp->rq_arg.len; +err_incomplete: + svc_tcp_save_pages(svsk, rqstp); + if (len < 0 && len != -EAGAIN) + goto err_delete; + if (len == want) + svc_tcp_fragment_received(svsk); + else + trace_svcsock_tcp_recv_short(&svsk->sk_xprt, + svc_sock_reclen(svsk), + svsk->sk_tcplen - sizeof(rpc_fraghdr)); + goto err_noclose; error: if (len != -EAGAIN) goto err_delete; - dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return 0; + trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0); + goto err_noclose; +err_nuts: + svsk->sk_datalen = 0; err_delete: - printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_xprt.xpt_server->sv_name, -len); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len); + svc_xprt_deferred_close(&svsk->sk_xprt); err_noclose: + svc_xprt_received(rqstp->rq_xprt); return 0; /* record not complete */ } /* - * Send out data on TCP socket. + * MSG_SPLICE_PAGES is used exclusively to reduce the number of + * copy operations in this path. Therefore the caller must ensure + * that the pages backing @xdr are unchanging. + */ +static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, + rpc_fraghdr marker) +{ + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES, + }; + unsigned int count; + void *buf; + int ret; + + /* The stream record marker is copied into a temporary page + * fragment buffer so that it can be included in sk_bvec. + */ + buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &marker, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); + + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, + &rqstp->rq_res); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, + 1 + count, sizeof(marker) + rqstp->rq_res.len); + ret = sock_sendmsg(svsk->sk_sock, &msg); + page_frag_free(buf); + return ret; +} + +/** + * svc_tcp_sendto - Send out a reply on a TCP socket + * @rqstp: completed svc_rqst + * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * + * Returns the number of bytes sent, or a negative errno. */ static int svc_tcp_sendto(struct svc_rqst *rqstp) { - struct xdr_buf *xbufp = &rqstp->rq_res; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | + (u32)xdr->len); int sent; - __be32 reclen; - /* Set up the first element of the reply kvec. - * Any other kvecs that may be in use have been taken - * care of by the server implementation itself. 
- */ - reclen = htonl(0x80000000|((xbufp->len ) - 4)); - memcpy(xbufp->head[0].iov_base, &reclen, 4); - - sent = svc_sendto(rqstp, &rqstp->rq_res); - if (sent != xbufp->len) { - printk(KERN_NOTICE - "rpc-srv/tcp: %s: %s %d when sending %d bytes " - "- shutting down socket\n", - rqstp->rq_xprt->xpt_server->sv_name, - (sent<0)?"got error":"sent only", - sent, xbufp->len); - set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); - svc_xprt_enqueue(rqstp->rq_xprt); - sent = -EAGAIN; - } + svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; + + mutex_lock(&xprt->xpt_mutex); + if (svc_xprt_is_dead(xprt)) + goto out_notconn; + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) + goto out_close; + mutex_unlock(&xprt->xpt_mutex); return sent; + +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; +out_close: + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", + xprt->xpt_server->sv_name, + (sent < 0) ? "got error" : "sent", + sent, xdr->len + sizeof(marker)); + svc_xprt_deferred_close(xprt); + mutex_unlock(&xprt->xpt_mutex); + return -EAGAIN; } static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, @@ -1170,13 +1322,14 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, - .xpo_release_rqst = svc_release_skb, + .xpo_result_payload = svc_sock_result_payload, + .xpo_release_ctxt = svc_tcp_release_ctxt, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, - .xpo_secure_port = svc_sock_secure_port, .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, + .xpo_handshake = svc_tcp_handshake, }; static struct svc_xprt_class svc_tcp_class = { @@ -1208,23 +1361,23 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { - dprintk("setting up TCP socket for listening\n"); strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { - dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; sk->sk_data_ready = svc_data_ready; sk->sk_write_space = svc_write_space; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; - memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + memset(&svsk->sk_pages[0], 0, + svsk->sk_maxpages * sizeof(struct page *)); - tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + tcp_sock_set_nodelay(sk); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); switch (sk->sk_state) { @@ -1232,7 +1385,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) case TCP_ESTABLISHED: break; default: - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_deferred_close(&svsk->sk_xprt); } } } @@ -1250,7 +1403,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); spin_unlock_bh(&serv->sv_lock); } -EXPORT_SYMBOL_GPL(svc_sock_update_bufs); + +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & 
SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} /* * Initialize socket for RPC use and create svc_sock struct @@ -1262,24 +1428,41 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int err = 0; + int sendpages; + unsigned long pages; - dprintk("svc: svc_setup_socket %p\n", sock); - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + + pages = svc_serv_maxpages(serv); + svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + + svsk->sk_maxpages = pages; + inet = sock->sk; - /* Register socket with portmapper */ - if (pmap_register) + if (pmap_register) { + int err; + err = svc_register(serv, sock_net(sock->sk), inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); - - if (err < 0) { - kfree(svsk); - return ERR_PTR(err); + if (err < 0) { + kfree(svsk->sk_bvec); + kfree(svsk); + return ERR_PTR(err); + } } svsk->sk_sock = sock; @@ -1289,7 +1472,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_owspace = inet->sk_write_space; /* * This barrier is necessary in order to prevent race condition - * with svc_data_ready(), svc_listen_data_ready() and others + * with svc_data_ready(), svc_tcp_listen_data_ready(), and others * when calling callbacks above. */ wmb(); @@ -1301,44 +1484,25 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - dprintk("svc: svc_setup_socket created %p (inet %p), " - "listen %d close %d\n", - svsk, svsk->sk_sk, - test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - + trace_svcsock_new(svsk, sock); return svsk; } -bool svc_alien_sock(struct net *net, int fd) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - bool ret = false; - - if (!sock) - goto out; - if (sock_net(sock->sk) != net) - ret = true; - sockfd_put(sock); -out: - return ret; -} -EXPORT_SYMBOL_GPL(svc_alien_sock); - /** * svc_addsock - add a listener socket to an RPC service * @serv: pointer to RPC service to which to add a new listener + * @net: caller's network namespace * @fd: file descriptor of the new listener * @name_return: pointer to buffer to fill in with name of listener * @len: size of the buffer + * @cred: credential * * Fills in socket name and returns positive length of name if successful. * Name is terminated with '\n'. On error, returns a negative errno * value. 
*/ -int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, - const size_t len) +int svc_addsock(struct svc_serv *serv, struct net *net, const int fd, + char *name_return, const size_t len, const struct cred *cred) { int err = 0; struct socket *so = sockfd_lookup(fd, &err); @@ -1349,6 +1513,9 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, if (!so) return err; + err = -EINVAL; + if (sock_net(so->sk) != net) + goto out; err = -EAFNOSUPPORT; if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6)) goto out; @@ -1371,6 +1538,7 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, salen = kernel_getsockname(svsk->sk_sock, sin); if (salen >= 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); + svsk->sk_xprt.xpt_cred = get_cred(cred); svc_add_new_perm_xprt(serv, &svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); out: @@ -1396,12 +1564,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - dprintk("svc: svc_create_socket(%s, %d, %s)\n", - serv->sv_program->pg_name, protocol, - __svc_print_addr(sin, buf, sizeof(buf))); if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " @@ -1432,14 +1594,11 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; @@ -1449,7 +1608,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - if ((error = kernel_listen(sock, 64)) < 0) + sk_net_refcnt_upgrade(sock->sk); + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } @@ -1461,7 +1621,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); return (struct svc_xprt *)svsk; bummer: - dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); return ERR_PTR(error); } @@ -1475,8 +1634,6 @@ static void svc_sock_detach(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - dprintk("svc: svc_sock_detach(%p)\n", svsk); - /* put back the old socket callbacks */ lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; @@ -1493,7 +1650,7 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk); + tls_handshake_close(svsk->sk_sock); svc_sock_detach(xprt); @@ -1509,11 +1666,17 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_sock_free(%p)\n", svsk); + struct socket *sock = svsk->sk_sock; - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); + trace_svcsock_free(svsk, sock); + + tls_handshake_cancel(sock->sk); + if (sock->file) + sockfd_put(sock); else - 
sock_release(svsk->sk_sock); + sock_release(sock); + + page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } |
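
Note on the reworked send path above: svc_tcp_sendto()/svc_tcp_sendmsg() now prepend a 4-byte RPC stream record marker (high bit = last fragment, low 31 bits = fragment length) and push the marker plus the xdr_buf payload to the socket in a single sock_sendmsg() call built from a bio_vec array. The following is a minimal user-space sketch of that framing only, not kernel code: the connected socket descriptor fd and the payload buffer are hypothetical, and writev() stands in for the kernel's MSG_SPLICE_PAGES sendmsg path.

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #define RPC_LAST_STREAM_FRAGMENT 0x80000000UL

    /* Send one RPC reply as a single "last fragment" record:
     * a 4-byte big-endian marker with the high bit set, followed by
     * the payload, gathered into one writev() so marker and data go
     * out in a single send, mirroring the single sock_sendmsg() above.
     */
    static ssize_t send_rpc_record(int fd, const void *payload, uint32_t len)
    {
        uint32_t marker = htonl(RPC_LAST_STREAM_FRAGMENT | len);
        struct iovec iov[2] = {
            { .iov_base = &marker,          .iov_len = sizeof(marker) },
            { .iov_base = (void *)payload,  .iov_len = len },
        };

        return writev(fd, iov, 2);
    }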
