diff options
Diffstat (limited to 'net/kcm/kcmsock.c')
| -rw-r--r-- | net/kcm/kcmsock.c | 682 |
1 files changed, 261 insertions, 421 deletions
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index da49191f7ad0..5dd7e0509a48 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1,17 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel Connection Multiplexor * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. */ #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/file.h> +#include <linux/filter.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> @@ -21,6 +19,7 @@ #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> +#include <linux/splice.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <linux/syscalls.h> @@ -30,6 +29,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> +#include <trace/events/sock.h> unsigned int kcm_net_id; @@ -50,7 +50,7 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; - csk->sk_error_report(csk); + sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, @@ -96,12 +96,12 @@ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, - psock->strp.stats.rx_bytes - + psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->strp.stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->strp.stats.rx_msgs; - psock->saved_rx_bytes = psock->strp.stats.rx_bytes; + psock->strp.stats.msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.msgs; + psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -164,7 +164,8 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); - kcm->rx_wait = true; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) @@ -180,7 +181,7 @@ static void kcm_rfree(struct sk_buff *skb) /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); - if (!kcm->rx_wait && !kcm->rx_psock && + if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); @@ -223,7 +224,7 @@ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) struct sk_buff *skb; struct kcm_sock *kcm; - while ((skb = __skb_dequeue(head))) { + while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); @@ -239,7 +240,8 @@ try_again: if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); @@ -282,10 +284,12 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; - kcm->rx_psock = psock; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); @@ -312,7 +316,8 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; - kcm->rx_psock = NULL; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree @@ -346,6 +351,8 @@ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; @@ -381,8 +388,10 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; + int res; - return (*prog->bpf_func)(skb, prog->insnsi); + res = bpf_prog_run_pin_on_cpu(prog, skb); + return res; } static int kcm_read_sock_done(struct strparser *strp, int err) @@ -396,8 +405,8 @@ static int kcm_read_sock_done(struct strparser *strp, int err) static void psock_state_change(struct sock *sk) { - /* TCP only does a POLLIN for a half close. Do a POLLHUP here - * since application will normally not poll with POLLIN + /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here + * since application will normally not poll with EPOLLIN * on the TCP sockets. */ @@ -421,7 +430,7 @@ static void psock_write_space(struct sock *sk) /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm && !unlikely(kcm->tx_stopped)) + if (kcm) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -573,12 +582,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm) */ static int kcm_write_msgs(struct kcm_sock *kcm) { + unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; - struct sk_buff *skb, *head; - struct kcm_tx_msg *txm; - unsigned short fragidx, frag_offset; - unsigned int sent, total_sent = 0; + struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; @@ -592,123 +599,108 @@ static int kcm_write_msgs(struct kcm_sock *kcm) if (skb_queue_empty(&sk->sk_write_queue)) return 0; - kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; - - } else if (skb_queue_empty(&sk->sk_write_queue)) { - return 0; + kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } - head = skb_peek(&sk->sk_write_queue); - txm = kcm_tx_msg(head); - - if (txm->sent) { - /* Send of first skbuff in queue already in progress */ - if (WARN_ON(!psock)) { - ret = -EINVAL; - goto out; +retry: + while ((head = skb_peek(&sk->sk_write_queue))) { + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, + }; + struct kcm_tx_msg *txm = kcm_tx_msg(head); + struct sk_buff *skb; + unsigned int msize; + int i; + + if (!txm->started_tx) { + psock = reserve_psock(kcm); + if (!psock) + goto out; + skb = head; + txm->frag_offset = 0; + txm->sent = 0; + txm->started_tx = true; + } else { + if (WARN_ON(!psock)) { + ret = -EINVAL; + goto out; + } + skb = txm->frag_skb; } - sent = txm->sent; - frag_offset = txm->frag_offset; - fragidx = txm->fragidx; - skb = txm->frag_skb; - goto do_frag; - } - -try_again: - psock = reserve_psock(kcm); - if (!psock) - goto out; - - do { - skb = head; - txm = kcm_tx_msg(head); - sent = 0; - -do_frag_list: - if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { + if (WARN_ON(!skb_shinfo(skb)->nr_frags) || + WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } - for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; - fragidx++) { - skb_frag_t *frag; + msize = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); - frag_offset = 0; -do_frag: - frag = &skb_shinfo(skb)->frags[fragidx]; - if (WARN_ON(!frag->size)) { - ret = -EINVAL; - goto out; - } + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, + (const struct bio_vec *)skb_shinfo(skb)->frags, + skb_shinfo(skb)->nr_frags, msize); + iov_iter_advance(&msg.msg_iter, txm->frag_offset); - ret = kernel_sendpage(psock->sk->sk_socket, - frag->page.p, - frag->page_offset + frag_offset, - frag->size - frag_offset, - MSG_DONTWAIT); + do { + ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ - txm->sent = sent; - txm->frag_offset = frag_offset; - txm->fragidx = fragidx; txm->frag_skb = skb; - ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing - * synchonization and retry sending the + * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); + psock = NULL; - txm->sent = 0; + txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; - - goto try_again; + goto retry; } - sent += ret; - frag_offset += ret; + txm->sent += ret; + txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); - if (frag_offset < frag->size) { - /* Not finished with this frag */ - goto do_frag; - } - } + } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { - skb = skb_shinfo(skb)->frag_list; - goto do_frag_list; + txm->frag_skb = skb_shinfo(skb)->frag_list; + txm->frag_offset = 0; + continue; } } else if (skb->next) { - skb = skb->next; - goto do_frag_list; + txm->frag_skb = skb->next; + txm->frag_offset = 0; + continue; } /* Successfully sent the whole packet, account for it. */ + sk->sk_wmem_queued -= txm->sent; + total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); - sk->sk_wmem_queued -= sent; - total_sent += sent; KCM_STATS_INCR(psock->stats.tx_msgs); - } while ((head = skb_peek(&sk->sk_write_queue))); + } out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); - unreserve_psock(kcm); + if (psock) + unreserve_psock(kcm); } /* Check if write space is available */ @@ -753,149 +745,6 @@ static void kcm_push(struct kcm_sock *kcm) kcm_write_msgs(kcm); } -static ssize_t kcm_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) - -{ - struct sock *sk = sock->sk; - struct kcm_sock *kcm = kcm_sk(sk); - struct sk_buff *skb = NULL, *head = NULL; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - bool eor; - int err = 0; - int i; - - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - - /* No MSG_EOR from splice, only look at MSG_MORE */ - eor = !(flags & MSG_MORE); - - lock_sock(sk); - - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - err = -EPIPE; - if (sk->sk_err) - goto out_error; - - if (kcm->seq_skb) { - /* Previously opened message */ - head = kcm->seq_skb; - skb = kcm_tx_msg(head)->last_skb; - i = skb_shinfo(skb)->nr_frags; - - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - goto coalesced; - } - - if (i >= MAX_SKB_FRAGS) { - struct sk_buff *tskb; - - tskb = alloc_skb(0, sk->sk_allocation); - while (!tskb) { - kcm_push(kcm); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - if (head == skb) - skb_shinfo(head)->frag_list = tskb; - else - skb->next = tskb; - - skb = tskb; - skb->ip_summed = CHECKSUM_UNNECESSARY; - i = 0; - } - } else { - /* Call the sk_stream functions to manage the sndbuf mem. */ - if (!sk_stream_memory_free(sk)) { - kcm_push(kcm); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - head = alloc_skb(0, sk->sk_allocation); - while (!head) { - kcm_push(kcm); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - } - - skb = head; - i = 0; - } - - get_page(page); - skb_fill_page_desc(skb, i, page, offset, size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - -coalesced: - skb->len += size; - skb->data_len += size; - skb->truesize += size; - sk->sk_wmem_queued += size; - sk_mem_charge(sk, size); - - if (head != skb) { - head->len += size; - head->data_len += size; - head->truesize += size; - } - - if (eor) { - bool not_busy = skb_queue_empty(&sk->sk_write_queue); - - /* Message complete, queue it on send buffer */ - __skb_queue_tail(&sk->sk_write_queue, head); - kcm->seq_skb = NULL; - KCM_STATS_INCR(kcm->stats.tx_msgs); - - if (flags & MSG_BATCH) { - kcm->tx_wait_more = true; - } else if (kcm->tx_wait_more || not_busy) { - err = kcm_write_msgs(kcm); - if (err < 0) { - /* We got a hard error in write_msgs but have - * already queued this message. Report an error - * in the socket, but don't affect return value - * from sendmsg - */ - pr_warn("KCM: Hard failure on kcm_write_msgs\n"); - report_csk_error(&kcm->sk, -err); - } - } - } else { - /* Message not complete, save state */ - kcm->seq_skb = head; - kcm_tx_msg(head)->last_skb = skb; - } - - KCM_STATS_ADD(kcm->stats.tx_bytes, size); - - release_sock(sk); - return size; - -out_error: - kcm_push(kcm); - - err = sk_stream_error(sk, flags, err); - - /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) - sk->sk_write_space(sk); - - release_sock(sk); - return err; -} - static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -907,6 +756,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; + mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ @@ -981,29 +831,51 @@ start: merge = false; } - copy = min_t(int, msg_data_left(msg), - pfrag->size - pfrag->offset); + if (msg->msg_flags & MSG_SPLICE_PAGES) { + copy = msg_data_left(msg); + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); + if (err < 0) { + if (err == -EMSGSIZE) + goto wait_for_memory; + goto out_error; + } - err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, - pfrag->page, - pfrag->offset, - copy); - if (err) - goto out_error; + copy = err; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); - /* Update the skb. */ - if (merge) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + if (head != skb) + head->truesize += copy; } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - get_page(pfrag->page); + copy = min_t(int, msg_data_left(msg), + pfrag->size - pfrag->offset); + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto out_error; + + /* Update the skb. */ + if (merge) { + skb_frag_size_add( + &skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + + pfrag->offset += copy; } - pfrag->offset += copy; copied += copy; if (head != skb) { head->len += copy; @@ -1055,20 +927,24 @@ partial_message: KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return copied; out_error: kcm_push(kcm); - if (copied && sock->type == SOCK_SEQPACKET) { + if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ - goto partial_message; - } - - if (head != kcm->seq_skb) + if (copied) + goto partial_message; + if (head != kcm->seq_skb) + kfree_skb(head); + } else { kfree_skb(head); + kcm->seq_skb = NULL; + } err = sk_stream_error(sk, msg->msg_flags, err); @@ -1077,38 +953,21 @@ out_error: sk->sk_write_space(sk); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return err; } -static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, - long timeo, int *err) +static void kcm_splice_eof(struct socket *sock) { - struct sk_buff *skb; - - while (!(skb = skb_peek(&sk->sk_receive_queue))) { - if (sk->sk_err) { - *err = sock_error(sk); - return NULL; - } - - if (sock_flag(sk, SOCK_DONE)) - return NULL; - - if ((flags & MSG_DONTWAIT) || !timeo) { - *err = -EAGAIN; - return NULL; - } - - sk_wait_data(sk, &timeo, NULL); + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); - /* Handle signals */ - if (signal_pending(current)) { - *err = sock_intr_errno(timeo); - return NULL; - } - } + if (skb_queue_empty_lockless(&sk->sk_write_queue)) + return; - return skb; + lock_sock(sk); + kcm_write_msgs(kcm); + release_sock(sk); } static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, @@ -1117,54 +976,46 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; - long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int copied = 0; struct sk_buff *skb; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - if (copied < rxm->full_len) { + if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; } else { msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); - skb_unlink(skb, &sk->sk_receive_queue); - kfree_skb(skb); } } out: - release_sock(sk); - + skb_free_datagram(sk, skb); return copied ? : err; } @@ -1174,30 +1025,30 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); - long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; - /* Only support splice for SOCKSEQPACKET */ + if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); + /* Only support splice for SOCKSEQPACKET */ - skb = kcm_wait_data(sk, flags, timeo, &err); + skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags); + copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; @@ -1205,8 +1056,8 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. @@ -1214,13 +1065,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, * finish reading the message. */ - release_sock(sk); - + skb_free_datagram(sk, skb); return copied; err_out: - release_sock(sk); - + skb_free_datagram(sk, skb); return err; } @@ -1240,7 +1089,8 @@ static void kcm_recv_disable(struct kcm_sock *kcm) if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); @@ -1266,7 +1116,7 @@ static void kcm_recv_enable(struct kcm_sock *kcm) } static int kcm_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; @@ -1278,8 +1128,8 @@ static int kcm_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; valbool = val ? 1 : 0; @@ -1311,10 +1161,11 @@ static int kcm_getsockopt(struct socket *sock, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; @@ -1338,7 +1189,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so * we set sk_state, otherwise epoll_wait always returns right away with - * POLLHUP + * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1361,6 +1212,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); + mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); @@ -1376,35 +1228,61 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; - struct strp_callbacks cb; - int err; + static const struct strp_callbacks cb = { + .rcv_msg = kcm_rcv_strparser, + .parse_msg = kcm_parse_func_strparser, + .read_sock_done = kcm_read_sock_done, + }; + int err = 0; csk = csock->sk; if (!csk) return -EINVAL; + lock_sock(csk); + + /* Only allow TCP sockets to be attached for now */ + if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || + csk->sk_protocol != IPPROTO_TCP) { + err = -EOPNOTSUPP; + goto out; + } + + /* Don't allow listeners or closed sockets */ + if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { + err = -EOPNOTSUPP; + goto out; + } + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); - if (!psock) - return -ENOMEM; + if (!psock) { + err = -ENOMEM; + goto out; + } psock->mux = mux; psock->sk = csk; psock->bpf_prog = prog; - cb.rcv_msg = kcm_rcv_strparser; - cb.abort_parser = NULL; - cb.parse_msg = kcm_parse_func_strparser; - cb.read_sock_done = kcm_read_sock_done; + write_lock_bh(&csk->sk_callback_lock); + + /* Check if sk_user_data is already by KCM or someone else. + * Must be done under lock to prevent race conditions. + */ + if (csk->sk_user_data) { + write_unlock_bh(&csk->sk_callback_lock); + kmem_cache_free(kcm_psockp, psock); + err = -EALREADY; + goto out; + } err = strp_init(&psock->strp, csk, &cb); if (err) { + write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); - return err; + goto out; } - sock_hold(csk); - - write_lock_bh(&csk->sk_callback_lock); psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; @@ -1412,8 +1290,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; + write_unlock_bh(&csk->sk_callback_lock); + sock_hold(csk); + /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; @@ -1435,7 +1316,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); - return 0; +out: + release_sock(csk); + + return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) @@ -1464,7 +1348,7 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) return 0; out: - fput(csock->file); + sockfd_put(csock); return err; } @@ -1487,6 +1371,7 @@ static void kcm_unattach(struct kcm_psock *psock) if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); + release_sock(csk); return; } @@ -1611,7 +1496,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) spin_unlock_bh(&mux->lock); out: - fput(csock->file); + sockfd_put(csock); return err; } @@ -1622,60 +1507,30 @@ static struct proto kcm_proto = { }; /* Clone a kcm socket. */ -static int kcm_clone(struct socket *osock, struct kcm_clone *info, - struct socket **newsockp) +static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; - struct file *newfile; - int err, newfd; - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(0); - if (unlikely(newfd < 0)) { - err = newfd; - goto out_fd_fail; - } - - newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { - err = PTR_ERR(newfile); - goto out_sock_alloc_fail; - } - newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, - &kcm_proto, true); + &kcm_proto, false); if (!newsk) { - err = -ENOMEM; - goto out_sk_alloc_fail; + sock_release(newsock); + return ERR_PTR(-ENOMEM); } - sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); - fd_install(newfd, newfile); - *newsockp = newsock; - info->fd = newfd; - - return 0; - -out_sk_alloc_fail: - fput(newfile); -out_sock_alloc_fail: - put_unused_fd(newfd); -out_fd_fail: - sock_release(newsock); -out: - return err; + return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -1705,17 +1560,17 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct socket *newsock = NULL; - - err = kcm_clone(sock, &info, &newsock); - if (!err) { - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - err = -EFAULT; - sys_close(info.fd); - } - } + FD_PREPARE(fdf, 0, kcm_clone(sock)); + if (fdf.err) + return fdf.err; + + info.fd = fd_prepare_fd(fdf); + if (copy_to_user((void __user *)arg, &info, sizeof(info))) + return -EFAULT; + + fd_publish(fdf); + err = 0; break; } default: @@ -1726,14 +1581,6 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static void free_mux(struct rcu_head *rcu) -{ - struct kcm_mux *mux = container_of(rcu, - struct kcm_mux, rcu); - - kmem_cache_free(kcm_muxp, mux); -} - static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; @@ -1761,7 +1608,7 @@ static void release_mux(struct kcm_mux *mux) knet->count--; mutex_unlock(&knet->mutex); - call_rcu(&mux->rcu, free_mux); + kfree_rcu(mux, rcu); } static void kcm_done(struct kcm_sock *kcm) @@ -1782,7 +1629,8 @@ static void kcm_done(struct kcm_sock *kcm) if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); @@ -1827,22 +1675,16 @@ static int kcm_release(struct socket *sock) kcm = kcm_sk(sk); mux = kcm->mux; + lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); - lock_sock(sk); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); - /* Set tx_stopped. This is checked when psock is bound to a kcm and we - * get a writespace callback. This prevents further work being queued - * from the callback (unbinding the psock occurs after canceling work. - */ - kcm->tx_stopped = 1; - release_sock(sk); spin_lock_bh(&mux->lock); @@ -1858,7 +1700,7 @@ static int kcm_release(struct socket *sock) /* Cancel work. After this point there should be no outside references * to the kcm socket. */ - cancel_work_sync(&kcm->tx_work); + disable_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; @@ -1900,7 +1742,7 @@ static const struct proto_ops kcm_dgram_ops = { .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, - .sendpage = kcm_sendpage, + .splice_eof = kcm_splice_eof, }; static const struct proto_ops kcm_seqpacket_ops = { @@ -1921,7 +1763,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, - .sendpage = kcm_sendpage, + .splice_eof = kcm_splice_eof, .splice_read = kcm_splice_read, }; @@ -2009,6 +1851,8 @@ static __net_exit void kcm_exit_net(struct net *net) * that all multiplexors and psocks have been destroyed. */ WARN_ON(!list_empty(&knet->mux_list)); + + mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { @@ -2022,15 +1866,11 @@ static int __init kcm_init(void) { int err = -ENOMEM; - kcm_muxp = kmem_cache_create("kcm_mux_cache", - sizeof(struct kcm_mux), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; - kcm_psockp = kmem_cache_create("kcm_psock_cache", - sizeof(struct kcm_psock), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; @@ -2042,14 +1882,14 @@ static int __init kcm_init(void) if (err) goto fail; - err = sock_register(&kcm_family_ops); - if (err) - goto sock_register_fail; - err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; + err = sock_register(&kcm_family_ops); + if (err) + goto sock_register_fail; + err = kcm_proc_init(); if (err) goto proc_init_fail; @@ -2057,12 +1897,12 @@ static int __init kcm_init(void) return 0; proc_init_fail: - unregister_pernet_device(&kcm_net_ops); - -net_ops_fail: sock_unregister(PF_KCM); sock_register_fail: + unregister_pernet_device(&kcm_net_ops); + +net_ops_fail: proto_unregister(&kcm_proto); fail: @@ -2078,8 +1918,8 @@ fail: static void __exit kcm_exit(void) { kcm_proc_exit(); - unregister_pernet_device(&kcm_net_ops); sock_unregister(PF_KCM); + unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); @@ -2091,5 +1931,5 @@ module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM); - |
