summaryrefslogtreecommitdiff
path: root/net/core/skmsg.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-04-02 11:03:07 -0700
committerDavid S. Miller <davem@davemloft.net>2021-04-02 11:03:07 -0700
commitc2bcb4cf021121d7c162e44b7773281891e3abc2 (patch)
tree8f37f13ff2292f0435cd523507d40b7a384cb8c6 /net/core/skmsg.c
parentbd78980be1a68d14524c51c4b4170782fada622b (diff)
parent89d69c5d0fbcabd8656459bc8b1a476d6f1efee4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says: ==================== pull-request: bpf-next 2021-04-01 The following pull-request contains BPF updates for your *net-next* tree. We've added 68 non-merge commits during the last 7 day(s) which contain a total of 70 files changed, 2944 insertions(+), 1139 deletions(-). The main changes are: 1) UDP support for sockmap, from Cong. 2) Verifier merge conflict resolution fix, from Daniel. 3) xsk selftests enhancements, from Maciej. 4) Unstable helpers aka kernel func calling, from Martin. 5) Batches ops for LPM map, from Pedro. 6) Fix race in bpf_get_local_storage, from Yonghong. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core/skmsg.c')
-rw-r--r--net/core/skmsg.c177
1 files changed, 143 insertions, 34 deletions
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 07f54015238a..92a83c02562a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+ long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy)
+ return copied ? copied : -EFAULT;
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb)
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Lets not optimize peek case if copy_page_to_iter
+ * didn't copy the entire length lets just break.
+ */
+ if (copy != sge->length)
+ return copied;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
struct sk_buff *skb)
{
@@ -410,7 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
if (unlikely(!msg))
return NULL;
@@ -497,7 +595,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
- return skb_send_sock_locked(psock->sk, skb, off, len);
+ return skb_send_sock(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
@@ -511,8 +609,7 @@ static void sk_psock_backlog(struct work_struct *work)
u32 len, off;
int ret;
- /* Lock sock to avoid losing sk_socket during loop. */
- lock_sock(psock->sk);
+ mutex_lock(&psock->work_mutex);
if (state->skb) {
skb = state->skb;
len = state->len;
@@ -529,7 +626,7 @@ start:
skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
- if (likely(psock->sk->sk_socket))
+ if (!sock_flag(psock->sk, SOCK_DEAD))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
@@ -553,7 +650,7 @@ start:
kfree_skb(skb);
}
end:
- release_sock(psock->sk);
+ mutex_unlock(&psock->work_mutex);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -563,11 +660,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
- if (inet_csk_has_ulp(sk)) {
- psock = ERR_PTR(-EINVAL);
- goto out;
- }
-
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
@@ -591,7 +683,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -630,11 +724,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
}
}
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
skb_bpf_redirect_clear(skb);
kfree_skb(skb);
}
@@ -651,23 +745,35 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
+void sk_psock_stop(struct sk_psock *psock, bool wait)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ __sk_psock_zap_ingress(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+
+ if (wait)
+ cancel_work_sync(&psock->work);
+}
+
static void sk_psock_done_strp(struct sk_psock *psock);
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+static void sk_psock_destroy(struct work_struct *work)
{
- struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
-
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
/* No sk_callback_lock since already detached. */
sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
+ mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
@@ -675,30 +781,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
- struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
- INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
- schedule_work(&psock->gc);
-}
-
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
+ sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.stream_verdict)
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu(&psock->rcu, sk_psock_destroy);
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -767,14 +864,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ kfree_skb(skb);
+ return;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
kfree_skb(skb);
return;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
+ spin_unlock_bh(&psock_other->ingress_lock);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -842,8 +945,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
err = sk_psock_skb_ingress_self(psock, skb);
}
if (err < 0) {
- skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
break;
case __SK_REDIRECT:
@@ -1010,6 +1117,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);