From 95036a79e7b56178e2fa9c485114be61d24c1695 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Jun 2024 02:10:11 +0000 Subject: seccomp: interrupt SECCOMP_IOCTL_NOTIF_RECV when all users have exited SECCOMP_IOCTL_NOTIF_RECV promptly returns when a seccomp filter becomes unused, as a filter without users can't trigger any events. Previously, event listeners had to rely on epoll to detect when all processes had exited. The change is based on the 'commit 99cdb8b9a573 ("seccomp: notify about unused filter")' which implemented (E)POLLHUP notifications. Reviewed-by: Christian Brauner Signed-off-by: Andrei Vagin Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240628021014.231976-2-avagin@google.com Reviewed-by: Tycho Andersen Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e30b60b57614..60990264fef0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1466,7 +1466,7 @@ static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int s void *key) { /* Avoid a wakeup if event not interesting for us. */ - if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR | EPOLLHUP))) return 0; return autoremove_wake_function(wait, mode, sync, key); } @@ -1476,6 +1476,9 @@ static int recv_wait_event(struct seccomp_filter *filter) DEFINE_WAIT_FUNC(wait, recv_wake_function); int ret; + if (refcount_read(&filter->users) == 0) + return 0; + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) return 0; @@ -1484,6 +1487,8 @@ static int recv_wait_event(struct seccomp_filter *filter) if (atomic_dec_if_positive(&filter->notif->requests) >= 0) break; + if (refcount_read(&filter->users) == 0) + break; if (ret) return ret; -- cgit From bfafe5efa9754ebc991750da0bcca2a6694f3ed3 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Jun 2024 02:10:12 +0000 Subject: seccomp: release task filters when the task exits Previously, seccomp filters were released in release_task(), which required the process to exit and its zombie to be collected. However, exited threads/processes can't trigger any seccomp events, making it more logical to release filters upon task exits. This adjustment simplifies scenarios where a parent is tracing its child process. The parent process can now handle all events from a seccomp listening descriptor and then call wait to collect a child zombie. seccomp_filter_release takes the siglock to avoid races with seccomp_sync_threads. There was an idea to bypass taking the lock by checking PF_EXITING, but it can be set without holding siglock if threads have SIGNAL_GROUP_EXIT. This means it can happen concurently with seccomp_filter_release. This change also fixes another minor problem. Suppose that a group leader installs the new filter without SECCOMP_FILTER_FLAG_TSYNC, exits, and becomes a zombie. Without this change, SECCOMP_FILTER_FLAG_TSYNC from any other thread can never succeed, seccomp_can_sync_threads() will check a zombie leader and is_ancestor() will fail. Reviewed-by: Oleg Nesterov Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20240628021014.231976-3-avagin@google.com Reviewed-by: Tycho Andersen Signed-off-by: Kees Cook --- kernel/exit.c | 3 ++- kernel/seccomp.c | 23 ++++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..b945ab81eb92 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -277,7 +277,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); @@ -832,6 +831,8 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ + seccomp_filter_release(tsk); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 60990264fef0..dc51e521bc1d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -502,6 +502,9 @@ static inline pid_t seccomp_can_sync_threads(void) /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; + /* Skip exited threads. */ + if (thread->flags & PF_EXITING) + continue; if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || (thread->seccomp.mode == SECCOMP_MODE_FILTER && @@ -563,18 +566,21 @@ static void __seccomp_filter_release(struct seccomp_filter *orig) * @tsk: task the filter should be released from. * * This function should only be called when the task is exiting as - * it detaches it from its filter tree. As such, READ_ONCE() and - * barriers are not needed here, as would normally be needed. + * it detaches it from its filter tree. PF_EXITING has to be set + * for the task. */ void seccomp_filter_release(struct task_struct *tsk) { - struct seccomp_filter *orig = tsk->seccomp.filter; + struct seccomp_filter *orig; - /* We are effectively holding the siglock by not having any sighand. */ - WARN_ON(tsk->sighand != NULL); + if (WARN_ON((tsk->flags & PF_EXITING) == 0)) + return; + spin_lock_irq(&tsk->sighand->siglock); + orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; + spin_unlock_irq(&tsk->sighand->siglock); __seccomp_filter_release(orig); } @@ -602,6 +608,13 @@ static inline void seccomp_sync_threads(unsigned long flags) if (thread == caller) continue; + /* + * Skip exited threads. seccomp_filter_release could have + * been already called for this task. + */ + if (thread->flags & PF_EXITING) + continue; + /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); -- cgit