summaryrefslogtreecommitdiff
path: root/kernel/seccomp.c
diff options
context:
space:
mode:
authorSargun Dhillon <sargun@sargun.me>2022-05-03 01:09:56 -0700
committerKees Cook <keescook@chromium.org>2022-05-03 14:11:58 -0700
commitc2aa2dfef243efe213a480a1ee8566507a5152f4 (patch)
tree81b322a5a5f425bc485ed1ac0c7bcceac1d7f22c /kernel/seccomp.c
parent662340ef921828507c931da6db303fa3cb02228e (diff)
seccomp: Add wait_killable semantic to seccomp user notifier
This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) that makes it so that when notifications are received by the supervisor the notifying process will transition to wait killable semantics. Although wait killable isn't a set of semantics formally exposed to userspace, the concept is searchable. If the notifying process is signaled prior to the notification being received by the userspace agent, it will be handled as normal. One quirk about how this is handled is that the notifying process only switches to TASK_KILLABLE if it receives a wakeup from either an addfd or a signal. This is to avoid an unnecessary wakeup of the notifying task. The reasons behind switching into wait_killable only after userspace receives the notification are: * Avoiding unncessary work - Often, workloads will perform work that they may abort (request racing comes to mind). This allows for syscalls to be aborted safely prior to the notification being received by the supervisor. In this, the supervisor doesn't end up doing work that the workload does not want to complete anyways. * Avoiding side effects - We don't want the syscall to be interruptible once the supervisor starts doing work because it may not be trivial to reverse the operation. For example, unmounting a file system may take a long time, and it's hard to rollback, or treat that as reentrant. * Avoid breaking runtimes - Various runtimes do not GC when they are during a syscall (or while running native code that subsequently calls a syscall). If many notifications are blocked, and not picked up by the supervisor, this can get the application into a bad state. Signed-off-by: Sargun Dhillon <sargun@sargun.me> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
Diffstat (limited to 'kernel/seccomp.c')
-rw-r--r--kernel/seccomp.c42
1 files changed, 40 insertions, 2 deletions
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3caa0fe85235..e9852d1b4a5e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
@@ -220,6 +222,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ bool wait_killable_recv;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
+ /* Set wait killable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ filter->wait_killable_recv = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
complete(&addfd->completion);
}
+static bool should_sleep_killable(struct seccomp_filter *match,
+ struct seccomp_knotif *n)
+{
+ return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
+ bool wait_killable = should_sleep_killable(match, &n);
+
mutex_unlock(&match->notify_lock);
- err = wait_for_completion_interruptible(&n.ready);
+ if (wait_killable)
+ err = wait_for_completion_killable(&n.ready);
+ else
+ err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
- if (err != 0)
+
+ if (err != 0) {
+ /*
+ * Check to see if the notifcation got picked up and
+ * whether we should switch to wait killable.
+ */
+ if (!wait_killable && should_sleep_killable(match, &n))
+ continue;
+
goto interrupted;
+ }
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
@@ -1484,6 +1511,9 @@ out:
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
if (knotif) {
+ /* Reset the process to make sure it's not stuck */
+ if (should_sleep_killable(filter, knotif))
+ complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
}
@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
+ /*
+ * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+ * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))