From cf587db2ee0261c74d04f61f39783db88a0b65e4 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:23 -0600 Subject: kernel: Allow a kernel thread's name to be set in copy_process This patch allows kernel users to pass in the thread name so it can be set during creation instead of having to use set_task_comm after the thread is created. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 357e0068497c..32c9f01af0a6 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -23,6 +23,7 @@ struct kernel_clone_args { int __user *pidfd; int __user *child_tid; int __user *parent_tid; + const char *name; int exit_signal; unsigned long stack; unsigned long stack_size; @@ -91,7 +92,8 @@ extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); -- cgit From c81cc5819faf5dd77124f5086aa654482281ac37 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:25 -0600 Subject: kernel: Make io_thread and kthread bit fields We only set args->io_thread/kthread to 0 or 1 then test if they are set, so make them bit fields. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 32c9f01af0a6..268c77a42155 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -25,6 +25,8 @@ struct kernel_clone_args { int __user *parent_tid; const char *name; int exit_signal; + u32 kthread:1; + u32 io_thread:1; unsigned long stack; unsigned long stack_size; unsigned long tls; @@ -32,8 +34,6 @@ struct kernel_clone_args { /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; - int io_thread; - int kthread; int idle; int (*fn)(void *); void *fn_arg; -- cgit From 54e6842d0775ba76db65cbe21311c3ca466e663d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:26 -0600 Subject: fork/vm: Move common PF_IO_WORKER behavior to new flag This adds a new flag, PF_USER_WORKER, that's used for behavior common to to both PF_IO_WORKER and users like vhost which will use a new helper instead of create_io_thread because they require different behavior for operations like signal handling. The common behavior PF_USER_WORKER covers is the vm reclaim handling. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 268c77a42155..2950e83d5382 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -27,6 +27,7 @@ struct kernel_clone_args { int exit_signal; u32 kthread:1; u32 io_thread:1; + u32 user_worker:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 11f3f500ec8a75c96087f3bed87aa2b1c5de7498 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:27 -0600 Subject: fork: add kernel_clone_args flag to not dup/clone files Each vhost device gets a thread that is used to perform IO and management operations. Instead of a thread that is accessing a device, the thread is part of the device, so when it creates a thread using a helper based on copy_process we can't dup or clone the parent's files/FDS because it would do an extra increment on ourself. Later, when we do: Qemu process exits: do_exit -> exit_files -> put_files_struct -> close_files we would leak the device's resources because of that extra refcount on the fd or file_struct. This patch adds a no_files option so these worker threads can prevent taking an extra refcount on themselves. Signed-off-by: Mike Christie Acked-by: Christian Brauner Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 2950e83d5382..4f816048794f 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -28,6 +28,7 @@ struct kernel_clone_args { u32 kthread:1; u32 io_thread:1; u32 user_worker:1; + u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 094717586bf71ac20ae3b240d2654d826634b21e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:28 -0600 Subject: fork: Add kernel_clone_args flag to ignore signals Since: commit 10ab825bdef8 ("change kernel threads to ignore signals instead of blocking them") kthreads have been ignoring signals by default, and the vhost layer has never had a need to change that. This patch adds an option flag, USER_WORKER_SIG_IGN, handled in copy_process() after copy_sighand() and copy_signals() so vhost_tasks added in the next patches can continue to ignore singals. Signed-off-by: Christian Brauner Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4f816048794f..00c54bfac0b5 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -29,6 +29,7 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; + u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 89c8e98d8cfb0656dbeb648572df5b13e372247d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:29 -0600 Subject: fork: allow kernel code to call copy_process The next patch adds helpers like create_io_thread, but for use by the vhost layer. There are several functions, so they are in their own file instead of cluttering up fork.c. This patch allows that new file to call copy_process. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 00c54bfac0b5..537cbf9a2ade 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -93,6 +93,8 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *copy_process(struct pid *pid, int trace, int node, + struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, -- cgit From f9010dbdce911ee1f1af1398a24b1f9f992e0080 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 1 Jun 2023 13:32:32 -0500 Subject: fork, vhost: Use CLONE_THREAD to fix freezer/ps regression When switching from kthreads to vhost_tasks two bugs were added: 1. The vhost worker tasks's now show up as processes so scripts doing ps or ps a would not incorrectly detect the vhost task as another process. 2. kthreads disabled freeze by setting PF_NOFREEZE, but vhost tasks's didn't disable or add support for them. To fix both bugs, this switches the vhost task to be thread in the process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that SIGKILL/STOP support is required because CLONE_THREAD requires CLONE_SIGHAND which requires those 2 signals to be supported. This is a modified version of the patch written by Mike Christie which was a modified version of patch originally written by Linus. Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER. Including ignoring signals, setting up the register state, and having get_signal return instead of calling do_group_exit. Tidied up the vhost_task abstraction so that the definition of vhost_task only needs to be visible inside of vhost_task.c. Making it easier to review the code and tell what needs to be done where. As part of this the main loop has been moved from vhost_worker into vhost_task_fn. vhost_worker now returns true if work was done. The main loop has been updated to call get_signal which handles SIGSTOP, freezing, and collects the message that tells the thread to exit as part of process exit. This collection clears __fatal_signal_pending. This collection is not guaranteed to clear signal_pending() so clear that explicitly so the schedule() sleeps. For now the vhost thread continues to exist and run work until the last file descriptor is closed and the release function is called as part of freeing struct file. To avoid hangs in the coredump rendezvous and when killing threads in a multi-threaded exec. The coredump code and de_thread have been modified to ignore vhost threads. Remvoing the special case for exec appears to require teaching vhost_dev_flush how to directly complete transactions in case the vhost thread is no longer running. Removing the special case for coredump rendezvous requires either the above fix needed for exec or moving the coredump rendezvous into get_signal. Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") Signed-off-by: Eric W. Biederman Co-developed-by: Mike Christie Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Linus Torvalds --- include/linux/sched/task.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched/task.h') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 537cbf9a2ade..e0f5ac90a228 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -29,7 +29,6 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; - u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit