From cf587db2ee0261c74d04f61f39783db88a0b65e4 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:23 -0600 Subject: kernel: Allow a kernel thread's name to be set in copy_process This patch allows kernel users to pass in the thread name so it can be set during creation instead of having to use set_task_comm after the thread is created. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 357e0068497c..32c9f01af0a6 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -23,6 +23,7 @@ struct kernel_clone_args { int __user *pidfd; int __user *child_tid; int __user *parent_tid; + const char *name; int exit_signal; unsigned long stack; unsigned long stack_size; @@ -91,7 +92,8 @@ extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); -- cgit From c81cc5819faf5dd77124f5086aa654482281ac37 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:25 -0600 Subject: kernel: Make io_thread and kthread bit fields We only set args->io_thread/kthread to 0 or 1 then test if they are set, so make them bit fields. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 32c9f01af0a6..268c77a42155 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -25,6 +25,8 @@ struct kernel_clone_args { int __user *parent_tid; const char *name; int exit_signal; + u32 kthread:1; + u32 io_thread:1; unsigned long stack; unsigned long stack_size; unsigned long tls; @@ -32,8 +34,6 @@ struct kernel_clone_args { /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; - int io_thread; - int kthread; int idle; int (*fn)(void *); void *fn_arg; -- cgit From 54e6842d0775ba76db65cbe21311c3ca466e663d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:26 -0600 Subject: fork/vm: Move common PF_IO_WORKER behavior to new flag This adds a new flag, PF_USER_WORKER, that's used for behavior common to to both PF_IO_WORKER and users like vhost which will use a new helper instead of create_io_thread because they require different behavior for operations like signal handling. The common behavior PF_USER_WORKER covers is the vm reclaim handling. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched.h | 2 +- include/linux/sched/task.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 63d242164b1a..e1e605b1255b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1729,7 +1729,7 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF__HOLE__00004000 0x00004000 +#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 268c77a42155..2950e83d5382 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -27,6 +27,7 @@ struct kernel_clone_args { int exit_signal; u32 kthread:1; u32 io_thread:1; + u32 user_worker:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 11f3f500ec8a75c96087f3bed87aa2b1c5de7498 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:27 -0600 Subject: fork: add kernel_clone_args flag to not dup/clone files Each vhost device gets a thread that is used to perform IO and management operations. Instead of a thread that is accessing a device, the thread is part of the device, so when it creates a thread using a helper based on copy_process we can't dup or clone the parent's files/FDS because it would do an extra increment on ourself. Later, when we do: Qemu process exits: do_exit -> exit_files -> put_files_struct -> close_files we would leak the device's resources because of that extra refcount on the fd or file_struct. This patch adds a no_files option so these worker threads can prevent taking an extra refcount on themselves. Signed-off-by: Mike Christie Acked-by: Christian Brauner Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 2950e83d5382..4f816048794f 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -28,6 +28,7 @@ struct kernel_clone_args { u32 kthread:1; u32 io_thread:1; u32 user_worker:1; + u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 094717586bf71ac20ae3b240d2654d826634b21e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:28 -0600 Subject: fork: Add kernel_clone_args flag to ignore signals Since: commit 10ab825bdef8 ("change kernel threads to ignore signals instead of blocking them") kthreads have been ignoring signals by default, and the vhost layer has never had a need to change that. This patch adds an option flag, USER_WORKER_SIG_IGN, handled in copy_process() after copy_sighand() and copy_signals() so vhost_tasks added in the next patches can continue to ignore singals. Signed-off-by: Christian Brauner Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4f816048794f..00c54bfac0b5 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -29,6 +29,7 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; + u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; -- cgit From 89c8e98d8cfb0656dbeb648572df5b13e372247d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:29 -0600 Subject: fork: allow kernel code to call copy_process The next patch adds helpers like create_io_thread, but for use by the vhost layer. There are several functions, so they are in their own file instead of cluttering up fork.c. This patch allows that new file to call copy_process. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- include/linux/sched/task.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 00c54bfac0b5..537cbf9a2ade 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -93,6 +93,8 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *copy_process(struct pid *pid, int trace, int node, + struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, -- cgit From e297cd54b3f81d652456ae6cb93941fc6b5c6683 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:30 -0600 Subject: vhost_task: Allow vhost layer to use copy_process Qemu will create vhost devices in the kernel which perform network, SCSI, etc IO and management operations from worker threads created by the kthread API. Because the kthread API does a copy_process on the kthreadd thread, the vhost layer has to use kthread_use_mm to access the Qemu thread's memory and cgroup_attach_task_all to add itself to the Qemu thread's cgroups, and it bypasses the RLIMIT_NPROC limit which can result in VMs creating more threads than the admin expected. This patch adds a new struct vhost_task which can be used instead of kthreads. They allow the vhost layer to use copy_process and inherit the userspace process's mm and cgroups, the task is accounted for under the userspace's nproc count and can be seen in its process tree, and other features like namespaces work and are inherited by default. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Christian Brauner --- include/linux/sched/vhost_task.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/sched/vhost_task.h (limited to 'include') diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h new file mode 100644 index 000000000000..6123c10b99cf --- /dev/null +++ b/include/linux/sched/vhost_task.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VHOST_TASK_H +#define _LINUX_VHOST_TASK_H + +#include + +struct task_struct; + +struct vhost_task { + int (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, + const char *name); +void vhost_task_start(struct vhost_task *vtsk); +void vhost_task_stop(struct vhost_task *vtsk); +bool vhost_task_should_stop(struct vhost_task *vtsk); + +#endif -- cgit