summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-24 12:52:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-24 12:52:35 -0700
commit3323ddce085cdb33331c2c1bb7a88233023566a9 (patch)
tree8954fe69a603a4980b3cf8b4a6a2a4012f3fe49d /include
parenta632b76b427d886911221331f4bfcd44a3e58197 (diff)
parent6e890c5d5021ca7e69bbe203fde42447874d9a82 (diff)
Merge tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull user work thread updates from Christian Brauner: "This contains the work generalizing the ability to create a kernel worker from a userspace process. Such user workers will run with the same credentials as the userspace process they were created from providing stronger security and accounting guarantees than the traditional override_creds() approach ever could've hoped for. The original work was heavily based and optimzed for the needs of io_uring which was the first user. However, as it quickly turned out the ability to create user workers inherting properties from a userspace process is generally useful. The vhost subsystem currently creates workers using the kthread api. The consequences of using the kthread api are that RLIMITs don't work correctly as they are inherited from khtreadd. This leads to bugs where more workers are created than would be allowed by the RLIMITs of the userspace process in lieu of which workers are created. Problems like this disappear with user workers created from the userspace processes for which they perform the work. In addition, providing this api allows vhost to remove additional complexity. For example, cgroup and mm sharing will just work out of the box with user workers based on the relevant userspace process instead of manually ensuring the correct cgroup and mm contexts are used. So the vhost subsystem should simply be made to use the same mechanism as io_uring. To this end the original mechanism used for create_io_thread() is generalized into user workers: - Introduce PF_USER_WORKER as a generic indicator that a given task is a user worker, i.e., a kernel task that was created from a userspace process. Now a PF_IO_WORKER thread is just a specialized version of PF_USER_WORKER. So io_uring io workers raise both flags. - Make copy_process() available to core kernel code - Extend struct kernel_clone_args with the following bitfields allowing to indicate to copy_process(): - to create a user worker (raise PF_USER_WORKER) - to not inherit any files from the userspace process - to ignore signals After all generic changes are in place the vhost subsystem implements a new dedicated vhost api based on user workers. Finally, vhost is switched to rely on the new api moving it off of kthreads. Thanks to Mike for sticking it out and making it through this rather arduous journey" * tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: vhost: use vhost_tasks for worker threads vhost: move worker thread fields to new struct vhost_task: Allow vhost layer to use copy_process fork: allow kernel code to call copy_process fork: Add kernel_clone_args flag to ignore signals fork: add kernel_clone_args flag to not dup/clone files fork/vm: Move common PF_IO_WORKER behavior to new flag kernel: Make io_thread and kthread bit fields kthread: Pass in the thread's name during creation kernel: Allow a kernel thread's name to be set in copy_process csky: Remove kernel_thread declaration
Diffstat (limited to 'include')
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/sched/task.h13
-rw-r--r--include/linux/sched/vhost_task.h23
3 files changed, 34 insertions, 4 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..e1e605b1255b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1729,7 +1729,7 @@ extern struct pid *cad_pid;
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
-#define PF__HOLE__00004000 0x00004000
+#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF__HOLE__00010000 0x00010000
#define PF_KSWAPD 0x00020000 /* I am kswapd */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 357e0068497c..537cbf9a2ade 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -23,7 +23,13 @@ struct kernel_clone_args {
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
+ const char *name;
int exit_signal;
+ u32 kthread:1;
+ u32 io_thread:1;
+ u32 user_worker:1;
+ u32 no_files:1;
+ u32 ignore_signals:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
@@ -31,8 +37,6 @@ struct kernel_clone_args {
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
- int io_thread;
- int kthread;
int idle;
int (*fn)(void *);
void *fn_arg;
@@ -89,9 +93,12 @@ extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *copy_process(struct pid *pid, int trace, int node,
+ struct kernel_clone_args *args);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
new file mode 100644
index 000000000000..6123c10b99cf
--- /dev/null
+++ b/include/linux/sched/vhost_task.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VHOST_TASK_H
+#define _LINUX_VHOST_TASK_H
+
+#include <linux/completion.h>
+
+struct task_struct;
+
+struct vhost_task {
+ int (*fn)(void *data);
+ void *data;
+ struct completion exited;
+ unsigned long flags;
+ struct task_struct *task;
+};
+
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+ const char *name);
+void vhost_task_start(struct vhost_task *vtsk);
+void vhost_task_stop(struct vhost_task *vtsk);
+bool vhost_task_should_stop(struct vhost_task *vtsk);
+
+#endif