From f43caa2adc96fc9c95fd77eef63cdff86ebf33cb Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 24 Jan 2020 12:40:16 +0100 Subject: cgroup: Clean up css_set task traversal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit css_task_iter stores pointer to head of each iterable list, this dates back to commit 0f0a2b4fa621 ("cgroup: reorganize css_task_iter") when we did not store cur_cset. Let us utilize list heads directly in cur_cset and streamline css_task_iter_advance_css_set a bit. This is no intentional function change. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e75d2191226b..f1219b927817 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -58,9 +58,6 @@ struct css_task_iter { struct list_head *tcset_head; struct list_head *task_pos; - struct list_head *tasks_head; - struct list_head *mg_tasks_head; - struct list_head *dying_tasks_head; struct list_head *cur_tasks_head; struct css_set *cur_cset; -- cgit From a49e4629b5edf1db856de05fbf1aae05502ef1af Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 24 Jan 2020 20:37:29 +0530 Subject: cpuset: Make cpuset hotplug synchronous Convert cpuset_hotplug_workfn() into synchronous call for cpu hotplug path. For memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for cpu hotplug path, it is not required to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 04c20de66afc..cede4cb98b78 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -54,7 +54,6 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_read_lock(void); extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -176,8 +175,6 @@ static inline void cpuset_update_active_cpus(void) partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_read_lock(void) { } static inline void cpuset_read_unlock(void) { } -- cgit From ef2c41cf38a7559bbf91af42d5b6a4429db8fc68 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:22 +0100 Subject: clone3: allow spawning processes into cgroups This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Johannes Weiner Cc: Li Zefan Cc: Peter Zijlstra Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 +++-- include/linux/cgroup.h | 20 ++++++++++++++------ include/linux/sched/task.h | 4 ++++ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 63097cb243cb..68c391f451d1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -628,8 +628,9 @@ struct cgroup_subsys { void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); - int (*can_fork)(struct task_struct *task); - void (*cancel_fork)(struct task_struct *task); + int (*can_fork)(struct task_struct *task, + struct css_set *cset); + void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f1219b927817..4598e4da6b1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,8 @@ #include +struct kernel_clone_args; + #ifdef CONFIG_CGROUPS /* @@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); -extern int cgroup_can_fork(struct task_struct *p); -extern void cgroup_cancel_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); +extern int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); @@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} -static inline int cgroup_can_fork(struct task_struct *p) { return 0; } -static inline void cgroup_cancel_fork(struct task_struct *p) {} -static inline void cgroup_post_fork(struct task_struct *p) {} +static inline int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs) { return 0; } +static inline void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} +static inline void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index f1879884238e..38359071236a 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -13,6 +13,7 @@ struct task_struct; struct rusage; union thread_union; +struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL @@ -29,6 +30,9 @@ struct kernel_clone_args { pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; + int cgroup; + struct cgroup *cgrp; + struct css_set *cset; }; /* -- cgit From a46a22955bae16fc5a756af7188d3ccb25c3f797 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 12 Mar 2020 13:03:15 -0700 Subject: kernfs: Add removed_size out param for simple_xattr_set This helps set up size accounting in the next commit. Without this out param, it's difficult to find out the removed xattr size without taking a lock for longer and walking the xattr linked list twice. Signed-off-by: Daniel Xu Acked-by: Chris Down Reviewed-by: Greg Kroah-Hartman Signed-off-by: Tejun Heo --- include/linux/xattr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6dad031be3c2..4cf6e11f4a3c 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags); + const void *value, size_t size, int flags, + ssize_t *removed_size); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_list_add(struct simple_xattrs *xattrs, -- cgit From 0c47383ba3bd10877956e41149d19644fba937d1 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 12 Mar 2020 13:03:16 -0700 Subject: kernfs: Add option to enable user xattrs User extended attributes are useful as metadata storage for kernfs consumers like cgroups. Especially in the case of cgroups, it is useful to have a central metadata store that multiple processes/services can use to coordinate actions. A concrete example is for userspace out of memory killers. We want to let delegated cgroup subtree owners (running as non-root) to be able to say "please avoid killing this cgroup". This is especially important for desktop linux as delegated subtrees owners are less likely to run as root. This patch introduces a new flag, KERNFS_ROOT_SUPPORT_USER_XATTR, that lets kernfs consumers enable user xattr support. An initial limit of 128 entries or 128KB -- whichever is hit first -- is placed per cgroup because xattrs come from kernel memory and we don't want to let unprivileged users accidentally eat up too much kernel memory. Signed-off-by: Daniel Xu Acked-by: Chris Down Reviewed-by: Greg Kroah-Hartman Signed-off-by: Tejun Heo --- include/linux/kernfs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index dded2e5a9f42..89f6a4214a70 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,8 +37,10 @@ enum kernfs_node_type { KERNFS_LINK = 0x0004, }; -#define KERNFS_TYPE_MASK 0x000f -#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK +#define KERNFS_TYPE_MASK 0x000f +#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK +#define KERNFS_MAX_USER_XATTRS 128 +#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, @@ -78,6 +80,11 @@ enum kernfs_root_flag { * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, + + /* + * Support user xattrs to be written to nodes rooted at this root. + */ + KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ -- cgit From 2b729fe7f3e9478a21a336231daf35768e7cf37b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Apr 2020 11:32:13 -0400 Subject: Revert "cpuset: Make cpuset hotplug synchronous" This reverts commit a49e4629b5ed ("cpuset: Make cpuset hotplug synchronous") as it may deadlock with cpu hotplug path. Link: http://lkml.kernel.org/r/F0388D99-84D7-453B-9B6B-EEFF0E7BE4CC@lca.pw Signed-off-by: Tejun Heo Reported-by: Qian Cai Cc: Prateek Sood --- include/linux/cpuset.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index cede4cb98b78..04c20de66afc 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -54,6 +54,7 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_read_lock(void); extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -175,6 +176,8 @@ static inline void cpuset_update_active_cpus(void) partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_read_lock(void) { } static inline void cpuset_read_unlock(void) { } -- cgit