From f43caa2adc96fc9c95fd77eef63cdff86ebf33cb Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 24 Jan 2020 12:40:16 +0100 Subject: cgroup: Clean up css_set task traversal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit css_task_iter stores pointer to head of each iterable list, this dates back to commit 0f0a2b4fa621 ("cgroup: reorganize css_task_iter") when we did not store cur_cset. Let us utilize list heads directly in cur_cset and streamline css_task_iter_advance_css_set a bit. This is no intentional function change. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 61 +++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c719a4154d6d..b4c4c4fbd6de 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4391,29 +4391,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); - /* Advance to the next non-empty css_set */ - do { - cset = css_task_iter_next_css_set(it); - if (!cset) { - it->task_pos = NULL; - return; + /* Advance to the next non-empty css_set and find first non-empty tasks list*/ + while ((cset = css_task_iter_next_css_set(it))) { + if (!list_empty(&cset->tasks)) { + it->cur_tasks_head = &cset->tasks; + break; + } else if (!list_empty(&cset->mg_tasks)) { + it->cur_tasks_head = &cset->mg_tasks; + break; + } else if (!list_empty(&cset->dying_tasks)) { + it->cur_tasks_head = &cset->dying_tasks; + break; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - - if (!list_empty(&cset->tasks)) { - it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { - it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; } - - it->tasks_head = &cset->tasks; - it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; + if (!cset) { + it->task_pos = NULL; + return; + } + it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus @@ -4458,24 +4453,24 @@ static void css_task_iter_advance(struct css_task_iter *it) repeat: if (it->task_pos) { /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. + * Advance iterator to find next entry. We go through cset + * tasks, mg_tasks and dying_tasks, when consumed we move onto + * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; + if (it->task_pos == &it->cur_cset->tasks) { + it->cur_tasks_head = &it->cur_cset->mg_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; + if (it->task_pos == &it->cur_cset->mg_tasks) { + it->cur_tasks_head = &it->cur_cset->dying_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->dying_tasks_head) + if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ @@ -4493,12 +4488,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } -- cgit From 3010c5b9f5f476b35b24955f08a3a6c06ec8e878 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 18 Jan 2020 08:40:51 +0530 Subject: cgroup.c: Use built-in RCU list checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit list_for_each_entry_rcu has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Even though the function css_next_child() already checks if cgroup_mutex or rcu_read_lock() is held using cgroup_assert_mutex_or_rcu_locked(), there is a need to pass cond to list_for_each_entry_rcu() to avoid false positive lockdep warning. Signed-off-by: Madhuparna Bhowmik Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b4c4c4fbd6de..7a310db6c807 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4148,7 +4148,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &parent->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling, + lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } -- cgit From a49e4629b5edf1db856de05fbf1aae05502ef1af Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 24 Jan 2020 20:37:29 +0530 Subject: cpuset: Make cpuset hotplug synchronous Convert cpuset_hotplug_workfn() into synchronous call for cpu hotplug path. For memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for cpu hotplug path, it is not required to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 31 +++++++++++++++++++------------ kernel/power/process.c | 2 -- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58f5073acff7..cafd4d2ff882 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3101,7 +3101,7 @@ update_tasks: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -3116,7 +3116,7 @@ update_tasks: * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -3201,25 +3201,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - rebuild_sched_domains(); + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + percpu_down_write(&cpuset_rwsem); + rebuild_sched_domains_locked(); + percpu_up_write(&cpuset_rwsem); + } } free_cpumasks(NULL, ptmp); } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..08f7019357ee 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,8 +204,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ -- cgit From 6df970e4f5d2c273554550d40d8b92cea9bec1a0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:18 +0100 Subject: cgroup: unify attach permission checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The core codepaths to check whether a process can be attached to a cgroup are the same for threads and thread-group leaders. Only a small piece of code verifying that source and destination cgroup are in the same domain differentiates the thread permission checking from thread-group leader permission checking. Since cgroup_migrate_vet_dst() only matters cgroup2 - it is a noop on cgroup1 - we can move it out of cgroup_attach_task(). All checks can now be consolidated into a new helper cgroup_attach_permissions() callable from both cgroup_procs_write() and cgroup_threads_write(). Cc: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Acked-by: Michal Koutný Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a310db6c807..9ca51bf3769a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2714,11 +2714,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; - int ret; - - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -4695,6 +4691,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, return 0; } +static int cgroup_attach_permissions(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb, bool threadgroup) +{ + int ret = 0; + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + if (ret) + return ret; + + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; + + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) + ret = -EOPNOTSUPP; + + return ret; +} + static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -4717,8 +4733,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, true); if (ret) goto out_finish; @@ -4762,16 +4778,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, spin_unlock_irq(&css_set_lock); /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, false); if (ret) goto out_finish; - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: -- cgit From 17703097f3456498e6424614571648c6452f4d34 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:19 +0100 Subject: cgroup: add cgroup_get_from_file() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper cgroup_get_from_file(). The helper will be used in subsequent patches to retrieve a cgroup while holding a reference to the struct file it was taken from. Cc: Tejun Heo Cc: Johannes Weiner Cc: Li Zefan Cc: cgroups@vger.kernel.org Acked-by: Michal Koutný Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9ca51bf3769a..16fe1c6cad35 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5880,6 +5880,24 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} + /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the task in question. @@ -6171,7 +6189,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_get_from_fd(int fd) { - struct cgroup_subsys_state *css; struct cgroup *cgrp; struct file *f; @@ -6179,17 +6196,8 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + cgrp = cgroup_get_from_file(f); fput(f); - if (IS_ERR(css)) - return ERR_CAST(css); - - cgrp = css->cgroup; - if (!cgroup_on_dfl(cgrp)) { - cgroup_put(cgrp); - return ERR_PTR(-EBADF); - } - return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); -- cgit From 5a5cf5cb30d7815c01035fde4b84edef85d11c68 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:20 +0100 Subject: cgroup: refactor fork helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This refactors the fork helpers so they can be easily modified in the next patches. The patch just moves the cgroup threadgroup rwsem grab and release into the helpers. They don't need to be directly exposed in fork.c. Cc: Tejun Heo Cc: Johannes Weiner Cc: Li Zefan Cc: cgroups@vger.kernel.org Acked-by: Michal Koutný Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 44 ++++++++++++++++++++++++++------------------ kernel/fork.c | 6 +----- 2 files changed, 27 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 16fe1c6cad35..502769b2683c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5900,17 +5900,20 @@ static struct cgroup *cgroup_get_from_file(struct file *f) /** * cgroup_can_fork - called on a new task before the process is exposed - * @child: the task in question. + * @child: the child process * - * This calls the subsystem can_fork() callbacks. If the can_fork() callback - * returns an error, the fork aborts with that error code. This allows for - * a cgroup subsystem to conditionally allow or deny new forks. + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() + * callback returns an error, the fork aborts with that error code. This + * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child) + __acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; int i, j, ret; + cgroup_threadgroup_change_begin(current); + do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) @@ -5927,17 +5930,20 @@ out_revert: ss->cancel_fork(child); } + cgroup_threadgroup_change_end(current); + return ret; } /** - * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the task in question - * - * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded. - */ + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the child process + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded. + */ void cgroup_cancel_fork(struct task_struct *child) + __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; int i; @@ -5945,19 +5951,19 @@ void cgroup_cancel_fork(struct task_struct *child) for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child); + + cgroup_threadgroup_change_end(current); } /** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary and - * call the subsystem fork() callbacks. Has to be after the task is - * visible on the task list in case we race with the first call to - * cgroup_task_iter_start() - to guarantee that the new task ends up on its - * list. + * cgroup_post_fork - finalize cgroup setup for the child process + * @child: the child process + * + * Attach the child process to its css_set calling the subsystem fork() + * callbacks. */ void cgroup_post_fork(struct task_struct *child) + __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; struct css_set *cset; @@ -6003,6 +6009,8 @@ void cgroup_post_fork(struct task_struct *child) do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); + + cgroup_threadgroup_change_end(current); } /** diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..9245b6e53f55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2174,7 +2174,6 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -2183,7 +2182,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_cgroup_threadgroup_change_end; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2289,7 +2288,6 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - cgroup_threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2301,8 +2299,6 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); -bad_fork_cgroup_threadgroup_change_end: - cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); -- cgit From f3553220d4cc458d69f7da6e71a3a6097778bd28 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:21 +0100 Subject: cgroup: add cgroup_may_write() helper Add a cgroup_may_write() helper which we can use in the CLONE_INTO_CGROUP patch series to verify that we can write to the destination cgroup. Cc: Tejun Heo Cc: Johannes Weiner Cc: Li Zefan Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 502769b2683c..6d8bdddd8c28 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4654,13 +4654,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) +{ + int ret; + struct inode *inode; + + lockdep_assert_held(&cgroup_mutex); + + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + return ret; +} + static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; int ret; lockdep_assert_held(&cgroup_mutex); @@ -4670,12 +4685,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); + ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; -- cgit From ef2c41cf38a7559bbf91af42d5b6a4429db8fc68 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:22 +0100 Subject: clone3: allow spawning processes into cgroups This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Johannes Weiner Cc: Li Zefan Cc: Peter Zijlstra Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 191 +++++++++++++++++++++++++++++++++++++++++++------ kernel/cgroup/pids.c | 15 ++-- kernel/fork.c | 13 ++-- 3 files changed, 188 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6d8bdddd8c28..9a8a5ded3c48 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5881,8 +5881,7 @@ out: * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding reference to its css_set. + * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { @@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f) return cgrp; } +/** + * cgroup_css_set_fork - find or create a css_set for a child process + * @kargs: the arguments passed to create the child process + * + * This functions finds or creates a new css_set which the child + * process will be attached to in cgroup_post_fork(). By default, + * the child process will be given the same css_set as its parent. + * + * If CLONE_INTO_CGROUP is specified this function will try to find an + * existing css_set which includes the requested cgroup and if not create + * a new css_set that the child will be attached to later. If this function + * succeeds it will hold cgroup_threadgroup_rwsem on return. If + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex + * before grabbing cgroup_threadgroup_rwsem and will hold a reference + * to the target cgroup. + */ +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) +{ + int ret; + struct cgroup *dst_cgrp = NULL; + struct css_set *cset; + struct super_block *sb; + struct file *f; + + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); + + cgroup_threadgroup_change_begin(current); + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (!(kargs->flags & CLONE_INTO_CGROUP)) { + kargs->cset = cset; + return 0; + } + + f = fget_raw(kargs->cgroup); + if (!f) { + ret = -EBADF; + goto err; + } + sb = f->f_path.dentry->d_sb; + + dst_cgrp = cgroup_get_from_file(f); + if (IS_ERR(dst_cgrp)) { + ret = PTR_ERR(dst_cgrp); + dst_cgrp = NULL; + goto err; + } + + if (cgroup_is_dead(dst_cgrp)) { + ret = -ENODEV; + goto err; + } + + /* + * Verify that we the target cgroup is writable for us. This is + * usually done by the vfs layer but since we're not going through + * the vfs layer here we need to do it "manually". + */ + ret = cgroup_may_write(dst_cgrp, sb); + if (ret) + goto err; + + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, + !(kargs->flags & CLONE_THREAD)); + if (ret) + goto err; + + kargs->cset = find_css_set(cset, dst_cgrp); + if (!kargs->cset) { + ret = -ENOMEM; + goto err; + } + + put_css_set(cset); + fput(f); + kargs->cgrp = dst_cgrp; + return ret; + +err: + cgroup_threadgroup_change_end(current); + mutex_unlock(&cgroup_mutex); + if (f) + fput(f); + if (dst_cgrp) + cgroup_put(dst_cgrp); + put_css_set(cset); + if (kargs->cset) + put_css_set(kargs->cset); + return ret; +} + +/** + * cgroup_css_set_put_fork - drop references we took during fork + * @kargs: the arguments passed to create the child process + * + * Drop references to the prepared css_set and target cgroup if + * CLONE_INTO_CGROUP was requested. + */ +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) +{ + cgroup_threadgroup_change_end(current); + + if (kargs->flags & CLONE_INTO_CGROUP) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + + mutex_unlock(&cgroup_mutex); + + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + + if (cgrp) { + cgroup_put(cgrp); + kargs->cgrp = NULL; + } + } +} + /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * + * This prepares a new css_set for the child process which the child will + * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child) - __acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem) +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; - cgroup_threadgroup_change_begin(current); + ret = cgroup_css_set_fork(kargs); + if (ret) + return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); + ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); @@ -5937,32 +6066,34 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); } - cgroup_threadgroup_change_end(current); + cgroup_css_set_put_fork(kargs); return ret; } /** - * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the child process - * - * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded. - */ -void cgroup_cancel_fork(struct task_struct *child) - __releases(&cgroup_threadgroup_rwsem) + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the child process + * @kargs: the arguments passed to create the child process + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded and cleans up references we took to + * prepare a new css_set for the child process in cgroup_can_fork(). + */ +void cgroup_cancel_fork(struct task_struct *child, + struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); - cgroup_threadgroup_change_end(current); + cgroup_css_set_put_fork(kargs); } /** @@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child) * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ -void cgroup_post_fork(struct task_struct *child) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_post_fork(struct task_struct *child, + struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup_subsys *ss; struct css_set *cset; int i; + cset = kargs->cset; + kargs->cset = NULL; + spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); + } else { + put_css_set(cset); + cset = NULL; } /* @@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child) ss->fork(child); } while_each_subsys_mask(); - cgroup_threadgroup_change_end(current); + /* Make the new cset the root_cset of the new cgroup namespace. */ + if (kargs->flags & CLONE_NEWCGROUP) { + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; + + get_css_set(cset); + child->nsproxy->cgroup_ns->root_cset = cset; + put_css_set(rcset); + } + + cgroup_css_set_put_fork(kargs); } /** diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 138059eb730d..511af87f685e 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -33,6 +33,7 @@ #include #include #include +#include #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" @@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task) +static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; int err; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); err = pids_try_charge(pids, 1); if (err) { @@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task) return err; } -static void pids_cancel_fork(struct task_struct *task) +static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); pids_uncharge(pids, 1); } diff --git a/kernel/fork.c b/kernel/fork.c index 9245b6e53f55..635d6369dfb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process( * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p); + retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; @@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); + cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - cgroup_cancel_fork(p); + cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); @@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; + if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), @@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, + .cgroup = args.cgroup, }; if (args.set_tid && @@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) + if (kargs->flags & + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* -- cgit From e7b20d97967c2995700041f0348ea33047e5c942 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Mar 2020 16:44:35 -0400 Subject: cgroup: Restructure release_agent_path handling cgrp->root->release_agent_path is protected by both cgroup_mutex and release_agent_path_lock and readers can hold either one. The dual-locking scheme was introduced while breaking a locking dependency issue around cgroup_mutex but doesn't make sense anymore given that the only remaining reader which uses cgroup_mutex is cgroup1_releaes_agent(). This patch updates cgroup1_release_agent() to use release_agent_path_lock so that release_agent_path is always protected only by release_agent_path_lock. While at it, convert strlen() based empty string checks to direct tests on the first character as suggested by Linus. Signed-off-by: Tejun Heo Cc: Linus Torvalds --- kernel/cgroup/cgroup-v1.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f2d7cea86ffe..191c329e482a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -38,10 +38,7 @@ static bool cgroup_no_v1_named; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ +/* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) @@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; - mutex_lock(&cgroup_mutex); + /* snoop agent path and exit early if empty */ + if (!cgrp->root->release_agent_path[0]) + return; + /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out_free; - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); + spin_lock(&release_agent_path_lock); + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + spin_unlock(&release_agent_path_lock); + if (!agentbuf[0]) + goto out_free; + + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) - goto out; + goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; @@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); out_free: kfree(agentbuf); kfree(pathbuf); -- cgit From 38aca3071cebc90e6b07abd697cba5c9d7b37a94 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 12 Mar 2020 13:03:17 -0700 Subject: cgroupfs: Support user xattrs This patch turns on xattr support for cgroupfs. This is useful for letting non-root owners of delegated subtrees attach metadata to cgroups. One use case is for subtree owners to tell a userspace out of memory killer to bias away from killing specific subtrees. Tests: [/sys/fs/cgroup]# for i in $(seq 0 130); \ do setfattr workload.slice -n user.name$i -v wow; done setfattr: workload.slice: No space left on device setfattr: workload.slice: No space left on device setfattr: workload.slice: No space left on device [/sys/fs/cgroup]# for i in $(seq 0 130); \ do setfattr workload.slice --remove user.name$i; done setfattr: workload.slice: No such attribute setfattr: workload.slice: No such attribute setfattr: workload.slice: No such attribute [/sys/fs/cgroup]# for i in $(seq 0 130); \ do setfattr workload.slice -n user.name$i -v wow; done setfattr: workload.slice: No space left on device setfattr: workload.slice: No space left on device setfattr: workload.slice: No space left on device `seq 0 130` is inclusive, and 131 - 128 = 3, which is the number of errors we expect to see. [/data]# cat testxattr.c #include #include #include #include int main() { char name[256]; char *buf = malloc(64 << 10); if (!buf) { perror("malloc"); return 1; } for (int i = 0; i < 4; ++i) { snprintf(name, 256, "user.bigone%d", i); if (setxattr("/sys/fs/cgroup/system.slice", name, buf, 64 << 10, 0)) { printf("setxattr failed on iteration=%d\n", i); return 1; } } return 0; } [/data]# ./a.out setxattr failed on iteration=2 [/data]# ./a.out setxattr failed on iteration=0 [/sys/fs/cgroup]# setfattr -x user.bigone0 system.slice/ [/sys/fs/cgroup]# setfattr -x user.bigone1 system.slice/ [/data]# ./a.out setxattr failed on iteration=2 Signed-off-by: Daniel Xu Acked-by: Chris Down Reviewed-by: Greg Kroah-Hartman Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 114dcbf8d4f4..33ff9ec4a523 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1954,7 +1954,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_SUPPORT_EXPORTOP, + KERNFS_ROOT_SUPPORT_EXPORTOP | + KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); -- cgit From 2b729fe7f3e9478a21a336231daf35768e7cf37b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Apr 2020 11:32:13 -0400 Subject: Revert "cpuset: Make cpuset hotplug synchronous" This reverts commit a49e4629b5ed ("cpuset: Make cpuset hotplug synchronous") as it may deadlock with cpu hotplug path. Link: http://lkml.kernel.org/r/F0388D99-84D7-453B-9B6B-EEFF0E7BE4CC@lca.pw Signed-off-by: Tejun Heo Reported-by: Qian Cai Cc: Prateek Sood --- kernel/cgroup/cpuset.c | 31 ++++++++++++------------------- kernel/power/process.c | 2 ++ 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cafd4d2ff882..58f5073acff7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3101,7 +3101,7 @@ update_tasks: } /** - * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -3116,7 +3116,7 @@ update_tasks: * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug(bool use_cpu_hp_lock) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -3201,32 +3201,25 @@ static void cpuset_hotplug(bool use_cpu_hp_lock) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - if (use_cpu_hp_lock) - rebuild_sched_domains(); - else { - /* Acquiring cpu_hotplug_lock is not required. - * When cpuset_hotplug() is called in hotplug path, - * cpu_hotplug_lock is held by the hotplug context - * which is waiting for cpuhp_thread_fun to indicate - * completion of callback. - */ - percpu_down_write(&cpuset_rwsem); - rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); - } + rebuild_sched_domains(); } free_cpumasks(NULL, ptmp); } -static void cpuset_hotplug_workfn(struct work_struct *work) +void cpuset_update_active_cpus(void) { - cpuset_hotplug(true); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + */ + schedule_work(&cpuset_hotplug_work); } -void cpuset_update_active_cpus(void) +void cpuset_wait_for_hotplug(void) { - cpuset_hotplug(false); + flush_work(&cpuset_hotplug_work); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 08f7019357ee..4b6a54da7e65 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,6 +204,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ -- cgit From 0c05b9bdbfe52ad9b391a28dd26f047715627e0c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 30 Mar 2020 10:06:15 -0400 Subject: docs: cgroup-v1: Document the cpuset_v2_mode mount option The cpuset in cgroup v1 accepts a special "cpuset_v2_mode" mount option that make cpuset.cpus and cpuset.mems behave more like those in cgroup v2. Document it to make other people more aware of this feature that can be useful in some circumstances. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58f5073acff7..729d3a5c772e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. + * With v2 behavior, "cpus" and "mems" are always what the users have + * requested and won't be changed by hotplug events. Only the effective + * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { -- cgit