diff options
Diffstat (limited to 'kernel/cgroup')
-rw-r--r-- | kernel/cgroup/Makefile | 3 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-internal.h | 22 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 81 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 1190 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 93 | ||||
-rw-r--r-- | kernel/cgroup/debug.c | 58 | ||||
-rw-r--r-- | kernel/cgroup/freezer.c | 6 | ||||
-rw-r--r-- | kernel/cgroup/namespace.c | 1 | ||||
-rw-r--r-- | kernel/cgroup/pids.c | 1 | ||||
-rw-r--r-- | kernel/cgroup/stat.c | 338 |
10 files changed, 1497 insertions, 296 deletions
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..2be89a003185 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,5 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +# SPDX-License-Identifier: GPL-2.0 +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 8b4c3c2f2509..b928b27050c6 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_INTERNAL_H #define __CGROUP_INTERNAL_H @@ -156,6 +157,8 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); +bool cgroup_is_thread_root(struct cgroup *cgrp); +bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, @@ -173,7 +176,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, struct cgroup_namespace *ns); -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); @@ -183,10 +186,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup); -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off); +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); @@ -198,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_show_cputime(struct seq_file *seq); +void cgroup_stat_boot(void); + +/* * namespace.c */ extern const struct proc_ns_operations cgroupns_operations; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7bf4b1533f34..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (cgroup_on_dfl(to)) return -EINVAL; - if (!cgroup_may_migrate_to(to)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(to); + if (ret) + return ret; mutex_lock(&cgroup_mutex); @@ -121,8 +122,12 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); + css_task_iter_start(&from->self, 0, &it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); @@ -373,7 +378,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -510,10 +515,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return 0; } -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool threadgroup) { - return __cgroup_procs_write(of, buf, nbytes, off, false); + struct cgroup *cgrp; + struct task_struct *task; + const struct cred *cred, *tcred; + ssize_t ret; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, threadgroup); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + cred = current_cred(); + tcred = get_task_cred(task); + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(cgrp, task, threadgroup); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -592,7 +645,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, + .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", @@ -611,7 +664,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, + .write = cgroup1_tasks_write, }, { .name = "notify_on_release", @@ -701,7 +754,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -846,6 +899,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) + seq_puts(seq, ",cpuset_v2_mode"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -900,6 +955,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->cpuset_clone_children = true; continue; } + if (!strcmp(token, "cpuset_v2_mode")) { + opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; + continue; + } if (!strcmp(token, "xattr")) { opts->flags |= CGRP_ROOT_XATTR; continue; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df2e0f14a95d..2cf06c274e4c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -162,6 +164,9 @@ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; +/* some controllers can be threaded on the default hierarchy */ +static u16 cgrp_dfl_threaded_ss_mask; + /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; @@ -316,13 +321,87 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) +static bool cgroup_has_tasks(struct cgroup *cgrp) { - struct cgroup_subsys_state *parent_css = cgrp->self.parent; + return cgrp->nr_populated_csets; +} - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; +bool cgroup_is_threaded(struct cgroup *cgrp) +{ + return cgrp->dom_cgrp != cgrp; +} + +/* can @cgrp host both domain and threaded children? */ +static bool cgroup_is_mixable(struct cgroup *cgrp) +{ + /* + * Root isn't under domain level resource control exempting it from + * the no-internal-process constraint, so it can serve as a thread + * root and a parent of resource domains at the same time. + */ + return !cgroup_parent(cgrp); +} + +/* can @cgrp become a thread root? should always be true for a thread root */ +static bool cgroup_can_be_thread_root(struct cgroup *cgrp) +{ + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return true; + + /* domain roots can't be nested under threaded */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* can only have either domain or threaded children */ + if (cgrp->nr_populated_domain_children) + return false; + + /* and no domain controllers can be enabled */ + if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return false; + + return true; +} + +/* is @cgrp root of a threaded subtree? */ +bool cgroup_is_thread_root(struct cgroup *cgrp) +{ + /* thread root should be a domain */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* a domain w/ threaded children is a thread root */ + if (cgrp->nr_threaded_children) + return true; + + /* + * A domain which has tasks and explicit threaded controllers + * enabled is a thread root. + */ + if (cgroup_has_tasks(cgrp) && + (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) + return true; + + return false; +} + +/* a domain which isn't connected to the root w/o brekage can't be used */ +static bool cgroup_is_valid_domain(struct cgroup *cgrp) +{ + /* the cgroup itself can be a thread root */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* but the ancestors can't be unless mixable */ + while ((cgrp = cgroup_parent(cgrp))) { + if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) + return false; + if (cgroup_is_threaded(cgrp)) + return false; + } + + return true; } /* subsystems visibly enabled on a cgroup */ @@ -331,8 +410,14 @@ static u16 cgroup_control(struct cgroup *cgrp) struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) - return parent->subtree_control; + if (parent) { + u16 ss_mask = parent->subtree_control; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -345,8 +430,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) - return parent->subtree_ss_mask; + if (parent) { + u16 ss_mask = parent->subtree_ss_mask; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } return cgrp->root->subsys_mask; } @@ -373,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, } /** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (!css || !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +/** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -436,22 +549,12 @@ out_unlock: return css; } -static void __maybe_unused cgroup_get(struct cgroup *cgrp) -{ - css_get(&cgrp->self); -} - static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -560,16 +663,31 @@ EXPORT_SYMBOL_GPL(of_css); */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), + .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + + /* + * The following field is re-initialized when this cset gets linked + * in cgroup_init(). However, let's initialize the field + * statically too so that the default cgroup can be accessed safely + * early during boot. + */ + .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ +static bool css_set_threaded(struct css_set *cset) +{ + return cset->dom_cset != cset; +} + /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set @@ -587,39 +705,48 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - updated populated count of a cgroup + * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->populated_cnt accordingly. The - * count is propagated towards root so that a given cgroup's populated_cnt - * is zero iff the cgroup and all its descendants don't contain any tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if - * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt - * changes from or to zero, userland is notified that the content of the - * interface file has changed. This can be used to detect when @cgrp and - * its descendants become populated or empty. + * task or losing the last. Update @cgrp->nr_populated_* accordingly. The + * count is propagated towards root so that a given cgroup's + * nr_populated_children is zero iff none of its descendants contain any + * tasks. + * + * @cgrp's interface file "cgroup.populated" is zero if both + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and + * 1 otherwise. When the sum changes from or to zero, userland is notified + * that the content of the interface file has changed. This can be used to + * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { + struct cgroup *child = NULL; + int adj = populated ? 1 : -1; + lockdep_assert_held(&css_set_lock); do { - bool trigger; + bool was_populated = cgroup_is_populated(cgrp); - if (populated) - trigger = !cgrp->populated_cnt++; - else - trigger = !--cgrp->populated_cnt; + if (!child) { + cgrp->nr_populated_csets += adj; + } else { + if (cgroup_is_threaded(child)) + cgrp->nr_populated_threaded_children += adj; + else + cgrp->nr_populated_domain_children += adj; + } - if (!trigger) + if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); + child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -630,7 +757,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the - * ->populated_cnt of all associated cgroups accordingly. + * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -653,7 +780,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated) * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * - * This function automatically handles populated_cnt updates and + * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ @@ -737,6 +864,8 @@ void put_css_set_locked(struct css_set *cset) if (!refcount_dec_and_test(&cset->refcount)) return; + WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); + /* This css_set is dead. unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -753,6 +882,11 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } + if (css_set_threaded(cset)) { + list_del(&cset->threaded_csets_node); + put_css_set_locked(cset->dom_cset); + } + kfree_rcu(cset, rcu_head); } @@ -771,6 +905,7 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { + struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -781,6 +916,16 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; + + /* @cset's domain should match the default cgroup's */ + if (cgroup_on_dfl(new_cgrp)) + new_dfl_cgrp = new_cgrp; + else + new_dfl_cgrp = old_cset->dfl_cgrp; + + if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) + return false; + /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may @@ -988,9 +1133,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, } refcount_set(&cset->refcount, 1); + cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); + INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_preload_node); @@ -1028,6 +1175,28 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); + /* + * If @cset should be threaded, look up the matching dom_cset and + * link them up. We first fully initialize @cset then look for the + * dom_cset. It's simpler this way and safe as @cset is guaranteed + * to stay empty until we return. + */ + if (cgroup_is_threaded(cset->dfl_cgrp)) { + struct css_set *dcset; + + dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); + if (!dcset) { + put_css_set(cset); + return NULL; + } + + spin_lock_irq(&css_set_lock); + cset->dom_cset = dcset; + list_add_tail(&cset->threaded_csets_node, + &dcset->threaded_csets); + spin_unlock_irq(&css_set_lock); + } + return cset; } @@ -1155,6 +1324,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1226,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1670,6 +1841,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; + cgrp->dom_cgrp = cgrp; + cgrp->max_descendants = INT_MAX; + cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -1690,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -1737,7 +1911,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -1753,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -2168,21 +2346,52 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } /** - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the root, subtree_control must be - * zero for migration destination cgroups with tasks so that child cgroups - * don't compete against tasks. + * On the default hierarchy, except for the mixable, (possible) thread root + * and threaded cgroups, subtree_control must be zero for migration + * destination cgroups with tasks so that child cgroups don't compete + * against tasks. */ -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { - return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || - !dst_cgrp->subtree_control; + /* v1 doesn't have any restriction */ + if (!cgroup_on_dfl(dst_cgrp)) + return 0; + + /* verify @dst_cgrp can host resources */ + if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(dst_cgrp)) + return 0; + + /* + * If @dst_cgrp is already or can become a thread root or is + * threaded, it doesn't matter. + */ + if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) + return 0; + + /* apply no-internal-process constraint */ + if (dst_cgrp->subtree_control) + return -EBUSY; + + return 0; } /** @@ -2387,8 +2596,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, struct task_struct *task; int ret; - if (!cgroup_may_migrate_to(dst_cgrp)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -2415,96 +2625,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -static int cgroup_procs_write_permission(struct task_struct *task, - struct cgroup *dst_cgrp, - struct kernfs_open_file *of) -{ - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; - struct cgroup *src_cgrp, *com_cgrp; - struct inode *inode; - int ret; - - if (!cgroup_on_dfl(dst_cgrp)) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); - - /* - * even if we're attaching all tasks in the thread group, - * we only need to check permissions on one of them. - */ - if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->euid, tcred->suid)) - ret = 0; - else - ret = -EACCES; - - put_cred(tcred); - return ret; - } - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* and the common ancestor */ - com_cgrp = src_cgrp; - while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) - com_cgrp = cgroup_parent(com_cgrp); - - /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - if (ret) - return ret; - - /* - * If namespaces are delegation boundaries, %current must be able - * to see both source and destination cgroups from its namespace. - */ - if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && - (!cgroup_is_descendant(src_cgrp, root_cgrp) || - !cgroup_is_descendant(dst_cgrp, root_cgrp))) - return -ENOENT; - - return 0; -} - -/* - * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup. - */ -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; - struct cgroup_subsys *ss; - struct cgroup *cgrp; pid_t pid; - int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; + return ERR_PTR(-EINVAL); percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - ret = -ESRCH; - goto out_unlock_rcu; + tsk = ERR_PTR(-ESRCH); + goto out_unlock_threadgroup; } } else { tsk = current; @@ -2520,35 +2657,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { - ret = -EINVAL; - goto out_unlock_rcu; + tsk = ERR_PTR(-EINVAL); + goto out_unlock_threadgroup; } get_task_struct(tsk); + goto out_unlock_rcu; + +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); +out_unlock_rcu: rcu_read_unlock(); + return tsk; +} - ret = cgroup_procs_write_permission(tsk, cgrp, of); - if (!ret) - ret = cgroup_attach_task(cgrp, tsk, threadgroup); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem) +{ + struct cgroup_subsys *ss; + int ssid; - put_task_struct(tsk); - goto out_unlock_threadgroup; + /* release reference from cgroup_procs_write_start() */ + put_task_struct(task); -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, true); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -2891,6 +3026,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +{ + u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + + /* if nothing is getting enabled, nothing to worry about */ + if (!enable) + return 0; + + /* can @cgrp host any resources? */ + if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return 0; + + if (domain_enable) { + /* can't enable domain controllers inside a thread subtree */ + if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return -EOPNOTSUPP; + } else { + /* + * Threaded controllers can handle internal competitions + * and are always allowed inside a (prospective) thread + * subtree. + */ + if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return 0; + } + + /* + * Controllers can't be enabled for a cgroup with tasks to avoid + * child cgroups competing against tasks. + */ + if (cgroup_has_tasks(cgrp)) + return -EBUSY; + + return 0; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -2966,33 +3141,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (enable && cgroup_parent(cgrp)) { - struct cgrp_cset_link *link; - - /* - * Because namespaces pin csets too, @cgrp->cset_links - * might not be empty even when @cgrp is empty. Walk and - * verify each cset. - */ - spin_lock_irq(&css_set_lock); - - ret = 0; - list_for_each_entry(link, &cgrp->cset_links, cset_link) { - if (css_set_populated(link->cset)) { - ret = -EBUSY; - break; - } - } - - spin_unlock_irq(&css_set_lock); - - if (ret) - goto out_unlock; - } + ret = cgroup_vet_subtree_control_enable(cgrp, enable); + if (ret) + goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -3011,6 +3162,172 @@ out_unlock: return ret ?: nbytes; } +/** + * cgroup_enable_threaded - make @cgrp threaded + * @cgrp: the target cgroup + * + * Called when "threaded" is written to the cgroup.type interface file and + * tries to make @cgrp threaded and join the parent's resource domain. + * This function is never called on the root cgroup as cgroup.type doesn't + * exist on it. + */ +static int cgroup_enable_threaded(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *dom_cgrp = parent->dom_cgrp; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* noop if already threaded */ + if (cgroup_is_threaded(cgrp)) + return 0; + + /* we're joining the parent's domain, ensure its validity */ + if (!cgroup_is_valid_domain(dom_cgrp) || + !cgroup_can_be_thread_root(dom_cgrp)) + return -EOPNOTSUPP; + + /* + * The following shouldn't cause actual migrations and should + * always succeed. + */ + cgroup_save_control(cgrp); + + cgrp->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); + if (!ret) + parent->nr_threaded_children++; + else + cgrp->dom_cgrp = cgrp; + + cgroup_finalize_control(cgrp, ret); + return ret; +} + +static int cgroup_type_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + if (cgroup_is_threaded(cgrp)) + seq_puts(seq, "threaded\n"); + else if (!cgroup_is_valid_domain(cgrp)) + seq_puts(seq, "domain invalid\n"); + else if (cgroup_is_thread_root(cgrp)) + seq_puts(seq, "domain threaded\n"); + else + seq_puts(seq, "domain\n"); + + return 0; +} + +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int ret; + + /* only switching to threaded mode is supported */ + if (strcmp(strstrip(buf), "threaded")) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* threaded can only be enabled */ + ret = cgroup_enable_threaded(cgrp); + + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static int cgroup_max_descendants_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int descendants = READ_ONCE(cgrp->max_descendants); + + if (descendants == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", descendants); + + return 0; +} + +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int descendants; + ssize_t ret; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + descendants = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &descendants); + if (ret) + return ret; + } + + if (descendants < 0) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_descendants = descendants; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + +static int cgroup_max_depth_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int depth = READ_ONCE(cgrp->max_depth); + + if (depth == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", depth); + + return 0; +} + +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int depth; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + depth = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &depth); + if (ret) + return ret; + } + + if (depth < 0) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_depth = depth; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -3018,6 +3335,49 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + seq_printf(seq, "nr_descendants %d\n", + cgroup->nr_descendants); + seq_printf(seq, "nr_dying_descendants %d\n", + cgroup->nr_dying_descendants); + + return 0; +} + +static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_extra_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_extra_stat_show(seq, css); + css_put(css); + return ret; +} + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; + int ret = 0; + + cgroup_stat_show_cputime(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); +#endif + return ret; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -3234,7 +3594,6 @@ restart: static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { - LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; @@ -3659,6 +4018,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) +{ + struct list_head *l; + struct cgrp_cset_link *link; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* find the next threaded cset */ + if (it->tcset_pos) { + l = it->tcset_pos->next; + + if (l != it->tcset_head) { + it->tcset_pos = l; + return container_of(l, struct css_set, + threaded_csets_node); + } + + it->tcset_pos = NULL; + } + + /* find the next cset */ + l = it->cset_pos; + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; + return NULL; + } + + if (it->ss) { + cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + + it->cset_pos = l; + + /* initialize threaded css_set walking */ + if (it->flags & CSS_TASK_ITER_THREADED) { + if (it->cur_dcset) + put_css_set_locked(it->cur_dcset); + it->cur_dcset = cset; + get_css_set(cset); + + it->tcset_head = &cset->threaded_csets; + it->tcset_pos = &cset->threaded_csets; + } + + return cset; +} + /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -3667,32 +4078,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { - struct list_head *l = it->cset_pos; - struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; + cset = css_task_iter_next_css_set(it); + if (!cset) { it->task_pos = NULL; return; } - - if (it->ss) { - cset = container_of(l, struct css_set, - e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } } while (!css_set_populated(cset)); - it->cset_pos = l; - if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; else @@ -3727,30 +4125,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - +repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; + + /* if PROCS, skip over tasks which aren't group leaders */ + if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && + !thread_group_leader(list_entry(it->task_pos, struct task_struct, + cg_list))) + goto repeat; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of + * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -3758,7 +4162,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -3769,6 +4173,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, spin_lock_irq(&css_set_lock); it->ss = css->ss; + it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -3826,6 +4231,9 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } + if (it->cur_dcset) + put_css_set(it->cur_dcset); + if (it->cur_task) put_task_struct(it->cur_task); } @@ -3842,16 +4250,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; - struct task_struct *task; - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - - return task; + return css_task_iter_next(it); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, + unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; @@ -3869,24 +4273,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } return cgroup_procs_next(s, NULL, NULL); } +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct cgroup *cgrp = seq_css(s)->cgroup; + + /* + * All processes of a threaded subtree belong to the domain cgroup + * of the subtree. Only threads can be distributed across the + * subtree. Reject reads on cgroup.procs in the subtree proper. + * They're always empty anyway. + */ + if (cgroup_is_threaded(cgrp)) + return ERR_PTR(-EOPNOTSUPP); + + return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | + CSS_TASK_ITER_THREADED); +} + static int cgroup_procs_show(struct seq_file *s, void *v) { - seq_printf(s, "%d\n", task_tgid_vnr(v)); + seq_printf(s, "%d\n", task_pid_vnr(v)); + return 0; +} + +static int cgroup_procs_write_permission(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *com_cgrp = src_cgrp; + struct inode *inode; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* find the common ancestor */ + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. + */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || + !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) + return -ENOENT; + return 0; } +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, true); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, true); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) +{ + return __cgroup_procs_start(s, pos, 0); +} + +static ssize_t cgroup_threads_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + buf = strstrip(buf); + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, false); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* thread migrations follow the cgroup.procs delegation rule */ + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + /* and must be contained in the same domain */ + ret = -EOPNOTSUPP; + if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, false); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { + .name = "cgroup.type", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_type_show, + .write = cgroup_type_write, + }, + { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), @@ -3897,6 +4446,14 @@ static struct cftype cgroup_base_files[] = { .write = cgroup_procs_write, }, { + .name = "cgroup.threads", + .release = cgroup_procs_release, + .seq_start = cgroup_threads_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, + .write = cgroup_threads_write, + }, + { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, @@ -3912,6 +4469,25 @@ static struct cftype cgroup_base_files[] = { .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, + { + .name = "cgroup.max.descendants", + .seq_show = cgroup_max_descendants_show, + .write = cgroup_max_descendants_write, + }, + { + .name = "cgroup.max.depth", + .seq_show = cgroup_max_depth_show, + .write = cgroup_max_depth_write, + }, + { + .name = "cgroup.stat", + .seq_show = cgroup_stat_show, + }, + { + .name = "cpu.stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, { } /* terminate */ }; @@ -3972,6 +4548,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4011,9 +4589,18 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { + struct cgroup *tcgrp; + /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + + for (tcgrp = cgroup_parent(cgrp); tcgrp; + tcgrp = cgroup_parent(tcgrp)) + tcgrp->nr_dying_descendants--; + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4100,9 +4687,6 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; - if (ss->css_reset) - ss->css_reset(css); - if (ss->css_offline) ss->css_offline(css); @@ -4196,6 +4780,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4203,7 +4793,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4211,10 +4801,17 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + if (tcgrp != cgrp) + tcgrp->nr_descendants++; + } + if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4241,13 +4838,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -4255,6 +4854,29 @@ out_free_cgrp: return ERR_PTR(ret); } +static bool cgroup_check_hierarchy_limits(struct cgroup *parent) +{ + struct cgroup *cgroup; + int ret = false; + int level = 1; + + lockdep_assert_held(&cgroup_mutex); + + for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { + if (cgroup->nr_descendants >= cgroup->max_descendants) + goto fail; + + if (level > cgroup->max_depth) + goto fail; + + level++; + } + + ret = true; +fail: + return ret; +} + int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; @@ -4269,6 +4891,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (!parent) return -ENODEV; + if (!cgroup_check_hierarchy_limits(parent)) { + ret = -EAGAIN; + goto out_unlock; + } + cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -4420,6 +5047,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4464,7 +5092,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - cgroup1_check_for_release(cgroup_parent(cgrp)); + if (parent && cgroup_is_threaded(cgrp)) + parent->nr_threaded_children--; + + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + tcgrp->nr_descendants--; + tcgrp->nr_dying_descendants++; + } + + cgroup1_check_for_release(parent); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -4605,6 +5241,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. @@ -4659,11 +5297,17 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; + /* implicit controllers must be threaded too */ + WARN_ON(ss->implicit_on_dfl && !ss->threaded); + if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; + if (ss->threaded) + cgrp_dfl_threaded_ss_mask |= 1 << ss->id; + if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { @@ -4708,6 +5352,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@ -5175,15 +5831,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_query(cgrp, attr, uattr); mutex_unlock(&cgroup_mutex); return ret; } #endif /* CONFIG_CGROUP_BPF */ + +#ifdef CONFIG_SYSFS +static ssize_t show_delegatable_files(struct cftype *files, char *buf, + ssize_t size, const char *prefix) +{ + struct cftype *cft; + ssize_t ret = 0; + + for (cft = files; cft && cft->name[0] != '\0'; cft++) { + if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) + continue; + + if (prefix) + ret += snprintf(buf + ret, size - ret, "%s.", prefix); + + ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); + + if (unlikely(ret >= size)) { + WARN_ON(1); + break; + } + } + + return ret; +} + +static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct cgroup_subsys *ss; + int ssid; + ssize_t ret = 0; + + ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, + NULL); + + for_each_subsys(ss, ssid) + ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, + PAGE_SIZE - ret, + cgroup_subsys_name[ssid]); + + return ret; +} +static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); + +static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); +} +static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + +static struct attribute *cgroup_sysfs_attrs[] = { + &cgroup_delegate_attr.attr, + &cgroup_features_attr.attr, + NULL, +}; + +static const struct attribute_group cgroup_sysfs_attr_group = { + .attrs = cgroup_sysfs_attrs, + .name = "cgroup", +}; + +static int __init cgroup_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); +} +subsys_initcall(cgroup_sysfs_init); +#endif /* CONFIG_SYSFS */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -56,7 +56,8 @@ #include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> - +#include <linux/oom.h> +#include <linux/sched/isolation.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> @@ -300,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* + * Cgroup v2 behavior is used when on default hierarchy or the + * cgroup_v2_mode flag is set. + */ +static inline bool is_in_v2_mode(void) +{ + return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); +} + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; /* @@ -577,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * @@ -639,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -649,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -666,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); + housekeeping_cpumask(HK_FLAG_DOMAIN)); goto done; } @@ -690,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; if (is_sched_load_balance(cp)) @@ -772,7 +785,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -785,7 +798,6 @@ restart: BUG_ON(nslot != ndoms); done: - free_cpumask_var(non_isolated_cpus); kfree(csa); /* @@ -862,7 +874,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -896,8 +908,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) + if (is_in_v2_mode() && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -914,7 +925,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1092,7 +1103,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1150,8 +1161,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) + if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1168,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1285,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); @@ -1460,7 +1470,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + if (!is_in_v2_mode() && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1892,6 +1902,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { @@ -1978,7 +1989,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2055,7 +2066,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2249,7 +2260,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2259,6 +2270,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2280,7 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool on_dfl = is_in_v2_mode(); mutex_lock(&cpuset_mutex); @@ -2333,8 +2351,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2343,16 +2363,15 @@ void cpuset_update_active_cpus(void) * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks to the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. */ - partition_sched_domains(1, NULL, NULL); schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2498,12 +2517,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. + * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. + * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -2526,7 +2545,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok + * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -2544,7 +2563,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) + if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index dac46af22782..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Debug controller * @@ -49,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -95,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -114,27 +115,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - int dead_cnt = 0, extra_refs = 0; + int dead_cnt = 0, extra_refs = 0, threaded_csets = 0; spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, " %d", refcnt); - if (refcnt - cset->nr_tasks > 0) { - int extra = refcnt - cset->nr_tasks; - - seq_printf(seq, " +%d", extra); - /* - * Take out the one additional reference in - * init_css_set. - */ - if (cset == &init_css_set) - extra--; - extra_refs += extra; + /* + * Print out the proc_cset and threaded_cset relationship + * and highlight difference between refcount and task_count. + */ + seq_printf(seq, "css_set %pK", cset); + if (rcu_dereference_protected(cset->dom_cset, 1) != cset) { + threaded_csets++; + seq_printf(seq, "=>%pK", cset->dom_cset); + } + if (!list_empty(&cset->threaded_csets)) { + struct css_set *tcset; + int idx = 0; + + list_for_each_entry(tcset, &cset->threaded_csets, + threaded_csets_node) { + seq_puts(seq, idx ? "," : "<="); + seq_printf(seq, "%pK", tcset); + idx++; + } + } else { + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } } seq_puts(seq, "\n"); @@ -163,10 +186,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } spin_unlock_irq(&css_set_lock); - if (!dead_cnt && !extra_refs) + if (!dead_cnt && !extra_refs && !threaded_csets) return 0; seq_puts(seq, "\n"); + if (threaded_csets) + seq_printf(seq, "threaded css_sets = %d\n", threaded_csets); if (extra_refs) seq_printf(seq, "extra references = %d\n", extra_refs); if (dead_cnt) @@ -352,6 +377,7 @@ static int __init enable_cgroup_debug(char *str) { debug_cgrp_subsys.dfl_cftypes = debug_files; debug_cgrp_subsys.implicit_on_dfl = true; + debug_cgrp_subsys.threaded = true; return 1; } __setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..08236798d173 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? */ - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 66129eb4371d..b05f1dd58a62 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 2237201d66d5..9829c67ebc0a 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = { .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, + .threaded = true, }; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..1e111dd455c4 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,338 @@ +#include "cgroup-internal.h" + +#include <linux/sched/cputime.h> + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} |