summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/Makefile3
-rw-r--r--kernel/cgroup/cgroup-internal.h22
-rw-r--r--kernel/cgroup/cgroup-v1.c81
-rw-r--r--kernel/cgroup/cgroup.c1190
-rw-r--r--kernel/cgroup/cpuset.c93
-rw-r--r--kernel/cgroup/debug.c58
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/namespace.c1
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/cgroup/stat.c338
10 files changed, 1497 insertions, 296 deletions
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,5 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+# SPDX-License-Identifier: GPL-2.0
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H
@@ -156,6 +157,8 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
+bool cgroup_is_thread_root(struct cgroup *cgrp);
+bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -173,7 +176,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx);
@@ -183,10 +186,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup);
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
@@ -198,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
* namespace.c
*/
extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b1533f34..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (cgroup_on_dfl(to))
return -EINVAL;
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
mutex_lock(&cgroup_mutex);
@@ -121,8 +122,12 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, &it);
- task = css_task_iter_next(&it);
+ css_task_iter_start(&from->self, 0, &it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
@@ -373,7 +378,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -510,10 +515,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return 0;
}
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ cred = current_cred();
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -592,7 +645,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
+ .write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
@@ -611,7 +664,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
+ .write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
@@ -701,7 +754,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
rcu_read_unlock();
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -846,6 +899,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -900,6 +955,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..2cf06c274e4c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -162,6 +164,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +321,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
@@ -331,8 +410,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +430,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
@@ -373,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (!css || !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -436,22 +549,12 @@ out_unlock:
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,16 +663,31 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
@@ -587,39 +705,48 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
- *
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -630,7 +757,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -653,7 +780,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -737,6 +864,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +882,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -771,6 +905,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
@@ -781,6 +916,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1133,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1175,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1155,6 +1324,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1226,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
@@ -1670,6 +1841,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -1690,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
root->flags = opts->flags;
if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
+ strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
- strcpy(root->name, opts->name);
+ strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -1737,7 +1911,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -1753,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -2168,21 +2346,52 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
@@ -2387,8 +2596,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2415,96 +2625,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- struct cgroup *src_cgrp, *com_cgrp;
- struct inode *inode;
- int ret;
-
- if (!cgroup_on_dfl(dst_cgrp)) {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->euid, tcred->suid))
- ret = 0;
- else
- ret = -EACCES;
-
- put_cred(tcred);
- return ret;
- }
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* and the common ancestor */
- com_cgrp = src_cgrp;
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- com_cgrp = cgroup_parent(com_cgrp);
-
- /* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- if (ret)
- return ret;
-
- /*
- * If namespaces are delegation boundaries, %current must be able
- * to see both source and destination cgroups from its namespace.
- */
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- return -ENOENT;
-
- return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
@@ -2520,35 +2657,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2891,6 +3026,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2966,33 +3141,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
@@ -3011,6 +3162,172 @@ out_unlock:
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
@@ -3018,6 +3335,49 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0;
+}
+
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_extra_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_extra_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+ cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -3234,7 +3594,6 @@ restart:
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
@@ -3659,6 +4018,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
@@ -3667,32 +4078,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
@@ -3727,30 +4125,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *l = it->task_pos;
+ struct list_head *next;
lockdep_assert_held(&css_set_lock);
- WARN_ON_ONCE(!l);
-
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
- l = l->next;
+ next = it->task_pos->next;
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (l == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
- it->task_pos = l;
+ it->task_pos = next;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
@@ -3758,7 +4162,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
@@ -3769,6 +4173,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3826,6 +4231,9 @@ void css_task_iter_end(struct css_task_iter *it)
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
@@ -3842,16 +4250,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3869,24 +4273,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
+ return 0;
+}
+
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
return 0;
}
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
@@ -3897,6 +4446,14 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
@@ -3912,6 +4469,25 @@ static struct cftype cgroup_base_files[] = {
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
+ {
+ .name = "cpu.stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_stat_show,
+ },
{ } /* terminate */
};
@@ -3972,6 +4548,8 @@ static void css_free_work_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4011,9 +4589,18 @@ static void css_release_work_fn(struct work_struct *work)
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_flush(cgrp);
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4100,9 +4687,6 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
@@ -4196,6 +4780,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_free_cgrp;
+ if (cgroup_on_dfl(parent)) {
+ ret = cgroup_stat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
+ }
+
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
@@ -4203,7 +4793,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
- goto out_cancel_ref;
+ goto out_stat_exit;
}
init_cgroup_housekeeping(cgrp);
@@ -4211,10 +4801,17 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,13 +4838,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+ if (cgroup_on_dfl(parent))
+ cgroup_stat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -4255,6 +4854,29 @@ out_free_cgrp:
return ERR_PTR(ret);
}
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level > cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
@@ -4269,6 +4891,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -4420,6 +5047,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4464,7 +5092,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4605,6 +5241,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+ cgroup_stat_boot();
+
/*
* The latency of the synchronize_sched() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
@@ -4659,11 +5297,17 @@ int __init cgroup_init(void)
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
@@ -4708,6 +5352,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5175,15 +5831,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+ ssize_t size, const char *prefix)
+{
+ struct cftype *cft;
+ ssize_t ret = 0;
+
+ for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+ if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+ continue;
+
+ if (prefix)
+ ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+ ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+ if (unlikely(ret >= size)) {
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ ssize_t ret = 0;
+
+ ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+ NULL);
+
+ for_each_subsys(ss, ssid)
+ ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+ PAGE_SIZE - ret,
+ cgroup_subsys_name[ssid]);
+
+ return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+ &cgroup_delegate_attr.attr,
+ &cgroup_features_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+ .attrs = cgroup_sysfs_attrs,
+ .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8d5151688504..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,7 +56,8 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
-
+#include <linux/oom.h>
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@@ -300,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -577,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/*
* generate_sched_domains()
*
@@ -639,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -649,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -666,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -690,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -772,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -785,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -862,7 +874,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -896,8 +908,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -914,7 +925,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1092,7 +1103,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1150,8 +1161,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1168,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1285,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1460,7 +1470,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1892,6 +1902,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -1978,7 +1989,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2055,7 +2066,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2249,7 +2260,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2259,6 +2270,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2280,7 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2333,8 +2351,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2343,16 +2363,15 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2498,12 +2517,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2526,7 +2545,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2544,7 +2563,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af22782..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Debug controller
*
@@ -49,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
if (refcnt > cset->nr_tasks)
@@ -95,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -114,27 +115,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- int dead_cnt = 0, extra_refs = 0;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
spin_lock_irq(&css_set_lock);
+
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
int refcnt = refcount_read(&cset->refcount);
- seq_printf(seq, " %d", refcnt);
- if (refcnt - cset->nr_tasks > 0) {
- int extra = refcnt - cset->nr_tasks;
-
- seq_printf(seq, " +%d", extra);
- /*
- * Take out the one additional reference in
- * init_css_set.
- */
- if (cset == &init_css_set)
- extra--;
- extra_refs += extra;
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
}
seq_puts(seq, "\n");
@@ -163,10 +186,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
}
spin_unlock_irq(&css_set_lock);
- if (!dead_cnt && !extra_refs)
+ if (!dead_cnt && !extra_refs && !threaded_csets)
return 0;
seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
if (extra_refs)
seq_printf(seq, "extra references = %d\n", extra_refs);
if (dead_cnt)
@@ -352,6 +377,7 @@ static int __init enable_cgroup_debug(char *str)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..08236798d173 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 66129eb4371d..b05f1dd58a62 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "cgroup-internal.h"
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
.free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .threaded = true,
};
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..1e111dd455c4
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,338 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (cstat->updated_next)
+ break;
+
+ cstat->updated_next = pcstat->updated_children;
+ pcstat->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_cpu_stat *cstat;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ cstat = cgroup_cpu_stat(pos, cpu);
+ if (cstat->updated_children == pos)
+ break;
+ pos = cstat->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && cstat->updated_next) {
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+ struct cgroup_cpu_stat *ncstat;
+ struct cgroup **nextp;
+
+ nextp = &pcstat->updated_children;
+ while (true) {
+ ncstat = cgroup_cpu_stat(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &ncstat->updated_next;
+ }
+
+ *nextp = cstat->updated_next;
+ cstat->updated_next = NULL;
+ }
+
+ return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+ struct cgroup_stat *src_stat)
+{
+ dst_stat->cputime.utime += src_stat->cputime.utime;
+ dst_stat->cputime.stime += src_stat->cputime.stime;
+ dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct task_cputime *last_cputime = &cstat->last_cputime;
+ struct task_cputime cputime;
+ struct cgroup_stat delta;
+ unsigned seq;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&cstat->sync);
+ cputime = cstat->cputime;
+ } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+ /* accumulate the deltas to propgate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+ memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_stat_accumulate(&cgrp->stat, &delta);
+ if (parent)
+ cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock_irq(cpu_lock);
+ while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+ cgroup_cpu_stat_flush_one(pos, cpu);
+ raw_spin_unlock_irq(cpu_lock);
+ }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_stat_mutex);
+ cgroup_stat_flush_locked(cgrp);
+ mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = get_cpu_ptr(cgrp->cpu_stat);
+ u64_stats_update_begin(&cstat->sync);
+ return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+ struct cgroup_cpu_stat *cstat)
+{
+ u64_stats_update_end(&cstat->sync);
+ cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+ cstat->cputime.sum_exec_runtime += delta_exec;
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ cstat->cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ cstat->cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ mutex_lock(&cgroup_stat_mutex);
+
+ cgroup_stat_flush_locked(cgrp);
+
+ usage = cgrp->stat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+ &utime, &stime);
+
+ mutex_unlock(&cgroup_stat_mutex);
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has cpu_stat preallocated */
+ if (!cgrp->cpu_stat) {
+ cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+ if (!cgrp->cpu_stat)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ cstat->updated_children = cgrp;
+ u64_stats_init(&cstat->sync);
+ }
+
+ prev_cputime_init(&cgrp->stat.prev_cputime);
+
+ return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_stat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+ WARN_ON_ONCE(cstat->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->cpu_stat);
+ cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+ BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}