summaryrefslogtreecommitdiff
path: root/kernel/cgroup/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r--kernel/cgroup/cgroup.c760
1 files changed, 561 insertions, 199 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e32b6972c478..e717208cfb18 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,8 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
+#include <linux/irq_work.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -90,11 +92,14 @@
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
+struct blocking_notifier_head cgroup_lifetime_notifier =
+ BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
static bool cgroup_debug __read_mostly;
@@ -121,10 +126,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -161,17 +189,21 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
/* the default hierarchy */
-struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
+struct cgroup_root cgrp_dfl_root = {
+ .cgrp.self.rstat_cpu = &root_rstat_cpu,
+ .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
@@ -209,12 +241,18 @@ static u16 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
.user_ns = &init_user_ns,
- .ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
@@ -250,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static void cgroup_rt_init(void);
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS noinline
@@ -633,9 +672,22 @@ int cgroup_task_count(const struct cgroup *cgrp)
return count;
}
+static struct cgroup *kn_priv(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ /*
+ * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT.
+ * Therefore it is always safe to dereference this pointer outside of a
+ * RCU section.
+ */
+ parent = rcu_dereference_check(kn->__parent,
+ kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT);
+ return parent->priv;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
/*
@@ -891,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -1282,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exlude tasks between
+ * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
@@ -1322,6 +1391,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ int ret;
trace_cgroup_destroy_root(root);
@@ -1330,6 +1400,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
@@ -1358,7 +1432,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_unlock();
- cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
@@ -1449,9 +1522,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
} else {
/*
* NOTE: This function may be called from bpf_cgroup_from_id()
- * on a task which has already passed exit_task_namespaces() and
- * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
- * cgroups visible for lookups.
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
*/
return &cgrp_dfl_root.cgrp;
}
@@ -1612,7 +1685,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
cgroup_unlock();
@@ -1644,7 +1717,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
@@ -1682,7 +1755,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
- del_timer_sync(&cfile->notify_timer);
+ timer_delete_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1702,7 +1775,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
cgroup_addrm_files(css, cgrp,
cgroup_base_files, false);
@@ -1734,7 +1807,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (css->flags & CSS_VISIBLE)
return 0;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_base_files, true);
@@ -1744,8 +1817,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (cgroup_psi_enabled()) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_psi_files, true);
- if (ret < 0)
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
return ret;
+ }
}
} else {
ret = cgroup_addrm_files(css, cgrp,
@@ -1839,9 +1915,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
+ css->cgroup = dcgrp;
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
@@ -1860,13 +1936,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
}
spin_unlock_irq(&css_set_lock);
- if (ss->css_rstat_flush) {
- list_del_rcu(&css->rstat_css_node);
- synchronize_rcu();
- list_add_rcu(&css->rstat_css_node,
- &dcgrp->rstat_css_list);
- }
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -1922,6 +1991,7 @@ enum cgroup2_param {
Opt_memory_localevents,
Opt_memory_recursiveprot,
Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
nr__cgroup2_params
};
@@ -1931,6 +2001,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
{}
};
@@ -1960,10 +2031,20 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_hugetlb_accounting:
ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
@@ -1989,6 +2070,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
}
}
@@ -2004,6 +2090,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_recursiveprot");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
return 0;
}
@@ -2030,12 +2118,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
- INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+#ifdef CONFIG_CGROUP_BPF
+ for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+ cgrp->bpf.revisions[i] = 1;
+#endif
+
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2096,7 +2188,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
- KERNFS_ROOT_SUPPORT_USER_XATTR,
+ KERNFS_ROOT_SUPPORT_USER_XATTR |
+ KERNFS_ROOT_INVARIANT_PARENT,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -2110,7 +2203,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
- ret = cgroup_rstat_init(root_cgrp);
+ ret = css_rstat_init(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -2118,8 +2211,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
- ret = cgroup_bpf_inherit(root_cgrp);
- WARN_ON_ONCE(ret);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE, root_cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
@@ -2150,7 +2244,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto out;
exit_stats:
- cgroup_rstat_exit(root_cgrp);
+ css_rstat_exit(&root_cgrp->self);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -2292,10 +2386,8 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
- cgroup_bpf_offline(&root->cgrp);
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
- }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2316,10 +2408,38 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+ Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ return 0;
+ }
+ return -EINVAL;
+}
+
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
+ .parse_param = cpuset_parse_param,
};
/*
@@ -2356,6 +2476,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
+ .parameters = cpuset_fs_parameters,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
@@ -2387,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2407,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
- if (lock_threadgroup)
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
}
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
*/
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- if (lock_threadgroup)
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
cpus_read_unlock();
}
@@ -2872,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2896,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *threadgroup_locked)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -2904,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- *threadgroup_locked = pid || threadgroup;
- cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -2938,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
-out_unlock_threadgroup:
- cgroup_attach_unlock(*threadgroup_locked);
- *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(threadgroup_locked);
-
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3016,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
bool has_tasks;
int ret;
@@ -3047,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- cgroup_attach_lock(has_tasks);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3068,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(has_tasks);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -3654,11 +3828,60 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
return 0;
}
@@ -3963,7 +4186,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -3979,10 +4202,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -4063,13 +4282,14 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
@@ -4081,7 +4301,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -4401,7 +4621,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -4484,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
+EXPORT_SYMBOL_GPL(cgroup_file_notify);
/**
* cgroup_file_show - show or hide a hidden cgroup file
@@ -4580,8 +4801,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -4629,8 +4851,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
@@ -4674,9 +4897,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
@@ -5141,15 +5364,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
- const struct cred *saved_cred;
ssize_t ret;
- bool threadgroup_locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5164,18 +5386,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* permissions using the credentials from file open to protect against
* inherited fd attacks.
*/
- saved_cred = override_creds(of->file->f_cred);
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb,
- threadgroup, ctx->ns);
- revert_creds(saved_cred);
+ scoped_with_creds(of->file->f_cred)
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
if (ret)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, threadgroup_locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5257,6 +5478,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5353,8 +5579,9 @@ static void css_free_rwork_fn(struct work_struct *work)
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
+ css_rstat_exit(css);
- if (ss) {
+ if (!css_is_self(css)) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
@@ -5383,7 +5610,6 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -5408,23 +5634,36 @@ static void css_release_work_fn(struct work_struct *work)
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
- if (ss) {
- /* css release path */
- if (!list_empty(&css->rstat_css_node)) {
- cgroup_rstat_flush(cgrp);
- list_del_rcu(&css->rstat_css_node);
- }
+ if (!css_is_self(css)) {
+ struct cgroup *parent_cgrp;
+
+ css_rstat_flush(css);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activation leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- cgroup_rstat_flush(cgrp);
+ css_rstat_flush(&cgrp->self);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5447,7 +5686,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5456,7 +5695,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5472,7 +5711,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
- INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5481,9 +5719,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (ss->css_rstat_flush)
- list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
-
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -5502,8 +5737,11 @@ static int online_css(struct cgroup_subsys_state *css)
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
@@ -5525,6 +5763,16 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
@@ -5563,6 +5811,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
goto err_free_css;
css->id = err;
+ err = css_rstat_init(css);
+ if (err)
+ goto err_free_css;
+
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -5576,9 +5828,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5592,7 +5843,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
- int level = parent->level + 1;
+ int i, level = parent->level + 1;
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -5604,17 +5855,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
-
/* create the directory */
kn = kernfs_create_dir_ns(parent->kn, name, mode,
current_fsuid(), current_fsgid(),
cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_stat_exit;
+ goto out_cancel_ref;
}
cgrp->kn = kn;
@@ -5624,19 +5871,27 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->root = root;
cgrp->level = level;
- ret = psi_cgroup_alloc(cgrp);
+ /*
+ * Now that init_cgroup_housekeeping() has been called and cgrp->self
+ * is setup, it is safe to perform rstat initialization on it.
+ */
+ ret = css_rstat_init(&cgrp->self);
if (ret)
goto out_kernfs_remove;
- ret = cgroup_bpf_inherit(cgrp);
+ ret = psi_cgroup_alloc(cgrp);
if (ret)
- goto out_psi_free;
+ goto out_stat_exit;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestors[tcgrp->level] = tcgrp;
/*
* New cgroup inherits effective freeze counter, and
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5645,27 +5900,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
- spin_lock_irq(&css_set_lock);
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestors[tcgrp->level] = tcgrp;
-
- if (tcgrp != cgrp) {
- tcgrp->nr_descendants++;
-
- /*
- * If the new cgroup is frozen, all ancestor cgroups
- * get a new frozen descendant, but their state can't
- * change because of this.
- */
- if (cgrp->freezer.e_freeze)
- tcgrp->freezer.nr_frozen_descendants++;
- }
- }
- spin_unlock_irq(&css_set_lock);
-
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5674,7 +5912,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->self.serial_nr = css_serial_nr_next++;
+ ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_psi_free;
+
/* allocation complete, commit to creation */
+ spin_lock_irq(&css_set_lock);
+ for (i = 0; i < level; i++) {
+ tcgrp = cgrp->ancestors[i];
+ tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups get a new
+ * frozen descendant, but their state can't change because of
+ * this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
+ spin_unlock_irq(&css_set_lock);
+
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent);
@@ -5692,10 +5952,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
out_psi_free:
psi_cgroup_free(cgrp);
+out_stat_exit:
+ css_rstat_exit(&cgrp->self);
out_kernfs_remove:
kernfs_remove(cgrp->kn);
-out_stat_exit:
- cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5707,7 +5967,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
- int level = 1;
+ int level = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -5715,7 +5975,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
- if (level > cgroup->max_depth)
+ if (level >= cgroup->max_depth)
goto fail;
level++;
@@ -5751,7 +6011,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
}
/*
- * This extra ref will be put in cgroup_free_fn() and guarantees
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
@@ -5809,7 +6069,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -5829,6 +6089,12 @@ static void kill_css(struct cgroup_subsys_state *css)
if (css->flags & CSS_DYING)
return;
+ /*
+ * Call css_killed(), if defined, before setting the CSS_DYING flag
+ */
+ if (css->ss->css_killed)
+ css->ss->css_killed(css);
+
css->flags |= CSS_DYING;
/*
@@ -5886,7 +6152,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
- int ssid;
+ int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
@@ -5944,7 +6210,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
- cgroup_bpf_offline(cgrp);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -6006,6 +6274,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
+
+ BUG_ON(ss_rstat_init(ss));
+ BUG_ON(css_rstat_init(css));
}
/* Update the init_css_set to contain a subsys
@@ -6054,6 +6325,8 @@ int __init cgroup_init_early(void)
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+ WARN(ss->early_init && ss->css_rstat_flush,
+ "cgroup rstat cannot be used with early init subsystem\n");
ss->id = i;
ss->name = cgroup_subsys_name[i];
@@ -6082,9 +6355,10 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_rstat_boot();
+ BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
cgroup_lock();
@@ -6095,6 +6369,8 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
+ cgroup_bpf_lifetime_notifier_init();
+
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
cgroup_unlock();
@@ -6163,10 +6439,11 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
@@ -6180,8 +6457,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
@@ -6198,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
- * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * __cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp or ERR_PTR on failure
- * Only cgroups within current task's cgroup NS are valid.
+ * There are no cgroup NS restrictions.
*/
-struct cgroup *cgroup_get_from_id(u64 id)
+struct cgroup *__cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp, *root_cgrp;
+ struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
@@ -6228,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!cgrp)
return ERR_PTR(-ENOENT);
+ return cgrp;
+}
+
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct cgroup *cgrp, *root_cgrp;
+
+ cgrp = __cgroup_get_from_id(id);
+ if (IS_ERR(cgrp))
+ return cgrp;
root_cgrp = current_cgns_cgroup_dfl();
if (!cgroup_is_descendant(cgrp, root_cgrp)) {
@@ -6394,7 +6693,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
- struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
cgroup_lock();
@@ -6404,6 +6702,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6411,14 +6713,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return 0;
}
- f = fget_raw(kargs->cgroup);
- if (!f) {
+ CLASS(fd_raw, f)(kargs->cgroup);
+ if (fd_empty(f)) {
ret = -EBADF;
goto err;
}
- sb = f->f_path.dentry->d_sb;
+ sb = fd_file(f)->f_path.dentry->d_sb;
- dst_cgrp = cgroup_get_from_file(f);
+ dst_cgrp = cgroup_get_from_file(fd_file(f));
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
@@ -6466,15 +6768,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
}
put_css_set(cset);
- fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
cgroup_unlock();
- if (f)
- fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
@@ -6587,6 +6886,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6600,10 +6900,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6638,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
@@ -6669,25 +6972,37 @@ void cgroup_post_fork(struct task_struct *child,
}
/**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
@@ -6697,15 +7012,61 @@ void cgroup_exit(struct task_struct *tsk)
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
+}
+
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ per_cpu(cgrp_dead_tasks_iwork, cpu) =
+ IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+ }
}
-void cgroup_release(struct task_struct *task)
+void cgroup_task_dead(struct task_struct *task)
+{
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
@@ -6713,16 +7074,19 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
-
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
}
-void cgroup_free(struct task_struct *task)
+void cgroup_task_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
+
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
+
put_css_set(cset);
}
@@ -6880,14 +7244,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
*/
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
- struct cgroup *cgrp;
- struct fd f = fdget_raw(fd);
- if (!f.file)
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f.file);
- fdput(f);
- return cgrp;
+ return cgroup_v1v2_get_from_file(fd_file(f));
}
/**
@@ -7062,7 +7423,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n"
- "memory_hugetlb_accounting\n");
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);