Merge tag 'cgroup-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo: - cpuset isolation improvements - cpuset cgroup1 support is split into its own file behind the new config option CONFIG_CPUSET_V1. This makes it the second controller which makes cgroup1 support optional after memcg - Handling of unavailable v1 controller handling improved during cgroup1 mount operations - union_find applied to cpuset. It makes code simpler and more efficient - Reduce spurious events in pids.events - Cleanups and other misc changes - Contains a merge of cgroup/for-6.11-fixes to receive cpuset fixes that further changes build upon * tag 'cgroup-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (34 commits) cgroup: Do not report unavailable v1 controllers in /proc/cgroups cgroup: Disallow mounting v1 hierarchies without controller implementation cgroup/cpuset: Expose cpuset filesystem with cpuset v1 only cgroup/cpuset: Move cpu.h include to cpuset-internal.h cgroup/cpuset: add sefltest for cpuset v1 cgroup/cpuset: guard cpuset-v1 code under CONFIG_CPUSETS_V1 cgroup/cpuset: rename functions shared between v1 and v2 cgroup/cpuset: move v1 interfaces to cpuset-v1.c cgroup/cpuset: move validate_change_legacy to cpuset-v1.c cgroup/cpuset: move legacy hotplug update to cpuset-v1.c cgroup/cpuset: add callback_lock helper cgroup/cpuset: move memory_spread to cpuset-v1.c cgroup/cpuset: move relax_domain_level to cpuset-v1.c cgroup/cpuset: move memory_pressure to cpuset-v1.c cgroup/cpuset: move common code to cpuset-internal.h cgroup/cpuset: introduce cpuset-v1.c selftest/cgroup: Make test_cpuset_prs.sh deal with pre-isolated CPUs cgroup/cpuset: Account for boot time isolated CPUs cgroup/cpuset: remove use_parent_ecpus of cpuset cgroup/cpuset: remove fetch_xcpus ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-09-18 06:39:03 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-09-18 06:39:03 +0200
commit: 78567e2bc723b444228644d2e34ae5255d4ab8a0 (patch)
tree: 3f07fbf68ae127d9c97531a5a80f4eac74acef07 /kernel/cgroup/cgroup.c
parent: 2f27fce67173bbb05d5a0ee03dae5c021202c912 (diff)
parent: af000ce85293b8e608f696f0c6c280bc3a75887f (diff)
1 files changed, 63 insertions, 5 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..90e50d6d3cf3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
 	.fs_flags		= FS_USERNS_MOUNT,
 };
 
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 static const struct fs_context_operations cpuset_fs_context_ops = {
 	.get_tree	= cgroup1_get_tree,
 	.free		= cgroup_fs_context_free,
@@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 static int cgroup_stat_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct cgroup_subsys_state *css;
+	int dying_cnt[CGROUP_SUBSYS_COUNT];
+	int ssid;
 
 	seq_printf(seq, "nr_descendants %d\n",
 		   cgroup->nr_descendants);
+
+	/*
+	 * Show the number of live and dying csses associated with each of
+	 * non-inhibited cgroup subsystems that is bound to cgroup v2.
+	 *
+	 * Without proper lock protection, racing is possible. So the
+	 * numbers may not be consistent when that happens.
+	 */
+	rcu_read_lock();
+	for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+		dying_cnt[ssid] = -1;
+		if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+		    (cgroup_subsys[ssid]->root !=  &cgrp_dfl_root))
+			continue;
+		css = rcu_dereference_raw(cgroup->subsys[ssid]);
+		dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+		seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+			   css ? (css->nr_descendants + 1) : 0);
+	}
+
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
-
+	for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+		if (dying_cnt[ssid] >= 0)
+			seq_printf(seq, "nr_dying_subsys_%s %d\n",
+				   cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+	}
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -4096,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 	 * If namespaces are delegation boundaries, disallow writes to
 	 * files in an non-init namespace root from inside the namespace
 	 * except for the files explicitly marked delegatable -
-	 * cgroup.procs and cgroup.subtree_control.
+	 * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
 	 */
 	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
 	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work)
 	list_del_rcu(&css->sibling);
 
 	if (ss) {
+		struct cgroup *parent_cgrp;
+
 		/* css release path */
 		if (!list_empty(&css->rstat_css_node)) {
 			cgroup_rstat_flush(cgrp);
@@ -5433,6 +5463,21 @@ static void css_release_work_fn(struct work_struct *work)
 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
+
+		cgrp->nr_dying_subsys[ss->id]--;
+		/*
+		 * When a css is released and ready to be freed, its
+		 * nr_descendants must be zero. However, the corresponding
+		 * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+		 * is activated and deactivated multiple times with one or
+		 * more of its previous activation leaving behind dying csses.
+		 */
+		WARN_ON_ONCE(css->nr_descendants);
+		parent_cgrp = cgroup_parent(cgrp);
+		while (parent_cgrp) {
+			parent_cgrp->nr_dying_subsys[ss->id]--;
+			parent_cgrp = cgroup_parent(parent_cgrp);
+		}
 	} else {
 		struct cgroup *tcgrp;
 
@@ -5517,8 +5562,11 @@ static int online_css(struct cgroup_subsys_state *css)
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
 
 		atomic_inc(&css->online_cnt);
-		if (css->parent)
+		if (css->parent) {
 			atomic_inc(&css->parent->online_cnt);
+			while ((css = css->parent))
+				css->nr_descendants++;
+		}
 	}
 	return ret;
 }
@@ -5540,6 +5588,16 @@ static void offline_css(struct cgroup_subsys_state *css)
 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
 
 	wake_up_all(&css->cgroup->offline_waitq);
+
+	css->cgroup->nr_dying_subsys[ss->id]++;
+	/*
+	 * Parent css and cgroup cannot be freed until after the freeing
+	 * of child css, see css_free_rwork_fn().
+	 */
+	while ((css = css->parent)) {
+		css->nr_descendants--;
+		css->cgroup->nr_dying_subsys[ss->id]++;
+	}
 }
 
 /**
@@ -6178,7 +6236,7 @@ int __init cgroup_init(void)
 	WARN_ON(register_filesystem(&cgroup_fs_type));
 	WARN_ON(register_filesystem(&cgroup2_fs_type));
 	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 	WARN_ON(register_filesystem(&cpuset_fs_type));
 #endif
author	Linus Torvalds <torvalds@linux-foundation.org>	2024-09-18 06:39:03 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-09-18 06:39:03 +0200
commit	78567e2bc723b444228644d2e34ae5255d4ab8a0 (patch)
tree	3f07fbf68ae127d9c97531a5a80f4eac74acef07 /kernel/cgroup/cgroup.c
parent	2f27fce67173bbb05d5a0ee03dae5c021202c912 (diff)
parent	af000ce85293b8e608f696f0c6c280bc3a75887f (diff)