From 2246ca53d7b3c286348c9c4cc4d2551972619cc2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 1 Jul 2023 15:38:56 +0800 Subject: cgroup: remove unneeded return value of cgroup_rm_cftypes_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return value of cgroup_rm_cftypes_locked() is always 0. So remove it to simplify the code. No functional change intended. Signed-off-by: Miaohe Lin Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bfe3cd8ccf36..b0d98542eea2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4320,14 +4320,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; } -static int cgroup_rm_cftypes_locked(struct cftype *cfts) +static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); - return 0; } /** @@ -4343,8 +4342,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - int ret; - if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4352,9 +4349,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) return -ENOENT; cgroup_lock(); - ret = cgroup_rm_cftypes_locked(cfts); + cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); - return ret; + return 0; } /** -- cgit From 1299eb2b0ad5812dd6e5ea5b631da61f9e791bb3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 17:42:42 +0800 Subject: cgroup: minor cleanup for cgroup_extra_stat_show() Make it under CONFIG_CGROUP_SCHED to rid of __maybe_unused annotation. And further fetch cgrp inside cgroup_extra_stat_show() directly to rid of __maybe_unused annotation of cgrp. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b0d98542eea2..4137eb4a3000 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3654,9 +3654,10 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } -static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +#ifdef CONFIG_CGROUP_SCHED +static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { + struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; @@ -3672,15 +3673,15 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, css_put(css); return ret; } +#endif static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } -- cgit From a453be9725a1aa3f602f5b4c66eefd1fb4ba7c44 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 Jul 2023 19:30:49 +0800 Subject: cgroup/cpuset: simplify the percpu kthreads check in update_tasks_cpumask() kthread_is_per_cpu() can be called directly without checking whether PF_KTHREAD is set in task->flags. So remove PF_KTHREAD check to make code more concise. Signed-off-by: Miaohe Lin Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58e6f18f01c1..601c40da8e03 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) /* * Percpu kthreads in top_cpuset are ignored */ - if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) + if (kthread_is_per_cpu(task)) continue; cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); } else { -- cgit From 48f074565bb7eaa9ae502b2b2d423b1e50325e1b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 Jul 2023 20:03:52 +0800 Subject: cgroup/cpuset: avoid unneeded cpuset_mutex re-lock cpuset_mutex unlock and lock pair is only needed when transferring tasks out of empty cpuset. Avoid unneeded cpuset_mutex re-lock when !is_empty to save cpu cycles. Signed-off-by: Miaohe Lin Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 601c40da8e03..e136269c152c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3521,17 +3521,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); - /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into * cpuset. Should be done outside any lock. */ - if (is_empty) + if (is_empty) { + mutex_unlock(&cpuset_mutex); remove_tasks_in_empty_cpuset(cs); - - mutex_lock(&cpuset_mutex); + mutex_lock(&cpuset_mutex); + } } static void -- cgit From 0a67b847e1f06a70a7b560b69a06b3c78d3e72f0 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Mon, 3 Jul 2023 19:27:39 +0200 Subject: cpuset: Allow setscheduler regardless of manipulated task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we migrate a task between two cgroups, one of the checks is a verification whether we can modify task's scheduler settings (cap_task_setscheduler()). An implicit migration occurs also when enabling a controller on the unified hierarchy (think of parent to child migration). The aforementioned check may be problematic if the caller of the migration (enabling a controller) has no permissions over migrated tasks. For instance, a user's cgroup that ends up running a process of a different user. Although cgroup permissions are configured favorably, the enablement fails due to the foreign process [1]. Change the behavior by relaxing the permissions check on the unified hierarchy when no effective change would happen. This is in accordance with unified hierarchy attachment behavior when permissions of the source to target cgroups are decisive whereas the migrated task is opaque (as opposed to more restrictive check in __cgroup1_procs_write()). Notice that foreign task's affinity may still be modified if the user can modify destination cgroup's cpuset attributes (update_tasks_cpumask() does no permissions check). The permissions check could thus be skipped on v2 even when affinity changes. Stay conservative in this patch though. [1] https://github.com/systemd/systemd/issues/18293#issuecomment-831205649 Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e136269c152c..a46deb8a64dd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2487,6 +2487,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; + bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ @@ -2501,13 +2502,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; + cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - ret = security_task_setscheduler(task); - if (ret) - goto out_unlock; + + /* + * Skip rights over task check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpus_updated || mems_updated)) { + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; -- cgit From 12101424d7d26131495bafc2ecfa17d39b8e3c64 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Mon, 3 Jul 2023 19:27:40 +0200 Subject: selftests: cgroup: Minor code reorganizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended, these small changes are merged into one commit and they serve as a preparation for an upcoming new testcase. Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + tools/testing/selftests/cgroup/cgroup_util.c | 2 ++ tools/testing/selftests/cgroup/cgroup_util.h | 2 ++ tools/testing/selftests/cgroup/test_core.c | 2 +- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 2 +- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3be1bdfe8ecc..318bd4e2d7f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5222,6 +5222,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git F: Documentation/admin-guide/cgroup-v1/cpusets.rst F: include/linux/cpuset.h F: kernel/cgroup/cpuset.c +F: tools/testing/selftests/cgroup/test_cpuset_prs.sh CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index e8bbbdb77e0d..0340d4ca8f51 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -286,6 +286,8 @@ int cg_destroy(const char *cgroup) { int ret; + if (!cgroup) + return 0; retry: ret = rmdir(cgroup); if (ret && errno == EBUSY) { diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index c92df4e5d395..1df7f202214a 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -11,6 +11,8 @@ #define USEC_PER_SEC 1000000L #define NSEC_PER_SEC 1000000000L +#define TEST_UID 65534 /* usually nobody, any !root is fine */ + /* * Checks if two given values differ by less than err% of their sum. */ diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 600123503063..80aa6b2373b9 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -683,7 +683,7 @@ cleanup: */ static int test_cgcore_lesser_euid_open(const char *root) { - const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + const uid_t test_euid = TEST_UID; int ret = KSFT_FAIL; char *cg_test_a = NULL, *cg_test_b = NULL; char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 2b5215cc599f..4afb132e4e4f 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -10,7 +10,7 @@ skip_test() { echo "$1" echo "Test SKIPPED" - exit 0 + exit 4 # ksft_skip } [[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" -- cgit From cd3c6f682df4df7de74a364c34b3b10f84db271b Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Mon, 3 Jul 2023 19:27:41 +0200 Subject: selftests: cgroup: Add cpuset migrations testcase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a separate testfile to verify treating permissions when tasks are migrated on cgroup v2 hierarchy between cpuset cgroups. In accordance with v2 design, migration should be allowed based on delegation boundaries (i.e. cgroup.procs permissions) and does not depend on the migrated object (i.e. unprivileged process can migrate another process (even privileged) as long as it remains in the original dedicated scope). Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + tools/testing/selftests/cgroup/.gitignore | 1 + tools/testing/selftests/cgroup/Makefile | 2 + tools/testing/selftests/cgroup/test_cpuset.c | 275 +++++++++++++++++++++++++++ 4 files changed, 279 insertions(+) create mode 100644 tools/testing/selftests/cgroup/test_cpuset.c diff --git a/MAINTAINERS b/MAINTAINERS index 318bd4e2d7f5..86e9f56a89bf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5222,6 +5222,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git F: Documentation/admin-guide/cgroup-v1/cpusets.rst F: include/linux/cpuset.h F: kernel/cgroup/cpuset.c +F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index c4a57e69f749..8443a8d46a1c 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -5,4 +5,5 @@ test_freezer test_kmem test_kill test_cpu +test_cpuset wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 3d263747d2ad..dee0f013c7f4 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -12,6 +12,7 @@ TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer TEST_GEN_PROGS += test_kill TEST_GEN_PROGS += test_cpu +TEST_GEN_PROGS += test_cpuset LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h @@ -23,3 +24,4 @@ $(OUTPUT)/test_core: cgroup_util.c $(OUTPUT)/test_freezer: cgroup_util.c $(OUTPUT)/test_kill: cgroup_util.c $(OUTPUT)/test_cpu: cgroup_util.c +$(OUTPUT)/test_cpuset: cgroup_util.c diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c new file mode 100644 index 000000000000..b061ed1e05b4 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "../kselftest.h" +#include "cgroup_util.h" + +static int idle_process_fn(const char *cgroup, void *arg) +{ + (void)pause(); + return 0; +} + +static int do_migration_fn(const char *cgroup, void *arg) +{ + int object_pid = (int)(size_t)arg; + + if (setuid(TEST_UID)) + return EXIT_FAILURE; + + // XXX checking /proc/$pid/cgroup would be quicker than wait + if (cg_enter(cgroup, object_pid) || + cg_wait_for_proc_count(cgroup, 1)) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +static int do_controller_fn(const char *cgroup, void *arg) +{ + const char *child = cgroup; + const char *parent = arg; + + if (setuid(TEST_UID)) + return EXIT_FAILURE; + + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) + return EXIT_FAILURE; + + if (cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + if (cg_write(parent, "cgroup.subtree_control", "-cpuset")) + return EXIT_FAILURE; + + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +/* + * Migrate a process between two sibling cgroups. + * The success should only depend on the parent cgroup permissions and not the + * migrated process itself (cpuset controller is in place because it uses + * security_task_setscheduler() in cgroup v1). + * + * Deliberately don't set cpuset.cpus in children to avoid definining migration + * permissions between two different cpusets. + */ +static int test_cpuset_perms_object(const char *root, bool allow) +{ + char *parent = NULL, *child_src = NULL, *child_dst = NULL; + char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL; + const uid_t test_euid = TEST_UID; + int object_pid = 0; + int ret = KSFT_FAIL; + + parent = cg_name(root, "cpuset_test_0"); + if (!parent) + goto cleanup; + parent_procs = cg_name(parent, "cgroup.procs"); + if (!parent_procs) + goto cleanup; + if (cg_create(parent)) + goto cleanup; + + child_src = cg_name(parent, "cpuset_test_1"); + if (!child_src) + goto cleanup; + child_src_procs = cg_name(child_src, "cgroup.procs"); + if (!child_src_procs) + goto cleanup; + if (cg_create(child_src)) + goto cleanup; + + child_dst = cg_name(parent, "cpuset_test_2"); + if (!child_dst) + goto cleanup; + child_dst_procs = cg_name(child_dst, "cgroup.procs"); + if (!child_dst_procs) + goto cleanup; + if (cg_create(child_dst)) + goto cleanup; + + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) + goto cleanup; + + if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") || + cg_read_strstr(child_dst, "cgroup.controllers", "cpuset")) + goto cleanup; + + /* Enable permissions along src->dst tree path */ + if (chown(child_src_procs, test_euid, -1) || + chown(child_dst_procs, test_euid, -1)) + goto cleanup; + + if (allow && chown(parent_procs, test_euid, -1)) + goto cleanup; + + /* Fork a privileged child as a test object */ + object_pid = cg_run_nowait(child_src, idle_process_fn, NULL); + if (object_pid < 0) + goto cleanup; + + /* Carry out migration in a child process that can drop all privileges + * (including capabilities), the main process must remain privileged for + * cleanup. + * Child process's cgroup is irrelevant but we place it into child_dst + * as hacky way to pass information about migration target to the child. + */ + if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (object_pid > 0) { + (void)kill(object_pid, SIGTERM); + (void)clone_reap(object_pid, WEXITED); + } + + cg_destroy(child_dst); + free(child_dst_procs); + free(child_dst); + + cg_destroy(child_src); + free(child_src_procs); + free(child_src); + + cg_destroy(parent); + free(parent_procs); + free(parent); + + return ret; +} + +static int test_cpuset_perms_object_allow(const char *root) +{ + return test_cpuset_perms_object(root, true); +} + +static int test_cpuset_perms_object_deny(const char *root) +{ + return test_cpuset_perms_object(root, false); +} + +/* + * Migrate a process between parent and child implicitely + * Implicit migration happens when a controller is enabled/disabled. + * + */ +static int test_cpuset_perms_subtree(const char *root) +{ + char *parent = NULL, *child = NULL; + char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL; + const uid_t test_euid = TEST_UID; + int object_pid = 0; + int ret = KSFT_FAIL; + + parent = cg_name(root, "cpuset_test_0"); + if (!parent) + goto cleanup; + parent_procs = cg_name(parent, "cgroup.procs"); + if (!parent_procs) + goto cleanup; + parent_subctl = cg_name(parent, "cgroup.subtree_control"); + if (!parent_subctl) + goto cleanup; + if (cg_create(parent)) + goto cleanup; + + child = cg_name(parent, "cpuset_test_1"); + if (!child) + goto cleanup; + child_procs = cg_name(child, "cgroup.procs"); + if (!child_procs) + goto cleanup; + if (cg_create(child)) + goto cleanup; + + /* Enable permissions as in a delegated subtree */ + if (chown(parent_procs, test_euid, -1) || + chown(parent_subctl, test_euid, -1) || + chown(child_procs, test_euid, -1)) + goto cleanup; + + /* Put a privileged child in the subtree and modify controller state + * from an unprivileged process, the main process remains privileged + * for cleanup. + * The unprivileged child runs in subtree too to avoid parent and + * internal-node constraing violation. + */ + object_pid = cg_run_nowait(child, idle_process_fn, NULL); + if (object_pid < 0) + goto cleanup; + + if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (object_pid > 0) { + (void)kill(object_pid, SIGTERM); + (void)clone_reap(object_pid, WEXITED); + } + + cg_destroy(child); + free(child_procs); + free(child); + + cg_destroy(parent); + free(parent_subctl); + free(parent_procs); + free(parent); + + return ret; +} + + +#define T(x) { x, #x } +struct cpuset_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_cpuset_perms_object_allow), + T(test_cpuset_perms_object_deny), + T(test_cpuset_perms_subtree), +}; +#undef T + +int main(int argc, char *argv[]) +{ + char root[PATH_MAX]; + int i, ret = EXIT_SUCCESS; + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset")) + if (cg_write(root, "cgroup.subtree_control", "+cpuset")) + ksft_exit_skip("Failed to set cpuset controller\n"); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + return ret; +} -- cgit From 868f87b3759bb7bedb081fdd40ef8d143c003fa6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 27 Jun 2023 19:40:59 +0800 Subject: cgroup: fix obsolete comment above for_each_css() cgroup_tree_mutex is removed since commit 8353da1f91f1 ("cgroup: remove cgroup_tree_mutex"), update corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4137eb4a3000..13d9ea33eb6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -679,7 +679,7 @@ EXPORT_SYMBOL_GPL(of_css); * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_[tree_]mutex. + * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ -- cgit From c8c926200c55454101f072a4b16c9ff5b8c9e56f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 27 Jun 2023 10:35:00 -0400 Subject: cgroup/cpuset: Inherit parent's load balance state in v2 Since commit f28e22441f35 ("cgroup/cpuset: Add a new isolated cpus.partition type"), the CS_SCHED_LOAD_BALANCE bit of a v2 cpuset can be on or off. The child cpusets of a partition root must have the same setting as its parent or it may screw up the rebuilding of sched domains. Fix this problem by making sure the a child v2 cpuset will follows its parent cpuset load balance state unless the child cpuset is a new partition root itself. Fixes: f28e22441f35 ("cgroup/cpuset: Add a new isolated cpus.partition type") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a46deb8a64dd..71e4b5b83614 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1588,11 +1588,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, } /* - * Skip the whole subtree if the cpumask remains the same - * and has no partition root state and force flag not set. + * Skip the whole subtree if + * 1) the cpumask remains the same, + * 2) has no partition root state, + * 3) force flag not set, and + * 4) for v2 load balance state same as its parent. */ if (!cp->partition_root_state && !force && - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1675,6 +1680,20 @@ update_parent_subparts: update_tasks_cpumask(cp, tmp->new_cpus); + /* + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE + * from parent if current cpuset isn't a valid partition root + * and their load balance states differ. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_partition_valid(cp) && + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { + if (is_sched_load_balance(parent)) + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + } + /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. @@ -3235,6 +3254,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->use_parent_ecpus = true; parent->child_ecpus_count++; } + + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) -- cgit From a86ce68078b200a5dcc263da01154ee926c25188 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 27 Jun 2023 10:35:01 -0400 Subject: cgroup/cpuset: Extract out CS_CPU_EXCLUSIVE & CS_SCHED_LOAD_BALANCE handling Extract out the setting of CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE flags as well as the rebuilding of scheduling domains into the new update_partition_exclusive() and update_partition_sd_lb() helper functions to simplify the logic. The update_partition_exclusive() helper is called mainly at the beginning of the caller, but it may be called at the end too. The update_partition_sd_lb() helper is called at the end of the caller. This patch should reduce the chance that cpuset partition will end up in an incorrect state. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 141 ++++++++++++++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 55 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 71e4b5b83614..d8d3b750c138 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus) { + if (parent->nr_subparts_cpus && is_partition_valid(cs)) { cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); @@ -1277,6 +1277,50 @@ enum subparts_cmd { static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); + +/* + * Update partition exclusive flag + * + * Return: 0 if successful, an error code otherwise + */ +static int update_partition_exclusive(struct cpuset *cs, int new_prs) +{ + bool exclusive = (new_prs > 0); + + if (exclusive && !is_cpu_exclusive(cs)) { + if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + return PERR_NOTEXCL; + } else if (!exclusive && is_cpu_exclusive(cs)) { + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + return 0; +} + +/* + * Update partition load balance flag and/or rebuild sched domain + * + * Changing load balance flag will automatically call + * rebuild_sched_domains_locked(). + */ +static void update_partition_sd_lb(struct cpuset *cs, int old_prs) +{ + int new_prs = cs->partition_root_state; + bool new_lb = (new_prs != PRS_ISOLATED); + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + + if (new_lb != !!is_sched_load_balance(cs)) { + rebuild_domains = true; + if (new_lb) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + + if (rebuild_domains) + rebuild_sched_domains_locked(); +} + /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1336,8 +1380,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if ((newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cs->cpus_allowed))) + if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; /* @@ -1403,11 +1446,16 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); + /* + * Empty cpumask is not allewed + */ + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ - if (adding && + } else if (adding && cpumask_subset(parent->effective_cpus, tmp->addmask) && !cpumask_intersects(tmp->delmask, cpu_active_mask) && partition_is_populated(parent, cs)) { @@ -1480,14 +1528,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. + * changing CS_CPU_EXCLUSIVE. */ if (old_prs != new_prs) { - if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && - (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) - return PERR_NOTEXCL; - if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + int err = update_partition_exclusive(cs, new_prs); + + if (err) + return err; } /* @@ -1524,15 +1571,16 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, update_tasks_cpumask(parent, tmp->addmask); /* - * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. - * rebuild_sched_domains_locked() may be called. + * For partcmd_update without newmask, it is being called from + * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. + * Update the load balance flag and scheduling domain if + * cpus_read_trylock() is successful. */ - if (old_prs != new_prs) { - if (old_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - else if (new_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + update_partition_sd_lb(cs, old_prs); + cpus_read_unlock(); } + notify_partition_change(cs, old_prs); return 0; } @@ -1766,6 +1814,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; struct tmpmasks tmp; bool invalidate = false; + int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1885,6 +1934,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmp); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ + update_partition_sd_lb(cs, old_prs); } return 0; } @@ -2261,7 +2313,6 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; - bool sched_domain_rebuilt = false; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; @@ -2280,45 +2331,28 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; + err = update_partition_exclusive(cs, new_prs); + if (err) + goto out; + if (!old_prs) { /* - * Turning on partition root requires setting the - * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be empty. + * cpus_allowed cannot be empty. */ if (cpumask_empty(cs->cpus_allowed)) { err = PERR_CPUSEMPTY; goto out; } - err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) { - err = PERR_NOTEXCL; - goto out; - } - err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); - if (err) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + if (err) goto out; - } - - if (new_prs == PRS_ISOLATED) { - /* - * Disable the load balance flag should not return an - * error unless the system is running out of memory. - */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - sched_domain_rebuilt = true; - } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); - sched_domain_rebuilt = true; - goto out; /* Sched domain is rebuilt in update_flag() */ + goto out; } else { /* * Switching back to member is always allowed even if it @@ -2337,15 +2371,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) compute_effective_cpumask(cs->effective_cpus, cs, parent); spin_unlock_irq(&callback_lock); } - - /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - - if (!is_sched_load_balance(cs)) { - /* Make sure load balance is on */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - sched_domain_rebuilt = true; - } } update_tasks_cpumask(parent, tmpmask.new_cpus); @@ -2353,18 +2378,21 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); - if (!sched_domain_rebuilt) - rebuild_sched_domains_locked(); out: /* - * Make partition invalid if an error happen + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error + * happens. */ - if (err) + if (err) { new_prs = -new_prs; + update_partition_exclusive(cs, new_prs); + } + spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); + /* * Update child cpusets, if present. * Force update if switching back to member. @@ -2372,6 +2400,9 @@ out: if (!list_empty(&cs->css.children)) update_cpumasks_hier(cs, &tmpmask, !new_prs); + /* Update sched domains and load balance flag */ + update_partition_sd_lb(cs, old_prs); + notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); return 0; -- cgit From 99fe36ba6fc16aa0962bc1f41ebcdec696203889 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 27 Jun 2023 10:35:02 -0400 Subject: cgroup/cpuset: Improve temporary cpumasks handling The limitation that update_parent_subparts_cpumask() can only use addmask & delmask in the given tmp cpumasks is fragile and may lead to unexpected error. Fix this problem by allocating/freeing a struct tmpmasks in update_cpumask() to avoid reusing the cpumasks in trial_cs. With this change, we can move the update_tasks_cpumask() for the parent and update_sibling_cpumasks() for the sibling to inside update_parent_subparts_cpumask(). Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d8d3b750c138..c9bb44e50eb2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1277,6 +1277,8 @@ enum subparts_cmd { static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp); /* * Update partition exclusive flag @@ -1447,7 +1449,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); /* - * Empty cpumask is not allewed + * Empty cpumask is not allowed */ if (cpumask_empty(newmask)) { part_error = PERR_CPUSEMPTY; @@ -1567,8 +1569,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); - if (adding || deleting) + if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, tmp); + } /* * For partcmd_update without newmask, it is being called from @@ -1842,18 +1847,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; -#ifdef CONFIG_CPUMASK_OFFSTACK - /* - * Use the cpumasks in trialcs for tmpmasks when they are pointers - * to allocated cpumasks. - * - * Note that update_parent_subparts_cpumask() uses only addmask & - * delmask, but not new_cpus. - */ - tmp.addmask = trialcs->subparts_cpus; - tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = NULL; -#endif + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; retval = validate_change(cs, trialcs); @@ -1882,7 +1877,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = 0; } if (retval < 0) - return retval; + goto out_free; if (cs->partition_root_state) { if (invalidate) @@ -1917,11 +1912,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } spin_unlock_irq(&callback_lock); -#ifdef CONFIG_CPUMASK_OFFSTACK - /* Now trialcs->cpus_allowed is available */ - tmp.new_cpus = trialcs->cpus_allowed; -#endif - /* effective_cpus will be updated here */ update_cpumasks_hier(cs, &tmp, false); @@ -1938,6 +1928,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ update_partition_sd_lb(cs, old_prs); } +out_free: + free_cpumasks(NULL, &tmp); return 0; } @@ -2346,13 +2338,11 @@ static int update_prstate(struct cpuset *cs, int new_prs) err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); - if (err) - goto out; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - goto out; + ; } else { /* * Switching back to member is always allowed even if it @@ -2372,12 +2362,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) spin_unlock_irq(&callback_lock); } } - - update_tasks_cpumask(parent, tmpmask.new_cpus); - - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmpmask); - out: /* * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error -- cgit From 3ae0b773211ed0231e7ee3e8d28ec4ab9bc5134b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 27 Jun 2023 10:35:03 -0400 Subject: cgroup/cpuset: Allow suppression of sched domain rebuild in update_cpumasks_hier() A single partition setup and tear-down operation can lead to multiple rebuild_sched_domains_locked() calls which is a waste of effort. This can partly be mitigated by adding a flag to suppress the rebuild_sched_domains_locked() call in update_cpumasks_hier(). Since a Boolean flag has already been passed as the 3rd argument to update_cpumasks_hier(), we can extend that to a full flag word. The sched domain rebuild suppression is now enabled in update_sibling_cpumasks() as all it callers will do the sched domain rebuild after its return later on anyway. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c9bb44e50eb2..b278b60ed788 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1590,6 +1590,12 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return 0; } +/* + * update_cpumasks_hier() flags + */ +#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ +#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ + /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider @@ -1604,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - bool force) + int flags) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1644,10 +1650,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, - * 3) force flag not set, and + * 3) HIER_CHECKALL flag not set, and * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && !force && + if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { @@ -1764,7 +1770,7 @@ update_parent_subparts: } rcu_read_unlock(); - if (need_rebuild_sched_domains) + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) rebuild_sched_domains_locked(); } @@ -1788,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * to use the right effective_cpus value. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. + * release the RCU read lock before calling it. HIER_NO_SD_REBUILD + * flag is used to suppress rebuild of sched domains as the callers + * will take care of that. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1800,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, false); + update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); rcu_read_lock(); css_put(&sibling->css); } @@ -1913,7 +1921,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* effective_cpus will be updated here */ - update_cpumasks_hier(cs, &tmp, false); + update_cpumasks_hier(cs, &tmp, 0); if (cs->partition_root_state) { struct cpuset *parent = parent_cs(cs); @@ -2382,7 +2390,7 @@ out: * Force update if switching back to member. */ if (!list_empty(&cs->css.children)) - update_cpumasks_hier(cs, &tmpmask, !new_prs); + update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); -- cgit From d1d4ff5d11a5887a9c4cfc00294bc68ba03e7c16 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 10:38:20 +0800 Subject: cgroup: put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED Put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED to fix the warning of 'cgroup_tryget_css' defined but not used [-Wunused-function] when CONFIG_CGROUP_SCHED is disabled. Signed-off-by: Miaohe Lin Reviewed-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 13d9ea33eb6d..30c9c2503ccf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,28 +492,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->self; } -/** - * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get @cgrp's css associated with @ss. If the css doesn't exist - * or is offline, %NULL is returned. - */ -static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - css = cgroup_css(cgrp, ss); - if (css && !css_tryget_online(css)) - css = NULL; - rcu_read_unlock(); - - return css; -} - /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest @@ -3655,6 +3633,28 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) } #ifdef CONFIG_CGROUP_SCHED +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (css && !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; -- cgit From ceddae22cd08ba9f52a995cfb573fee89fa4afc4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 19:59:38 +0800 Subject: cgroup: remove obsolete comment above struct cgroupstats There's no flag in the delay accounting structure indicates that the task is waiting on IO since commit 1193829da1a6 ("delayacct: cleanup flags in struct task_delay_info and functions use it"). So remove the comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- include/uapi/linux/cgroupstats.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/uapi/linux/cgroupstats.h b/include/uapi/linux/cgroupstats.h index aa306e4cd6c1..80b2c8594480 100644 --- a/include/uapi/linux/cgroupstats.h +++ b/include/uapi/linux/cgroupstats.h @@ -24,8 +24,6 @@ * basis. This data is shared using taskstats. * * Most of these states are derived by looking at the task->state value - * For the nr_io_wait state, a flag in the delay accounting structure - * indicates that the task is waiting on IO * * Each member is aligned to a 8 byte boundary. */ -- cgit From fcbb485d9f720b176d7bac0802181901eedddbed Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 15 Jul 2023 11:08:29 +0800 Subject: cgroup: use cached local variable parent in for loop Use local variable parent to initialize iter tcgrp in for loop so the size of cgroup.o can be reduced by 64 bytes. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 30c9c2503ccf..e9a62de3faf9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5894,7 +5894,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* -- cgit From 6f71780e7fadf778eeb45e830d31339af53533fc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 17 Jul 2023 19:28:00 +0800 Subject: cgroup: fix obsolete function name cgroup_taskset_migrate() has been renamed to cgroup_migrate_execute() since commit e595cd706982 ("cgroup: track migration context in cgroup_mgctx"). Update the corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e9a62de3faf9..aa2cc32e60c0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2477,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /* * This function may be called both before and - * after cgroup_taskset_migrate(). The two cases + * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ -- cgit From c25ff4b911a11908c7f05bc07cc6762f9cece2b5 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Mon, 17 Jul 2023 20:09:23 +0530 Subject: cgroup: remove cgrp->kn check in css_populate_dir() cgroup_create() creates cgrp and assigns the kernfs_node to cgrp->kn, then cgroup_mkdir() populates base and csses cft file by calling css_populate_dir() and cgroup_apply_control_enable() with a valid cgrp->kn. Check for NULL cgrp->kn, will always be false, remove it. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aa2cc32e60c0..3c1bb9ecea10 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1714,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) struct cftype *cfts, *failed_cfts; int ret; - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { -- cgit From 62157e11d9a4ca7210bb2b0e8fa0557a6ada7fad Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 18 Jul 2023 14:38:34 +0530 Subject: cgroup/misc: update struct members descriptions Update the miscellaneous controller's structure member's description of struct misc_res and struct misc_cg. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index c238207d1615..6555c0f57158 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -31,7 +31,7 @@ struct misc_cg; * struct misc_res: Per cgroup per misc type resource * @max: Maximum limit on the resource. * @usage: Current usage of the resource. - * @failed: True if charged failed for the resource in a cgroup. + * @events: Number of times, the resource limit exceeded. */ struct misc_res { unsigned long max; @@ -42,6 +42,7 @@ struct misc_res { /** * struct misc_cg - Miscellaneous controller's cgroup structure. * @css: cgroup subsys state object. + * @events_file: Handle for the misc resources events file. * @res: Array of misc resources usage in the cgroup. */ struct misc_cg { -- cgit From 32bf85c60ca3584a7ba3bef19da2779b73b2e7d6 Mon Sep 17 00:00:00 2001 From: Haitao Huang Date: Mon, 17 Jul 2023 18:08:45 -0700 Subject: cgroup/misc: Change counters to be explicit 64bit types So the variables can account for resources of huge quantities even on 32-bit machines. Signed-off-by: Haitao Huang Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 25 +++++++++------------ kernel/cgroup/misc.c | 55 ++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 6555c0f57158..e799b1f8d05b 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -34,9 +34,9 @@ struct misc_cg; * @events: Number of times, the resource limit exceeded. */ struct misc_res { - unsigned long max; - atomic_long_t usage; - atomic_long_t events; + u64 max; + atomic64_t usage; + atomic64_t events; }; /** @@ -54,12 +54,10 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -unsigned long misc_cg_res_total_usage(enum misc_res_type type); -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity); -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount); -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount); +u64 misc_cg_res_total_usage(enum misc_res_type type); +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); /** * css_misc() - Get misc cgroup from the css. @@ -100,27 +98,26 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type) +static inline u64 misc_cg_res_total_usage(enum misc_res_type type) { return 0; } -static inline int misc_cg_set_capacity(enum misc_res_type type, - unsigned long capacity) +static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; } static inline int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { return 0; } static inline void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { } diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ae2f4dd47508..abbe9aa5cdd1 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -14,7 +14,7 @@ #include #define MAX_STR "max" -#define MAX_NUM ULONG_MAX +#define MAX_NUM U64_MAX /* Miscellaneous res name, keep it in sync with enum misc_res_type */ static const char *const misc_res_name[] = { @@ -37,7 +37,7 @@ static struct misc_cg root_cg; * more than the actual capacity. We are using Limits resource distribution * model of cgroup for miscellaneous controller. */ -static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; +static u64 misc_res_capacity[MISC_CG_RES_TYPES]; /** * parent_misc() - Get the parent of the passed misc cgroup. @@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type) * Context: Any context. * Return: Current total usage of the resource. */ -unsigned long misc_cg_res_total_usage(enum misc_res_type type) +u64 misc_cg_res_total_usage(enum misc_res_type type) { if (valid_type(type)) - return atomic_long_read(&root_cg.res[type].usage); + return atomic64_read(&root_cg.res[type].usage); return 0; } @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); * * %0 - Successfully registered the capacity. * * %-EINVAL - If @type is invalid. */ -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { if (!valid_type(type)) return -EINVAL; @@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity); * Context: Any context. */ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), "misc cgroup resource %s became less than 0", misc_res_name[type]); } @@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, * * -EBUSY - If max limit will be crossed or total usage will be more than the * capacity. */ -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i, *j; int ret; struct misc_res *res; - int new_usage; + s64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; @@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, for (i = cg; i; i = parent_misc(i)) { res = &i->res[type]; - new_usage = atomic_long_add_return(amount, &res->usage); + new_usage = atomic64_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { ret = -EBUSY; @@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, err_charge: for (j = i; j; j = parent_misc(j)) { - atomic_long_inc(&j->res[type].events); + atomic64_inc(&j->res[type].events); cgroup_file_notify(&j->events_file); } @@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge); * * Context: Any context. */ -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i; @@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) { int i; struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long max; + u64 max; for (i = 0; i < MISC_CG_RES_TYPES; i++) { if (READ_ONCE(misc_res_capacity[i])) { @@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) if (max == MAX_NUM) seq_printf(sf, "%s max\n", misc_res_name[i]); else - seq_printf(sf, "%s %lu\n", misc_res_name[i], + seq_printf(sf, "%s %llu\n", misc_res_name[i], max); } } @@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) * Return: * * >= 0 - Number of bytes processed in the input. * * -EINVAL - If buf is not valid. - * * -ERANGE - If number is bigger than the unsigned long capacity. + * * -ERANGE - If number is bigger than the u64 capacity. */ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct misc_cg *cg; - unsigned long max; + u64 max; int ret = 0, i; enum misc_res_type type = MISC_CG_RES_TYPES; char *token; @@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, if (!strcmp(MAX_STR, buf)) { max = MAX_NUM; } else { - ret = kstrtoul(buf, 0, &max); + ret = kstrtou64(buf, 0, &max); if (ret) return ret; } @@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, static int misc_cg_current_show(struct seq_file *sf, void *v) { int i; - unsigned long usage; + u64 usage; struct misc_cg *cg = css_misc(seq_css(sf)); for (i = 0; i < MISC_CG_RES_TYPES; i++) { - usage = atomic_long_read(&cg->res[i].usage); + usage = atomic64_read(&cg->res[i].usage); if (READ_ONCE(misc_res_capacity[i]) || usage) - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); } return 0; @@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) static int misc_cg_capacity_show(struct seq_file *sf, void *v) { int i; - unsigned long cap; + u64 cap; for (i = 0; i < MISC_CG_RES_TYPES; i++) { cap = READ_ONCE(misc_res_capacity[i]); if (cap) - seq_printf(sf, "%s %lu\n", misc_res_name[i], cap); + seq_printf(sf, "%s %llu\n", misc_res_name[i], cap); } return 0; @@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) static int misc_events_show(struct seq_file *sf, void *v) { struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long events, i; + u64 events; + int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic_long_read(&cg->res[i].events); + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) - seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } @@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css) for (i = 0; i < MISC_CG_RES_TYPES; i++) { WRITE_ONCE(cg->res[i].max, MAX_NUM); - atomic_long_set(&cg->res[i].usage, 0); + atomic64_set(&cg->res[i].usage, 0); } return &cg->css; -- cgit From 714e08cc3ec564dc1717000f5a4c1f40ca8c8878 Mon Sep 17 00:00:00 2001 From: Haitao Huang Date: Fri, 21 Jul 2023 05:02:31 -0700 Subject: cgroup/misc: Store atomic64_t reads to u64 Change 'new_usage' type to u64 so it can be compared with unsigned 'max' and 'capacity' properly even if the value crosses the signed boundary. Signed-off-by: Haitao Huang Signed-off-by: Tejun Heo --- kernel/cgroup/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index abbe9aa5cdd1..79a3717a5803 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -142,7 +142,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) struct misc_cg *i, *j; int ret; struct misc_res *res; - s64 new_usage; + u64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; -- cgit From fe9ebb8cec7968e2e758670952bbe4f1a453ef08 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Fri, 21 Jul 2023 13:49:37 +0800 Subject: docs: cgroup-v1: correct the term of Page Cache organization in inode The radix-tree for Page Cache has been replaced with xarray, see commit eb797a8ee0ab ("page cache: Rearrange address_space"), so move "radix-tree" to "xarray". Signed-off-by: Xiongwei Song Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index fabaad3fd9c2..b49bae598a38 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -197,11 +197,11 @@ are not accounted. We just account pages under usual VM management. RSS pages are accounted at page_fault unless they've already been accounted for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of +inserted into inode (xarray). While it's mapped into the page tables of processes, duplicate accounting is carefully avoided. An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unaccounted when it's removed from xarray. Even if RSS pages are fully unmapped (by kswapd), they may exist as SwapCache in the system until they are really freed. Such SwapCaches are also accounted. A swapped-in page is accounted after adding into swapcache. -- cgit From ab8aebdc9f7b9e2768c8a5776d028f4a10d05c4b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Fri, 21 Jul 2023 13:49:38 +0800 Subject: docs: cgroup-v1: fix typo "listers" -> "listeners" Signed-off-by: Xiongwei Song Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index b49bae598a38..304b8aae03c9 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -909,7 +909,7 @@ experiences some pressure. In this situation, only group C will receive the notification, i.e. groups A and B will not receive it. This is done to avoid excessive "broadcasting" of messages, which disturbs the system and which is especially bad if we are low on memory or thrashing. Group B, will receive -notification only if there are no event listers for group C. +notification only if there are no event listeners for group C. There are three optional modes that specify different propagation behavior: -- cgit From a3fdeeb3f1c1fad3b7b07aa96a7e422fde3c2c15 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 19 Jul 2023 17:06:40 +0800 Subject: cgroup: fix obsolete comment above cgroup_create() Since commit 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID"), cgrp is associated with its kernfs_node. Update corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c1bb9ecea10..c7aafb59ecf2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5545,8 +5545,7 @@ err_free_css: /* * The returned cgroup is fully initialized including its control mask, but - * it isn't associated with its kernfs_node and doesn't have the control - * mask applied. + * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) -- cgit From 55a5956a55b453ccbfb3297bdf34091e23d47455 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 1 Aug 2023 12:52:14 +0530 Subject: cgroup: clean up printk() Convert the only printk() to use pr_*() helper. No functional change. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c7aafb59ecf2..33b586db14ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6086,8 +6086,8 @@ int __init cgroup_init(void) continue; if (cgroup1_ssid_disabled(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); + pr_info("Disabling %s control group subsystem in v1 mounts\n", + ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; -- cgit From 05f76ae95e71149a9c7085c323d3f710dff22730 Mon Sep 17 00:00:00 2001 From: Cai Xinchen Date: Wed, 2 Aug 2023 03:04:12 +0000 Subject: cgroup/cpuset: fix kernel-doc Add kernel-doc of param @rotor to fix warnings: kernel/cgroup/cpuset.c:4162: warning: Function parameter or member 'rotor' not described in 'cpuset_spread_node' kernel/cgroup/cpuset.c:3771: warning: Function parameter or member 'work' not described in 'cpuset_hotplug_workfn' Signed-off-by: Cai Xinchen Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b278b60ed788..58ec88efa4f8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3753,6 +3753,7 @@ unlock: /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * @work: unused * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4135,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) /** * cpuset_spread_node() - On which node to begin search for a page + * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), -- cgit From a2c15fece4b47e2e5a89a3bf273c9c6517c3e8a5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 1 Aug 2023 20:40:34 +0800 Subject: cgroup: fix obsolete function name above css_free_rwork_fn() Since commit 8f36aaec9c92 ("cgroup: Use rcu_work instead of explicit rcu and work item"), css_free_work_fn has been renamed to css_free_rwork_fn. Update corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 33b586db14ef..f9cde56cf947 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5301,7 +5301,7 @@ static struct cftype cgroup_psi_files[] = { * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). + * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional -- cgit From 95f5c19c8c04a81477b73b478d10d4eda7aa2f69 Mon Sep 17 00:00:00 2001 From: Han Dapeng Date: Thu, 3 Aug 2023 23:55:27 +0800 Subject: Documentation: cgroup-v2.rst: Correct number of stats entries Signed-off-by: Han Dapeng Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4ef890191196..b26b5274eaaf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1045,7 +1045,7 @@ All time durations are in microseconds. - user_usec - system_usec - and the following three when the controller is enabled: + and the following five when the controller is enabled: - nr_periods - nr_throttled -- cgit From 7f828eacc4bbfd3ceea8ea17051858262fe04122 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 3 Aug 2023 19:57:02 +0800 Subject: cgroup: fix obsolete function name in cgroup_destroy_locked() Since commit e76ecaeef65c ("cgroup: use cgroup_kn_lock_live() in other cgroup kernfs methods"), cgroup_kn_lock_live() is used in cgroup kernfs methods. Update corresponding comment. Signed-off-by: Miaohe Lin Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f9cde56cf947..85e723abd9e2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5871,7 +5871,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by + * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; -- cgit From e7e64a1bff12f212be12b048723718c2152c4489 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 7 Aug 2023 19:58:31 +0800 Subject: cgroup: clean up if condition in cgroup_pidlist_start() There's no need to use '<=' when knowing 'l->list[mid] != pid' already. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 83044312bc41..c487ffef6652 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) if (l->list[mid] == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (l->list[mid] < pid) index = mid + 1; else end = mid; -- cgit From 0437719c1a97791481c5fd59642494f2108701a8 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Mon, 7 Aug 2023 11:29:30 +0800 Subject: cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants The member variable bstat of the structure cgroup_rstat_cpu records the per-cpu time of the cgroup itself, but does not include the per-cpu time of its descendants. The per-cpu time including descendants is very useful for calculating the per-cpu usage of cgroups. Although we can indirectly obtain the total per-cpu time of the cgroup and its descendants by accumulating the per-cpu bstat of each descendant of the cgroup. But after a child cgroup is removed, we will lose its bstat information. This will cause the cumulative value to be non-monotonic, thus affecting the accuracy of cgroup per-cpu usage. So we add the subtree_bstat variable to record the total per-cpu time of this cgroup and its descendants, which is similar to "cpuacct.usage*" in cgroup v1. And this is also helpful for the migration from cgroup v1 to cgroup v2. After adding this variable, we can obtain the per-cpu time of cgroup and its descendants in user mode through eBPF/drgn, etc. And we are still trying to determine how to expose it in the cgroupfs interface. Suggested-by: Tejun Heo Signed-off-by: Hao Jia Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ kernel/cgroup/rstat.c | 12 ++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be..7a2862172f51 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -341,6 +341,20 @@ struct cgroup_rstat_cpu { */ struct cgroup_base_stat last_bstat; + /* + * This field is used to record the cumulative per-cpu time of + * the cgroup and its descendants. Currently it can be read via + * eBPF/drgn etc, and we are still trying to determine how to + * expose it in the cgroupfs interface. + */ + struct cgroup_base_stat subtree_bstat; + + /* + * Snapshots at the last reading. These are used to calculate the + * deltas to propagate to the per-cpu subtree_bstat. + */ + struct cgroup_base_stat last_subtree_bstat; + /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 2542c21b6b6d..d80d7a608141 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *prstatc; struct cgroup_base_stat delta; unsigned seq; @@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* propagate percpu delta to global */ + /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); + cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); - /* propagate global delta to parent (unless that's root) */ + /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); + + delta = rstatc->subtree_bstat; + prstatc = cgroup_rstat_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); + cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); } } -- cgit From 82b90b6c5b38e457c7081d50dff11ecbafc1e61a Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 10 Aug 2023 11:25:28 +0000 Subject: cgroup:namespace: Remove unused cgroup_namespaces_init() cgroup_namspace_init() just return 0. Therefore, there is no need to call it during start_kernel. Just remove it. Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Signed-off-by: Lu Jialin Signed-off-by: Tejun Heo --- kernel/cgroup/namespace.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 0d5c29879a50..144a464e45c6 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = { .install = cgroupns_install, .owner = cgroupns_owner, }; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); -- cgit From 78d44b824ed04dd1553c55c5b839c9a55cbcaf4e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 17 Aug 2023 11:19:13 -0600 Subject: cgroup: Avoid -Wstringop-overflow warnings Change the notation from pointer-to-array to pointer-to-pointer. With this, we avoid the compiler complaining about trying to access a region of size zero as an argument during function calls. This is a workaround to prevent the compiler complaining about accessing an array of size zero when evaluating the arguments of a couple of function calls. See below: kernel/cgroup/cgroup.c: In function 'find_css_set': kernel/cgroup/cgroup.c:1206:16: warning: 'find_existing_css_set' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] 1206 | cset = find_existing_css_set(old_cset, cgrp, template); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/cgroup/cgroup.c:1206:16: note: referencing argument 3 of type 'struct cgroup_subsys_state *[0]' kernel/cgroup/cgroup.c:1071:24: note: in a call to function 'find_existing_css_set' 1071 | static struct css_set *find_existing_css_set(struct css_set *old_cset, | ^~~~~~~~~~~~~~~~~~~~~ With the change to pointer-to-pointer, the functions are not prevented from being executed, and they will do what they have to do when CGROUP_SUBSYS_COUNT == 0. Address the following -Wstringop-overflow warnings seen when built with ARM architecture and aspeed_g4_defconfig configuration (notice that under this configuration CGROUP_SUBSYS_COUNT == 0): kernel/cgroup/cgroup.c:1208:16: warning: 'find_existing_css_set' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:1258:15: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6089:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6153:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] This results in no differences in binary output. Link: https://github.com/KSPP/linux/issues/316 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 85e723abd9e2..2604db9cb13a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -907,7 +907,7 @@ static void css_set_move_task(struct task_struct *task, #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; @@ -1048,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset, */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) + struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; -- cgit