From 959ab6350add903e352890af53e86663739fcb9a Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Sun, 4 Aug 2024 21:30:15 -0400 Subject: cgroup/cpuset: fix panic caused by partcmd_update We find a bug as below: BUG: unable to handle page fault for address: 00000003 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 358 Comm: bash Tainted: G W I 6.6.0-10893-g60d6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/4 RIP: 0010:partition_sched_domains_locked+0x483/0x600 Code: 01 48 85 d2 74 0d 48 83 05 29 3f f8 03 01 f3 48 0f bc c2 89 c0 48 9 RSP: 0018:ffffc90000fdbc58 EFLAGS: 00000202 RAX: 0000000100000003 RBX: ffff888100b3dfa0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000002fe80 RBP: ffff888100b3dfb0 R08: 0000000000000001 R09: 0000000000000000 R10: ffffc90000fdbcb0 R11: 0000000000000004 R12: 0000000000000002 R13: ffff888100a92b48 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f44a5425740(0000) GS:ffff888237d80000(0000) knlGS:0000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000100030973 CR3: 000000010722c000 CR4: 00000000000006e0 Call Trace: ? show_regs+0x8c/0xa0 ? __die_body+0x23/0xa0 ? __die+0x3a/0x50 ? page_fault_oops+0x1d2/0x5c0 ? partition_sched_domains_locked+0x483/0x600 ? search_module_extables+0x2a/0xb0 ? search_exception_tables+0x67/0x90 ? kernelmode_fixup_or_oops+0x144/0x1b0 ? __bad_area_nosemaphore+0x211/0x360 ? up_read+0x3b/0x50 ? bad_area_nosemaphore+0x1a/0x30 ? exc_page_fault+0x890/0xd90 ? __lock_acquire.constprop.0+0x24f/0x8d0 ? __lock_acquire.constprop.0+0x24f/0x8d0 ? asm_exc_page_fault+0x26/0x30 ? partition_sched_domains_locked+0x483/0x600 ? partition_sched_domains_locked+0xf0/0x600 rebuild_sched_domains_locked+0x806/0xdc0 update_partition_sd_lb+0x118/0x130 cpuset_write_resmask+0xffc/0x1420 cgroup_file_write+0xb2/0x290 kernfs_fop_write_iter+0x194/0x290 new_sync_write+0xeb/0x160 vfs_write+0x16f/0x1d0 ksys_write+0x81/0x180 __x64_sys_write+0x21/0x30 x64_sys_call+0x2f25/0x4630 do_syscall_64+0x44/0xb0 entry_SYSCALL_64_after_hwframe+0x78/0xe2 RIP: 0033:0x7f44a553c887 It can be reproduced with cammands: cd /sys/fs/cgroup/ mkdir test cd test/ echo +cpuset > ../cgroup.subtree_control echo root > cpuset.cpus.partition cat /sys/fs/cgroup/cpuset.cpus.effective 0-3 echo 0-3 > cpuset.cpus // taking away all cpus from root This issue is caused by the incorrect rebuilding of scheduling domains. In this scenario, test/cpuset.cpus.partition should be an invalid root and should not trigger the rebuilding of scheduling domains. When calling update_parent_effective_cpumask with partcmd_update, if newmask is not null, it should recheck newmask whether there are cpus is available for parect/cs that has tasks. Fixes: 0c7f293efc87 ("cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 40ec4abaf440..a9b6d56eeffa 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1991,6 +1991,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ + nocpu |= tasks_nocpu_error(parent, cs, newmask); /* * partcmd_update with newmask: -- cgit From 311a1bdc44a8e06024df4fd3392be0dfc8298655 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:16 -0400 Subject: cgroup/cpuset: Clear effective_xcpus on cpus_allowed clearing only if cpus.exclusive not set Commit e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") adds a user writable cpuset.cpus.exclusive file for setting exclusive CPUs to be used for the creation of partitions. Since then effective_xcpus depends on both the cpuset.cpus and cpuset.cpus.exclusive setting. If cpuset.cpus.exclusive is set, effective_xcpus will depend only on cpuset.cpus.exclusive. When it is not set, effective_xcpus will be set according to the cpuset.cpus value when the cpuset becomes a valid partition root. When cpuset.cpus is being cleared by the user, effective_xcpus should only be cleared when cpuset.cpus.exclusive is not set. However, that is not currently the case. # cd /sys/fs/cgroup/ # mkdir test # echo +cpuset > cgroup.subtree_control # cd test # echo 3 > cpuset.cpus.exclusive # cat cpuset.cpus.exclusive.effective 3 # echo > cpuset.cpus # cat cpuset.cpus.exclusive.effective // was cleared Fix it by clearing effective_xcpus only if cpuset.cpus.exclusive is not set. Fixes: e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") Cc: stable@vger.kernel.org # v6.7+ Reported-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a9b6d56eeffa..97d02612b3a6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2525,7 +2525,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); - cpumask_clear(trialcs->effective_xcpus); + if (cpumask_empty(trialcs->exclusive_cpus)) + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) -- cgit From ff0ce721ec213499ec5a532041fb3a1db2dc5ecb Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:17 -0400 Subject: cgroup/cpuset: Eliminate unncessary sched domains rebuilds in hotplug It was found that some hotplug operations may cause multiple rebuild_sched_domains_locked() calls. Some of those intermediate calls may use cpuset states not in the final correct form leading to incorrect sched domain setting. Fix this problem by using the existing force_rebuild flag to inhibit immediate rebuild_sched_domains_locked() calls if set and only doing one final call at the end. Also renaming the force_rebuild flag to force_sd_rebuild to make its meaning for clear. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 97d02612b3a6..4bd9e50bcc8e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -232,6 +232,13 @@ static cpumask_var_t isolated_cpus; /* List of remote partition root children */ static struct list_head remote_children; +/* + * A flag to force sched domain rebuild at the end of an operation while + * inhibiting it in the intermediate stages when set. Currently it is only + * set in hotplug code. + */ +static bool force_sd_rebuild; + /* * Partition root states: * @@ -1475,7 +1482,7 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - if (rebuild_domains) + if (rebuild_domains && !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -1833,7 +1840,7 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, remote_partition_disable(child, tmp); disable_cnt++; } - if (disable_cnt) + if (disable_cnt && !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -2442,7 +2449,8 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) && + !force_sd_rebuild) rebuild_sched_domains_locked(); } @@ -3104,7 +3112,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed && + !force_sd_rebuild) rebuild_sched_domains_locked(); if (spread_flag_changed) @@ -4501,11 +4510,9 @@ hotplug_update_tasks(struct cpuset *cs, update_tasks_nodemask(cs); } -static bool force_rebuild; - void cpuset_force_rebuild(void) { - force_rebuild = true; + force_sd_rebuild = true; } /** @@ -4653,15 +4660,9 @@ static void cpuset_handle_hotplug(void) !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - /* - * In the rare case that hotplug removes all the cpus in - * subpartitions_cpus, we assumed that cpus are updated. - */ - if (!cpus_updated && !cpumask_empty(subpartitions_cpus)) - cpus_updated = true; - /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { + cpuset_force_rebuild(); spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); @@ -4717,8 +4718,8 @@ static void cpuset_handle_hotplug(void) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; + if (force_sd_rebuild) { + force_sd_rebuild = false; rebuild_sched_domains_cpuslocked(); } -- cgit