diff options
-rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 10 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 116 | ||||
-rwxr-xr-x | tools/testing/selftests/cgroup/test_cpuset_prs.sh | 74 |
3 files changed, 166 insertions, 34 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3f85254f3cef..cf5651a11df8 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2358,11 +2358,11 @@ Cpuset Interface Files partition or scheduling domain. The set of exclusive CPUs is determined by the value of its "cpuset.cpus.exclusive.effective". - When set to "isolated", the CPUs in that partition will - be in an isolated state without any load balancing from the - scheduler. Tasks placed in such a partition with multiple - CPUs should be carefully distributed and bound to each of the - individual CPUs for optimal performance. + When set to "isolated", the CPUs in that partition will be in + an isolated state without any load balancing from the scheduler + and excluded from the unbound workqueues. Tasks placed in such + a partition with multiple CPUs should be carefully distributed + and bound to each of the individual CPUs for optimal performance. A partition root ("root" or "isolated") can be in one of the two possible states - valid or invalid. An invalid partition diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 19c8779798fd..1bad4007ff4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,6 +25,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> +#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -43,6 +44,7 @@ #include <linux/sched/isolation.h> #include <linux/cgroup.h> #include <linux/wait.h> +#include <linux/workqueue.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -1444,25 +1446,31 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_add(int new_prs, struct cpuset *parent, +static bool partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; + if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - if (new_prs != parent->partition_root_state) + isolcpus_updated = (new_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; } /* @@ -1470,12 +1478,15 @@ static void partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_del(int old_prs, struct cpuset *parent, +static bool partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - if (old_prs != parent->partition_root_state) + isolcpus_updated = (old_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; +} + +static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +{ + int ret; + + lockdep_assert_cpus_held(); + + if (!isolcpus_updated) + return; + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); } /* @@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { + bool isolcpus_updated; + /* * The user must have sysadmin privilege. */ @@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, return 0; spin_lock_irq(&callback_lock); - partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); @@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, parent->child_ecpus_count--; } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; } @@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { + bool isolcpus_updated; + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); - partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_del(cs->partition_root_state, + NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; reset_partition_data(cs); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, { bool adding, deleting; int prs = cs->partition_root_state; + int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, spin_lock_irq(&callback_lock); if (adding) - partition_xcpus_add(prs, NULL, tmp->addmask); + isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - partition_xcpus_del(prs, NULL, tmp->delmask); + isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; struct cpumask *xcpus; /* cs effective_xcpus */ + int isolcpus_updated = 0; bool nocpu; lockdep_assert_held(&cpuset_mutex); @@ -2010,15 +2045,18 @@ write_error: * and vice versa. */ if (adding) - partition_xcpus_del(old_prs, parent, tmp->addmask); + isolcpus_updated += partition_xcpus_del(old_prs, parent, + tmp->addmask); if (deleting) - partition_xcpus_add(new_prs, parent, tmp->delmask); + isolcpus_updated += partition_xcpus_add(new_prs, parent, + tmp->delmask); if (is_partition_valid(parent)) { parent->nr_subparts += subparts_delta; WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive(cs, new_prs); @@ -3082,6 +3120,7 @@ out: else if (new_xcpus_state) partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); @@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void) force_rebuild = true; } +/* + * Attempt to acquire a cpus_read_lock while a hotplug operation may be in + * progress. + * Return: true if successful, false otherwise + * + * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ +static bool cpuset_hotplug_cpus_read_trylock(void) +{ + int retries = 0; + + while (!cpus_read_trylock()) { + /* + * CPU hotplug still in progress. Retry 5 times + * with a 10ms wait before bailing out. + */ + if (++retries > 5) + return false; + msleep(10); + } + return true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) bool cpus_updated; bool mems_updated; bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -4417,11 +4481,13 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL)) { + partition_is_populated(cs, NULL) && + cpuset_hotplug_cpus_read_trylock()) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); + cpus_read_unlock(); } /* @@ -4432,18 +4498,28 @@ retry: * partitions. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || - tasks_nocpu_error(parent, cs, &new_cpus))) { - update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); - compute_effective_cpumask(&new_cpus, cs, parent); - cpuset_force_rebuild(); - } + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) { - update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp); - if (is_partition_valid(cs)) { + else if (is_partition_valid(parent) && is_partition_invalid(cs)) + partcmd = partcmd_update; + + /* + * cpus_read_lock needs to be held before calling + * update_parent_effective_cpumask(). To avoid circular lock + * dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ + if (partcmd >= 0) { + if (!cpuset_hotplug_cpus_read_trylock()) + goto update_tasks; + + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + cpus_read_unlock(); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 2b825019f806..7b7c4c2b6d85 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -232,11 +232,11 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \ A1:P0,A2:P1,A3:P2,B1:P1 2-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3" " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,3" " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \ - A1:P0,A2:P2,A3:P1 2-4" + A1:P0,A2:P2,A3:P1 2-4,2-3" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ A1:P0,A2:P-2,A3:P-1" @@ -248,7 +248,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3" - " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3" + " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3," # An invalidated remote partition cannot self-recover from hotplug " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2" @@ -376,7 +376,7 @@ write_cpu_online() } fi echo $VAL > $CPUFILE - pause 0.01 + pause 0.05 } # @@ -508,12 +508,14 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions + ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" [[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)" [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" [[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)" + [[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)" done } @@ -591,11 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and compare that with the expected value. +# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# if available, and compare that with the expected value. # -# Note that a sched domain of just 1 CPU will be considered isolated. +# Note that isolated CPUs from the sched/domains context include offline +# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may +# not be included in the *cpuset.cpus.isolated control file which contains +# only CPUs in isolated partitions. # -# $1 - expected isolated cpu list +# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>} +# <isolcpus1> - expected sched/domains value +# <isolcpus2> - *cpuset.cpus.isolated value = <isolcpus1> if not defined # check_isolcpus() { @@ -603,8 +611,38 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains + ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + if [[ $EXPECT_VAL = . ]] + then + EXPECT_VAL= + EXPECT_VAL2= + elif [[ $(expr $EXPECT_VAL : ".*,.*") > 0 ]] + then + set -- $(echo $EXPECT_VAL | sed -e "s/,/ /g") + EXPECT_VAL=$1 + EXPECT_VAL2=$2 + else + EXPECT_VAL2=$EXPECT_VAL + fi + + # + # Check the debug isolated cpumask, if present + # + [[ -f $ISCPUS ]] && { + ISOLCPUS=$(cat $ISCPUS) + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 + ISOLCPUS=$(cat $ISCPUS) + } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= + } + + # + # Use the sched domain in debugfs to check isolated CPUs, if available + # [[ -d $SCHED_DOMAINS ]] || return 0 - [[ $EXPECT_VAL = . ]] && EXPECT_VAL= for ((CPU=0; CPU < $NR_CPUS; CPU++)) do @@ -649,6 +687,22 @@ test_fail() } # +# Check to see if there are unexpected isolated CPUs left +# +null_isolcpus_check() +{ + [[ $VERBOSE -gt 0 ]] || return 0 + pause 0.02 + check_isolcpus "." + if [[ $? -ne 0 ]] + then + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 + fi +} + +# # Run cpuset state transition test # $1 - test matrix name # @@ -733,6 +787,7 @@ run_state_test() echo "Effective cpus changed to $NEWLIST after test $I!" exit 1 } + null_isolcpus_check [[ $VERBOSE -gt 0 ]] && echo "Test $I done." ((I++)) done @@ -802,6 +857,7 @@ test_isolated() console_msg "Cleaning up" echo $$ > $CGROUP2/cgroup.procs [[ -d A1 ]] && rmdir A1 + null_isolcpus_check } # |