summaryrefslogtreecommitdiff
path: root/kernel/sched/topology.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r--kernel/sched/topology.c425
1 files changed, 284 insertions, 141 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..cf643a5ddedd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,16 +3,24 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+ mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+ mutex_unlock(&sched_domains_mutex);
+}
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
-#ifdef CONFIG_SCHED_DEBUG
-
static int __init sched_debug_setup(char *str)
{
sched_debug_verbose = true;
@@ -81,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -94,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -151,15 +159,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
}
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_verbose 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
@@ -215,8 +214,6 @@ static bool sched_energy_update;
static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
{
bool any_asym_capacity = false;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
int i;
/* EAS is enabled for asymmetric CPU capacity topologies. */
@@ -251,25 +248,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
return false;
}
- /* Do not attempt EAS if schedutil is not being used. */
- for_each_cpu(i, cpu_mask) {
- policy = cpufreq_cpu_get(i);
- if (!policy) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
- cpumask_pr_args(cpu_mask), i);
- }
- return false;
- }
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_mask));
- }
- return false;
+ if (!cpufreq_ready_for_eas(cpu_mask)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+ cpumask_pr_args(cpu_mask));
}
+ return false;
}
return true;
@@ -312,7 +296,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table sched_energy_aware_sysctls[] = {
+static const struct ctl_table sched_energy_aware_sysctls[] = {
{
.procname = "sched_energy_aware",
.data = &sysctl_sched_energy_aware,
@@ -331,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void)
}
late_initcall(sched_energy_aware_sysctl_init);
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
static void free_pd(struct perf_domain *pd)
{
@@ -467,9 +451,9 @@ free:
return false;
}
-#else
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
static void free_rootdomain(struct rcu_head *rcu)
{
@@ -560,7 +544,7 @@ static int init_rootdomain(struct root_domain *rd)
rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
- rd->visit_gen = 0;
+ rd->visit_cookie = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -1336,6 +1320,60 @@ next:
update_group_capacity(sd, cpu);
}
+/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
+void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+ int asym_prefer_cpu = cpu;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg;
+ int group_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ continue;
+
+ /*
+ * Groups of overlapping domain are replicated per NUMA
+ * node and will require updating "asym_prefer_cpu" on
+ * each local copy.
+ *
+ * If you are hitting this warning, consider moving
+ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
+ * which is shared by all the overlapping groups.
+ */
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
+
+ sg = sd->groups;
+ if (cpu != sg->asym_prefer_cpu) {
+ /*
+ * Since the parent is a superset of the current group,
+ * if the cpu is not the "asym_prefer_cpu" at the
+ * current level, it cannot be the preferred CPU at a
+ * higher levels either.
+ */
+ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu))
+ return;
+
+ WRITE_ONCE(sg->asym_prefer_cpu, cpu);
+ continue;
+ }
+
+ /* Ranking has improved; CPU is still the preferred one. */
+ if (new_prio >= old_prio)
+ continue;
+
+ for_each_cpu(group_cpu, sched_group_span(sg)) {
+ if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
+ asym_prefer_cpu = group_cpu;
+ }
+
+ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
+ }
+}
+
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -1552,13 +1590,19 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
+static int sched_numa_node_levels;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
-#endif
+#endif /* CONFIG_NUMA */
/*
* SD_flags allowed in topology descriptions.
@@ -1594,14 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1632,16 +1669,20 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
-#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
-#endif
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1676,7 +1717,7 @@ sd_init(struct sched_domain_topology_level *tl,
SD_WAKE_AFFINE);
}
-#endif
+#endif /* CONFIG_NUMA */
} else {
sd->cache_nice_tries = 1;
}
@@ -1696,22 +1737,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
@@ -1732,10 +1814,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -1772,10 +1858,10 @@ bool find_numa_distance(int distance)
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
+ distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1851,14 +1937,34 @@ static void init_numa_topology_type(int offline_node)
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(int offline_node)
+/*
+ * An architecture could modify its NUMA distance, to change
+ * grouping of NUMA nodes and number of NUMA levels when creating
+ * NUMA level sched domains.
+ *
+ * A NUMA level is created for each unique
+ * arch_sched_node_distance.
+ */
+static int numa_node_dist(int i, int j)
{
- struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+ __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+ return numa_node_dist != arch_sched_node_distance;
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+ int **dist, int *levels)
+{
+ unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
- struct cpumask ***masks;
/*
* O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1866,17 +1972,16 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
- return;
+ return -ENOMEM;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
- int distance = node_distance(i, j);
+ int distance = n_dist(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
- bitmap_free(distance_map);
- return;
+ return -EINVAL;
}
bitmap_set(distance_map, distance, 1);
@@ -1889,18 +1994,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!distances) {
- bitmap_free(distance_map);
- return;
- }
+ if (!distances)
+ return -ENOMEM;
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
- rcu_assign_pointer(sched_domains_numa_distance, distances);
+ *dist = distances;
+ *levels = nr_levels;
- bitmap_free(distance_map);
+ return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ int nr_levels, nr_node_levels;
+ int i, j;
+ int *distances, *domain_distances;
+ struct cpumask ***masks;
+
+ /* Record the NUMA distances from SLIT table */
+ if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+ &nr_node_levels))
+ return;
+
+ /* Record modified NUMA distances for building sched domains */
+ if (modified_sched_node_distance()) {
+ if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+ &domain_distances, &nr_levels)) {
+ kfree(distances);
+ return;
+ }
+ } else {
+ domain_distances = distances;
+ nr_levels = nr_node_levels;
+ }
+ rcu_assign_pointer(sched_numa_node_distance, distances);
+ WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+ WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
/*
* 'nr_levels' contains the number of unique distances
@@ -1918,6 +2051,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
+ rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
sched_domains_numa_levels = 0;
masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -1943,10 +2078,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;
for_each_cpu_node_but(k, offline_node) {
- if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ if (sched_debug() &&
+ (arch_sched_node_distance(j, k) !=
+ arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
- if (node_distance(j, k) > sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, k) >
+ sched_domains_numa_distance[i])
continue;
cpumask_or(mask, mask, cpumask_of_node(k));
@@ -1972,30 +2110,20 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
}
sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
init_numa_topology_type(offline_node);
}
@@ -2003,14 +2131,18 @@ void sched_init_numa(int offline_node)
static void sched_reset_numa(void)
{
- int nr_levels, *distances;
+ int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;
nr_levels = sched_domains_numa_levels;
+ sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
- distances = sched_domains_numa_distance;
+ distances = sched_numa_node_distance;
+ if (sched_numa_node_distance != sched_domains_numa_distance)
+ dom_distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2019,6 +2151,7 @@ static void sched_reset_numa(void)
synchronize_rcu();
kfree(distances);
+ kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;
@@ -2065,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;
/* Set ourselves in the remote node's masks */
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, node) <=
+ sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
@@ -2103,7 +2237,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
for (i = 0; i < sched_domains_numa_levels; i++) {
if (!masks[i][j])
break;
- cpu = cpumask_any_and(cpus, masks[i][j]);
+ cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
if (cpu < nr_cpu_ids) {
found = cpu;
break;
@@ -2174,6 +2308,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
@@ -2277,9 +2413,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
-#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
-#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
@@ -2301,7 +2435,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2338,10 +2472,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
-#endif
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -2356,35 +2488,58 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
- * any two given CPUs at this (non-NUMA) topology level.
+ * any two given CPUs on non-NUMA topology levels.
*/
-static bool topology_span_sane(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, int cpu)
+static bool topology_span_sane(const struct cpumask *cpu_map)
{
- int i = cpu + 1;
+ struct sched_domain_topology_level *tl;
+ struct cpumask *covered, *id_seen;
+ int cpu;
- /* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
- return true;
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+ id_seen = sched_domains_tmpmask2;
+
+ for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
+
+ /* NUMA levels are allowed to overlap */
+ if (tl_common_flags & SD_NUMA)
+ continue;
+
+ cpumask_clear(covered);
+ cpumask_clear(id_seen);
- /*
- * Non-NUMA levels cannot partially overlap - they must be either
- * completely equal or completely disjoint. Otherwise we can end up
- * breaking the sched_group lists - i.e. a later get_group() pass
- * breaks the linking done for an earlier span.
- */
- for_each_cpu_from(i, cpu_map) {
/*
- * We should 'and' all those masks with 'cpu_map' to exactly
- * match the topology we're about to build, but that can only
- * remove CPUs, which only lessens our ability to detect
- * overlaps
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
*/
- if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
- cpumask_intersects(tl->mask(cpu), tl->mask(i)))
- return false;
- }
+ for_each_cpu(cpu, cpu_map) {
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
+ int id;
+
+ /* lowest bit set in this mask is used as a unique id */
+ id = cpumask_first(tl_cpu_mask);
+ if (cpumask_test_cpu(id, id_seen)) {
+ /* First CPU has already been seen, ensure identical spans */
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
+ return false;
+ } else {
+ /* First CPU hasn't been seen before, ensure it's a completely new span */
+ if (cpumask_intersects(tl_cpu_mask, covered))
+ return false;
+
+ cpumask_or(covered, covered, tl_cpu_mask);
+ cpumask_set_cpu(id, id_seen);
+ }
+ }
+ }
return true;
}
@@ -2417,27 +2572,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = NULL;
for_each_sd_topology(tl) {
- if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
- goto error;
-
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}
+ if (WARN_ON(!topology_span_sane(cpu_map)))
+ goto error;
+
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
@@ -2684,7 +2837,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
@@ -2716,19 +2869,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j)) {
- struct root_domain *rd;
-
- /*
- * This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared. It
- * will be recomputed in function
- * update_tasks_root_domain().
- */
- rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
- dl_clear_root_domain(rd);
+ dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
- }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2785,6 +2927,7 @@ match3:
ndoms_cur = ndoms_new;
update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
}
/*
@@ -2793,7 +2936,7 @@ match3:
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
}