diff options
Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r-- | kernel/sched/topology.c | 280 |
1 files changed, 163 insertions, 117 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..b958fe48e020 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -215,8 +212,6 @@ static bool sched_energy_update; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; int i; /* EAS is enabled for asymmetric CPU capacity topologies. */ @@ -251,25 +246,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask) return false; } - /* Do not attempt EAS if schedutil is not being used. 
*/ - for_each_cpu(i, cpu_mask) { - policy = cpufreq_cpu_get(i); - if (!policy) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", - cpumask_pr_args(cpu_mask), i); - } - return false; - } - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_mask)); - } - return false; + if (!cpufreq_ready_for_eas(cpu_mask)) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", + cpumask_pr_args(cpu_mask)); } + return false; } return true; @@ -285,7 +267,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -static int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -312,7 +294,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } -static struct ctl_table sched_energy_aware_sysctls[] = { +static const struct ctl_table sched_energy_aware_sysctls[] = { { .procname = "sched_energy_aware", .data = &sysctl_sched_energy_aware, @@ -322,7 +304,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) @@ -502,7 +483,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rd yet then + * If we don't want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -517,6 +498,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); + /* + * Because the rq is not a task, dl_add_task_root_domain() did not + * move the fair server bw to the rd if it already 
started. + * Add it now. + */ + if (rq->fair_server.dl_server) + __dl_server_attach_root(&rq->fair_server, rq); + rq_unlock_irqrestore(rq, &rf); if (old_rd) @@ -553,7 +542,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -1177,7 +1166,7 @@ fail: * uniquely identify each group (for a given domain): * * - The first is the balance_cpu (see should_we_balance() and the - * load-balance blub in fair.c); for each group we only want 1 CPU to + * load-balance blurb in fair.c); for each group we only want 1 CPU to * continue balancing at a higher domain. * * - The second is the sched_group_capacity; we want all identical groups @@ -1329,14 +1318,63 @@ next: update_group_capacity(sd, cpu); } -/* - * Asymmetric CPU capacity bits - */ -struct asym_cap_data { - struct list_head link; - unsigned long capacity; - unsigned long cpus[]; -}; +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. 
+ */ + WARN_ON_ONCE(sd->flags & SD_OVERLAP); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + +#endif /* CONFIG_SMP */ /* * Set of available CPUs grouped by their corresponding capacities @@ -1344,9 +1382,7 @@ struct asym_cap_data { * capacity. * The lifespan of data is unlimited. */ -static LIST_HEAD(asym_cap_list); - -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) +LIST_HEAD(asym_cap_list); /* * Verify whether there is any CPU capacity asymmetry in a given sched domain. @@ -1386,21 +1422,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span, } +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + static inline void asym_cpu_capacity_update_data(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - struct asym_cap_data *entry = NULL; + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + /* + * Search if capacity already exists. If not, track the entry + * after which we should insert to keep the list ordered descending.
+ */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); } entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return; entry->capacity = capacity; - list_add(&entry->link, &asym_cap_list); + + /* If NULL then the new capacity is the smallest, add last. */ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); done: __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); } @@ -1423,8 +1477,8 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry_safe(entry, next, &asym_cap_list, link) { if (cpumask_empty(cpu_capacity_span(entry))) { - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1434,8 +1488,8 @@ static void asym_cpu_capacity_scan(void) */ if (list_is_singular(&asym_cap_list)) { entry = list_first_entry(&asym_cap_list, typeof(*entry), link); - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1468,7 +1522,7 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; - if (sd->level > request) { + if (sd->level >= request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } @@ -1621,9 +1675,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -1847,7 +1899,7 @@ void sched_init_numa(int offline_node) struct cpumask ***masks; /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * O(nr_nodes^2) de-duplicating selection sort -- in order to 
find the * unique distances in the node_distance() table. */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); @@ -2089,7 +2141,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; @@ -2263,9 +2315,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2324,10 +2374,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -2342,37 +2390,54 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i; + struct sched_domain_topology_level *tl; + struct cpumask *covered, *id_seen; + int cpu; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + id_seen = sched_domains_tmpmask2; - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. 
Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu(i, cpu_map) { - if (i == cpu) + for_each_sd_topology(tl) { + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) continue; + + cpumask_clear(covered); + cpumask_clear(id_seen); + /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. */ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(cpu); + int id; + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl_cpu_mask); + + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; + + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); + } + } + } return true; } @@ -2405,9 +2470,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; @@ -2421,6 +2483,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + if 
(WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { @@ -2507,16 +2572,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - unsigned long capacity; - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - capacity = arch_scale_cpu_capacity(i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, capacity); - cpu_attach_domain(sd, d.rd, i); if (lowest_flag_domain(i, SD_CLUSTER)) @@ -2530,10 +2588,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_cluster) static_branch_inc_cpuslocked(&sched_cluster_active); - if (rq && sched_debug_verbose) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: @@ -2681,7 +2737,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2713,19 +2769,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. 
It - * will be recomputed in function - * update_tasks_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2755,7 +2800,7 @@ match2: } #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - /* Build perf. domains: */ + /* Build perf domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && @@ -2764,7 +2809,7 @@ match2: goto match3; } } - /* No match - add perf. domains for a new rd */ + /* No match - add perf domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]); match3: ; @@ -2782,6 +2827,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2790,7 +2836,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } |