diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 7116 |
1 files changed, 4704 insertions, 2412 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04a3ce20da67..c798d2795243 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,23 +20,43 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include "sched.h" +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> +#include <linux/sched/prio.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/memory-tiers.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> +#include <linux/task_work.h> +#include <linux/rbtree_augmented.h> + +#include <asm/switch_to.h> + +#include <uapi/linux/sched/types.h> -/* - * Targeted preemption latency for CPU-bound tasks: - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. - * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +#include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * The initial- and re-scaling of tunables is configurable @@ -44,55 +64,26 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; * Options are: * * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; - -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - -/* - * SCHED_OTHER wake-up granularity. - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_base_slice = 750000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_thermal_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -113,6 +104,13 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) +/* + * The margin used when comparing CPU capacities. + * is 'cap1' noticeably greater than 'cap2' + * + * (default: ~5%) + */ +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -126,7 +124,44 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_NUMA_BALANCING +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +#endif + +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_fair_sysctls[] = { +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif /* CONFIG_NUMA_BALANCING */ +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@ -183,9 +218,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -229,27 +262,40 @@ static void __update_inv_weight(struct load_weight *lw) static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { u64 fact = scale_load_down(weight); + u32 fact_hi = (u32)(fact >> 32); int shift = WMULT_SHIFT; + int fs; __update_inv_weight(lw); - if (unlikely(fact >> 32)) { - while (fact >> 32) { - fact >>= 1; - shift--; - } + if (unlikely(fact_hi)) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } fact = mul_u32_u32(fact, lw->inv_weight); - while (fact >> 32) { - fact >>= 1; - shift--; + fact_hi = (u32)(fact >> 32); + if (fact_hi) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } return mul_u64_u32_shr(delta_exec, fact, shift); } +/* + * delta /= w + */ +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + + return delta; +} const struct sched_class fair_sched_class; @@ -258,46 +304,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -373,8 +384,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -391,7 +402,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -406,7 +417,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -443,40 +454,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* !CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) +static int tg_is_idle(struct task_group *tg) { - return container_of(se, struct task_struct, se); + return tg->idle > 0; } -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) { - return &task_rq(p)->cfs; + return cfs_rq->idle > 0; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static int se_is_idle(struct sched_entity *se) { - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); } -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} +#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} +#define for_each_sched_entity(se) \ + for (; se; se = NULL) static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { @@ -504,6 +502,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static inline int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return task_has_idle_policy(task_of(se)); +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -513,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -522,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -531,17 +544,222 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } -static void update_min_vruntime(struct cfs_rq *cfs_rq) +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return (s64)(se->vruntime - cfs_rq->min_vruntime); +} + +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + +/* + * Compute virtual time from the per-task service numbers: + * + * Fair schedulers conserve lag: + * + * \Sum lag_i = 0 + * + * Where lag_i is given by: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * Where S is the ideal service time and V is it's virtual time counterpart. + * Therefore: + * + * \Sum lag_i = 0 + * \Sum w_i * (V - v_i) = 0 + * \Sum w_i * V - w_i * v_i = 0 + * + * From which we can solve an expression for V in v_i (which we have in + * se->vruntime): + * + * \Sum v_i * w_i \Sum v_i * w_i + * V = -------------- = -------------- + * \Sum w_i W + * + * Specifically, this is the weighted average of all entity virtual runtimes. + * + * [[ NOTE: this is only equal to the ideal scheduler under the condition + * that join/leave operations happen at lag_i = 0, otherwise the + * virtual time has non-contiguous motion equivalent to: + * + * V +-= lag_i / W + * + * Also see the comment in place_entity() that deals with this. ]] + * + * However, since v_i is u64, and the multiplication could easily overflow + * transform it into a relative form that uses smaller quantities: + * + * Substitute: v_i == (v_i - v0) + v0 + * + * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i + * V = ---------------------------- = --------------------- + v0 + * W W + * + * Which we track using: + * + * v0 := cfs_rq->min_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum w_i := cfs_rq->avg_load + * + * Since min_vruntime is a monotonic increasing variable that closely tracks + * the per-task service, these deltas: (v_i - v), will be in the order of the + * maximal (virtual) lag induced in the system due to quantisation. + * + * Also, we use scale_load_down() to reduce the size. + * + * As measured, the max (key * weight) value was ~44 bits for a kernel build. + */ +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_load -= weight; +} + +static inline +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +{ + /* + * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + */ + cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; +} + +/* + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * For this to be so, the result of this function must have a left bias. + */ +u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + if (load) { + /* sign flips effective floor / ceiling */ + if (avg < 0) + avg -= (load - 1); + avg = div_s64(avg, load); + } + return cfs_rq->min_vruntime + avg; +} + +/* + * lag_i = S - s_i = w_i * (V - v_i) + * + * However, since V is approximated by the weighted average of all entities it + * is possible -- by addition/removal/reweight to the tree -- to move V around + * and end up with a larger lag than we started with. + * + * Limit this to either double the slice length with a minimum of TICK_NSEC + * since that is the timing granularity. + * + * EEVDF gives the following limit for a steady state system: + * + * -r_max < lag < max(r_max, q) + * + * XXX could add max_slice to the augmented data to track this. + */ +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 vlag, limit; + + SCHED_WARN_ON(!se->on_rq); + + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + + se->vlag = clamp(vlag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * + * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * + * \Sum (v_i - v)*w_i + * V = ------------------ + v + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due + * to the loss in precision caused by the division. + */ +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime = cfs_rq->min_vruntime; + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) { + avg_vruntime_update(cfs_rq, delta); + min_vruntime = vruntime; + } + return min_vruntime; +} + +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -551,60 +769,108 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); - + if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); +} + +static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 min_slice = ~0ULL; + + if (curr && curr->on_rq) + min_slice = curr->slice; + + if (root) + min_slice = min(min_slice, root->min_slice); + + return min_slice; +} + +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) + +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; + } +} + +static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->min_slice < se->min_slice) + se->min_slice = rse->min_slice; + } } /* - * Enqueue an entity into the rb-tree: + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; + u64 old_min_vruntime = se->min_vruntime; + u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); + + se->min_slice = se->slice; + __min_slice_update(se, node->rb_right); + __min_slice_update(se, node->rb_left); - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + return se->min_vruntime == old_min_vruntime && + se->min_slice == old_min_slice; +} + +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; + se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_vruntime_cb); + avg_vruntime_sub(cfs_rq, se); +} + +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) @@ -614,17 +880,91 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } -static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * + * In order to provide latency guarantees for different request sizes + * EEVDF selects the best runnable task from two criteria: + * + * 1) the task must be eligible (must be owed service) + * + * 2) from those tasks that meet 1), we select the one + * with the earliest virtual deadline. + * + * We can do this in O(log n) time due to an augmented RB-tree. The + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: + * + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) + * + * Which allows tree pruning through eligibility. + */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { - struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; - if (!next) - return NULL; + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_queued == 1) + return curr && curr->on_rq ? curr : se; + + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + + /* + * Once selected, run a task until it either becomes non-eligible or + * until it gets a new slice. See the HACK in set_next_entity(). + */ + if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + return curr; + + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } + + /* Heap search for the EEVD entity */ + while (node) { + struct rb_node *left = node->rb_left; + + /* + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. + */ + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; + continue; + } + + se = __node_2_se(node); + + /* + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. + */ + if (entity_eligible(cfs_rq, se)) { + best = se; + break; + } + + node = node->rb_right; + } +found: + if (!best || (curr && entity_before(curr, best))) + best = curr; - return rb_entry(next, struct sched_entity, run_node); + return best; } #ifdef CONFIG_SCHED_DEBUG @@ -635,99 +975,55 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** * Scheduling class statistics methods: */ - -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SMP +int sched_update_scaling(void) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); - if (ret || !write) - return ret; - - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_base_slice); #undef WRT_SYSCTL return 0; } #endif +#endif -/* - * delta /= w - */ -static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - if (unlikely(nr_running > sched_nr_latency)) - return nr_running * sysctl_sched_min_granularity; - else - return sysctl_sched_latency; -} +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); /* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + if ((s64)(se->vruntime - se->deadline) < 0) + return false; - if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. + */ + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = __calc_delta(slice, se->load.weight, load); - } - return slice; -} + /* + * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); + /* + * The task has consumed its request, reschedule. + */ + return true; } #include "pelt.h" @@ -753,16 +1049,15 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -792,20 +1087,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -821,7 +1102,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -836,181 +1129,204 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +{ + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - curr->exec_start; + if (unlikely(delta_exec <= 0)) + return delta_exec; + + curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; + + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } + + return delta_exec; +} + +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); +} + +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (curr->vlag == curr->deadline) + return false; + + return !entity_eligible(cfs_rq, curr); +} + +static inline bool do_preempt_short(struct cfs_rq *cfs_rq, + struct sched_entity *pse, struct sched_entity *se) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (pse->slice >= se->slice) + return false; + + if (!entity_eligible(cfs_rq, pse)) + return false; + + if (entity_before(pse, se)) + return true; + + if (!entity_eligible(cfs_rq, se)) + return true; + + return false; +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *donor = rq->donor; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) + update_curr_task(donor, delta_exec); + + return delta_exec; +} + /* * Update the current task's runtime statistics. */ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; + struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; + bool resched; if (unlikely(!curr)) return; - delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) return; - curr->exec_start = now; - - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); - curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); + struct task_struct *p = task_of(curr); - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); + update_curr_task(p, delta_exec); + + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. + */ + if (dl_server_active(&rq->fair_server)) + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (cfs_rq->nr_queued == 1) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { + resched_curr_lazy(rq); + clear_buddies(cfs_rq, curr); + } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start, prev_wait_start; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + if (entity_is_task(se)) + p = task_of(se); - __schedstat_set(se->statistics.wait_start, wait_start); + __update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; - u64 delta; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + /* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario. */ - if (unlikely(!schedstat_val(se->statistics.wait_start))) + if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* - * Preserve migrating task's wait time so wait_start - * time stamp can be adjusted to accumulate wait time - * prior to migration. - */ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); if (entity_is_task(se)) tsk = task_of(se); - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } + __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) return; @@ -1020,14 +1336,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); + update_stats_wait_start_fair(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) - update_stats_enqueue_sleeper(cfs_rq, se); + update_stats_enqueue_sleeper_fair(cfs_rq, se); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) @@ -1038,16 +1354,19 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * waiting task: */ if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); + unsigned int state; - if (tsk->state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); - if (tsk->state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -1068,6 +1387,50 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +static inline bool is_core_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + int sibling; + + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; + + if (!idle_cpu(sibling)) + return false; + } +#endif + + return true; +} + +#ifdef CONFIG_NUMA +#define NUMA_IMBALANCE_MIN 2 + +static inline long +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) +{ + /* + * Allow a NUMA imbalance if busy CPUs is less than the maximum + * threshold. Above this threshold, individual tasks may be contending + * for both memory bandwidth and any shared HT resources. This is an + * approximation as the number of running tasks may not be related to + * the number of busy CPUs due to sched_setaffinity. + */ + if (dst_running > imb_numa_nr) + return imbalance; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the destination is lightly loaded. + */ + if (imbalance <= NUMA_IMBALANCE_MIN) + return 0; + + return imbalance; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is @@ -1083,6 +1446,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + struct numa_group { refcount_t refcount; @@ -1095,11 +1461,12 @@ struct numa_group { unsigned long total_faults; unsigned long max_faults_cpu; /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults. */ - unsigned long *faults_cpu; unsigned long faults[]; }; @@ -1110,7 +1477,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); } static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -1140,7 +1507,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) return rss / nr_scan_pages; } -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ #define MAX_SCAN_WINDOW 2560 static unsigned int task_scan_min(struct task_struct *p) @@ -1273,8 +1640,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + - group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; } static inline unsigned long group_faults_priv(struct numa_group *ng) @@ -1315,10 +1682,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1327,9 +1694,11 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -1339,7 +1708,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1349,8 +1718,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1368,8 +1736,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1423,15 +1791,169 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + promo_wmark_pages(zone) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct folio *folio) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = folio_xchg_access_time(folio, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * Cannot migrate to memoryless nodes. + */ + if (!node_state(dst_nid, N_MEMORY)) + return false; + + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (folio_use_access_time(folio)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(folio); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + folio_nr_pages(folio)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; /* * Allow first faults or private faults to migrate immediately early in @@ -1523,28 +2045,12 @@ struct numa_stats { int idle_cpu; }; -static inline bool is_core_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - int sibling; - - for_each_cpu(sibling, cpu_smt_mask(cpu)) { - if (cpu == sibling) - continue; - - if (!idle_cpu(cpu)) - return false; - } -#endif - - return true; -} - struct task_numa_env { struct task_struct *p; int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1558,9 +2064,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1581,11 +2084,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -1625,11 +2128,11 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); - ns->util += cpu_util(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->util += cpu_util_cfs(cpu); + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1661,7 +2164,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -1801,6 +2304,15 @@ static bool task_numa_compare(struct task_numa_env *env, */ cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { + /* + * Do not swap within a group or between tasks that have + * no group if there is spare capacity. Swapping does + * not address the load imbalance and helps one task at + * the cost of punishing another. + */ + if (env->dst_stats.node_type == node_has_spare) + goto unlock; + imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* @@ -1941,7 +2453,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -2006,8 +2518,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -2042,7 +2556,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2130,7 +2644,7 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find out how many nodes on the workload is actively running on. Do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. @@ -2140,13 +2654,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2184,7 +2698,7 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest + * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower @@ -2300,7 +2814,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2319,7 +2833,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2441,7 +2955,7 @@ static void task_numa_placement(struct task_struct *p) * is at the beginning of the numa_faults array. */ ng->faults[mem_idx] += diff; - ng->faults_cpu[mem_idx] += f_diff; + ng->faults[cpu_idx] += f_diff; ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } @@ -2458,6 +2972,9 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + max_nid = numa_nearest_node(max_nid, N_CPU); + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2495,7 +3012,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + - 4*nr_node_ids*sizeof(unsigned long); + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -2506,9 +3024,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; - /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * - nr_node_ids; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2565,7 +3080,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2592,7 +3107,7 @@ no_join: } /* - * Get rid of NUMA staticstics associated with a task (either current or dead). + * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should @@ -2650,6 +3165,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2720,6 +3244,45 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long pids; + /* + * Allow unconditional access first two times, so that all the (pages) + * of VMAs get prot_none fault introduced irrespective of accesses. + * This is also done to avoid any side effect of task scanning + * amplifying the unfairness of disjoint set of VMAs' access. + */ + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) + return true; + + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + /* + * This vma has not been accessed for a while, and if the number + * the threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + + return false; +} + +#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -2734,6 +3297,9 @@ static void task_numa_work(struct callback_head *work) unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2767,7 +3333,7 @@ static void task_numa_work(struct callback_head *work) } next_scan = now + msecs_to_jiffies(p->numa_scan_period); - if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) return; /* @@ -2776,7 +3342,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -2786,34 +3351,118 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } + + /* Initialise new per-VMA NUMAB state. */ + if (!vma->numab_state) { + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) + continue; + + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); + continue; + } + + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + + vma->numab_state->next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + + /* Reset happens after 4 times scan delay of scan start */ + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; + } + + /* + * Scanning the VMAs of short lived tasks add more overhead. So + * delay the scan for new VMAs. + */ + if (mm->numa_scan_seq && time_before(jiffies, + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); + continue; + } + + /* RESET access PIDs regularly for old VMAs. */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; + } + + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); + continue; + } + + /* + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. + */ + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; + } do { start = max(start, vma->vm_start); @@ -2824,7 +3473,7 @@ static void task_numa_work(struct callback_head *work) /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -2839,6 +3488,26 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; + } + + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; } out: @@ -2881,9 +3550,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_migrate_retry = 0; /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -2921,7 +3593,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3008,7 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } static void @@ -3021,7 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; + cfs_rq->nr_queued--; } /* @@ -3085,6 +3757,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3093,17 +3768,33 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { + bool curr = cfs_rq->curr == se; + if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; + se->rel_deadline = 1; + if (!curr) + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * se->load.weight, weight); + if (se->rel_deadline) + se->deadline = div_s64(se->deadline * se->load.weight, weight); + update_load_set(&se->load, weight); #ifdef CONFIG_SMP @@ -3115,22 +3806,36 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) + if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); + place_entity(cfs_rq, se, 0); + if (!curr) + __enqueue_entity(cfs_rq, se); + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); + } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, + const struct load_weight *lw) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct load_weight *load = &se->load; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); - load->inv_weight = sched_prio_to_wmult[prio]; + reweight_entity(cfs_rq, se, lw->weight); + load->inv_weight = lw->inv_weight; } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* @@ -3142,7 +3847,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -3156,7 +3861,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -3172,7 +3877,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -3191,7 +3896,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3241,8 +3946,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); - /* * Recomputes the group entity based on the current state of its group * runqueue. @@ -3252,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se) struct cfs_rq *gcfs_rq = group_cfs_rq(se); long shares; - if (!gcfs_rq) + /* + * When a group becomes empty, preserve its weight. This matters for + * DELAY_DEQUEUE. + */ + if (!gcfs_rq || !gcfs_rq->load.weight) return; if (throttled_hierarchy(gcfs_rq)) @@ -3260,14 +3967,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3293,14 +3997,84 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * - * See cpu_util(). + * See cpu_util_cfs(). */ cpufreq_update_util(rq, flags); } } #ifdef CONFIG_SMP +static inline bool load_avg_is_decayed(struct sched_avg *sa) +{ + if (sa->load_sum) + return false; + + if (sa->util_sum) + return false; + + if (sa->runnable_sum) + return false; + + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(sa->load_avg || + sa->util_avg || + sa->runnable_avg); + + return true; +} + +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs its to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). + */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + prev = rq->tmp_alone_branch; + } + + if (prev == &rq->leaf_cfs_rq_list) + return false; + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (!load_avg_is_decayed(&cfs_rq->avg)) + return false; + + if (child_cfs_rq_on_list(cfs_rq)) + return false; + + return true; +} + /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed @@ -3317,7 +4091,8 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3325,10 +4100,67 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* rq has been offline and doesn't contribute to the share anymore: */ + if (!cpu_active(cpu_of(rq_of(cfs_rq)))) + return; + + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. + */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; + } +} + +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) +{ + long delta; + u64 now; + + /* + * No need to update load_avg for root_task_group, as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + delta = 0 - cfs_rq->tg_load_avg_contrib; + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = 0; + cfs_rq->last_update_tg_load_avg = now; +} + +/* CPU offline callback: */ +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) +{ + struct task_group *tg; + + lockdep_assert_rq_held(rq); + + /* + * The rq clock has already been updated in + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + clear_tg_load_avg(cfs_rq); } + rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); } /* @@ -3355,32 +4187,13 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy = prev->load_last_update_time_copy; - n_last_update_time_copy = next->load_last_update_time_copy; - - smp_rmb(); - - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = cfs_rq_last_update_time(prev); + n_last_update_time = cfs_rq_last_update_time(next); - } while (p_last_update_time != p_last_update_time_copy || - n_last_update_time != n_last_update_time_copy); - } -#else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3448,15 +4261,14 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3465,23 +4277,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3492,11 +4311,16 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3532,7 +4356,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3549,16 +4373,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) + return; + + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3629,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} + static inline int propagate_entity_load_avg(struct sched_entity *se) { return 0; @@ -3638,18 +4470,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 throttled = 0, now, lut; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + if (load_avg_is_decayed(&se->avg)) + return; + + cfs_rq = cfs_rq_of(se); + rq = rq_of(cfs_rq); + + rcu_read_lock(); + is_idle = is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. + */ + if (!is_idle) + return; + + /* + * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: + * + * last_update_time (the cfs_rq's last_update_time) + * = cfs_rq_clock_pelt()@cfs_rq_idle + * = rq_clock_pelt()@cfs_rq_idle + * - cfs->throttled_clock_pelt_time@cfs_rq_idle + * + * cfs_idle_lag (delta between rq's update and cfs_rq's update) + * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle + * + * rq_idle_lag (delta between now and rq's update) + * = sched_clock_cpu() - rq_clock()@rq_idle + * + * We can then write: + * + * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + + * sched_clock_cpu() - rq_clock()@rq_idle + * Where: + * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle + * rq_clock()@rq_idle is rq->clock_idle + * cfs->throttled_clock_pelt_time@cfs_rq_idle + * is cfs_rq->throttled_pelt_idle + */ + +#ifdef CONFIG_CFS_BANDWIDTH + throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); + /* The clock has been stopped for throttling */ + if (throttled == U64_MAX) + return; +#endif + now = u64_u32_load(rq->clock_pelt_idle); + /* + * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case + * is observed the old clock_pelt_idle value and the new clock_idle, + * which lead to an underestimation. The opposite would lead to an + * overestimation. + */ + smp_rmb(); + lut = cfs_rq_last_update_time(cfs_rq); + + now -= throttled; + if (now < lut) + /* + * cfs_rq->avg.last_update_time is more recent than our + * estimation, let's use it. + */ + now = lut; + else + now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3675,14 +4589,31 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3695,12 +4626,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed |= __update_load_avg_cfs_rq(now, cfs_rq); - -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy = sa->last_update_time; -#endif - + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); return decayed; } @@ -3740,11 +4668,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@ -3772,8 +4700,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3788,6 +4723,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -3797,7 +4733,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -3817,6 +4753,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); @@ -3825,27 +4768,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } } -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. @@ -3870,8 +4792,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -3894,38 +4816,27 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long _task_util_est(struct task_struct *p) +static inline unsigned long task_runnable(struct task_struct *p) { - struct util_est ue = READ_ONCE(p->se.avg.util_est); - - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); + return READ_ONCE(p->se.avg.runnable_avg); } -static inline unsigned long task_util_est(struct task_struct *p) +static inline unsigned long _task_util_est(struct task_struct *p) { - return max(task_util(p), _task_util_est(p)); + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) -{ - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long task_util_est(struct task_struct *p) { - return task_util_est(p); + return max(task_util(p), _task_util_est(p)); } -#endif static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) @@ -3936,42 +4847,39 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued += _task_util_est(p); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } -/* - * Check if a (signed) value is within a specified (unsigned) margin, - * based on the observation that: - * - * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) - * - * NOTE: this only works when value + maring < INT_MAX. - */ -static inline bool within_margin(int value, int margin) -{ - return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); -} - -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) { - long last_ewma_diff; - struct util_est ue; - int cpu; + unsigned int enqueued; if (!sched_feat(UTIL_EST)) return; /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + enqueued = cfs_rq->avg.util_est; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); +} + +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) +{ + unsigned int ewma, dequeued, last_ewma_diff; + + if (!sched_feat(UTIL_EST)) + return; /* * Skip update of task's estimated utilization when the task has not @@ -3980,84 +4888,232 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!task_sleep) return; + /* Get current estimate of utilization */ + ewma = READ_ONCE(p->se.avg.util_est); + /* * If the PELT values haven't changed since enqueue time, * skip the util_est update. */ - ue = p->se.avg.util_est; - if (ue.enqueued & UTIL_AVG_UNCHANGED) + if (ewma & UTIL_AVG_UNCHANGED) return; + /* Get utilization at dequeue */ + dequeued = task_util(p); + /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); - if (sched_feat(UTIL_EST_FASTUP)) { - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; - goto done; - } + if (ewma <= dequeued) { + ewma = dequeued; + goto done; } /* - * Skip update of task's estimated utilization when its EWMA is + * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ - last_ewma_diff = ue.enqueued - ue.ewma; - if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) - return; + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample - * of the task size. This is done by storing the current PELT value - * as ue.enqueued and by using this value to update the Exponential - * Weighted Moving Average (EWMA): + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( last_ewma_diff ) + ewma(t-1) - * = w * (last_ewma_diff + ewma(t-1) / w) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) */ - ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; - ue.ewma += last_ewma_diff; - ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; done: - WRITE_ONCE(p->se.avg.util_est, ue); + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ewma); trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; + bool fits, uclamp_max_fits; + + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); + + if (!uclamp_is_used()) + return fits; + + /* + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for HW or cpufreq pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + */ + capacity_orig = arch_scale_cpu_capacity(cpu); + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. + * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * CPU0 CPU1 CPU2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * CPU0 CPU1 CPU2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) + return -1; + + return fits; +} + +static inline int task_fits_cpu(struct task_struct *p, int cpu) { - return fits_capacity(uclamp_task_util(p), capacity); + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return; + int cpu = cpu_of(rq); - if (!p) { - rq->misfit_task_load = 0; + if (!sched_asym_cpucap_active()) return; - } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { + rq->misfit_task_load = 0; return; } @@ -4071,9 +5127,15 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) #else /* CONFIG_SMP */ +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return !cfs_rq->nr_queued; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4087,7 +5149,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4096,178 +5158,210 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; + struct sched_entity *se = &p->se; - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq->nr_spread_over); -#endif + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } } static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vruntime = cfs_rq->min_vruntime; + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh = sysctl_sched_latency; + lag = se->vlag; /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly + * evaporate. + * + * Lag is defined as: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * To avoid the 'w_i' term all over the place, we only track + * the virtual lag: + * + * vl_i = V - v_i <=> v_i = V - vl_i + * + * And we take V to be the weighted average of all v: + * + * V = (\Sum w_j*v_j) / W + * + * Where W is: \Sum w_j + * + * Then, the weighted average after adding an entity with lag + * vl_i is given by: + * + * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) + * = (W*V + w_i*(V - vl_i)) / (W + w_i) + * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) + * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = V - w_i*vl_i / (W + w_i) + * + * And the actual lag after adding an entity with vl_i is: + * + * vl'_i = V' - v_i + * = V - w_i*vl_i / (W + w_i) - (V - vl_i) + * = vl_i - w_i*vl_i / (W + w_i) + * + * Which is strictly less than vl_i. So in order to preserve lag + * we should inflate the lag before placement such that the + * effective lag after placement comes out right. + * + * As such, invert the above relation for vl'_i to get the vl_i + * we need to use such that the lag after placement is the lag + * we computed before dequeue. + * + * vl'_i = vl_i - w_i*vl_i / (W + w_i) + * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) + * + * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; + load = cfs_rq->avg_load; + if (curr && curr->on_rq) + load += scale_load_down(curr->load.weight); - vruntime -= thresh; + lag *= load + scale_load_down(se->load.weight); + if (WARN_ON_ONCE(!load)) + load = 1; + lag = div_s64(lag, load); } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); -} - -static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + se->vruntime = vruntime - lag; -static inline void check_schedstat_required(void) -{ -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) + if (se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; return; - - /* Force schedstat enabled if a dependent tracepoint is active */ - if (trace_sched_stat_wait_enabled() || - trace_sched_stat_sleep_enabled() || - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); } -#endif + + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. + */ + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; + + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ + se->deadline = se->vruntime + vslice; } -static inline bool cfs_bandwidth_used(void); +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -/* - * MIGRATION - * - * dequeue - * update_curr() - * update_min_vruntime() - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way the vruntime transition between RQs is done when both - * min_vruntime are up-to-date. - * - * WAKEUP (remote) - * - * ->migrate_task_rq_fair() (p->state == TASK_WAKING) - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way we don't have the most up-to-date min_vruntime on the originating - * CPU and an up-to-date min_vruntime on the destination CPU. - */ +static void +requeue_delayed_entity(struct sched_entity *se); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* * If we're the current task, we must renormalise before calling * update_curr(). */ - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; + if (curr) + place_entity(cfs_rq, se, flags); update_curr(cfs_rq); /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. Being - * placed in the past could significantly boost this task to the - * fairness detriment of existing tasks. - */ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); + /* + * XXX update_load_avg() above will have attached us to the pelt sum; + * but update_cfs_group() here will re-adjust the weight and have to + * undo/redo all that. Seems wasteful. + */ update_cfs_group(se); + + /* + * XXX now that the entity has been re-weighted, and it's lag adjusted, + * we can place the entity. + */ + if (!curr) + place_entity(cfs_rq, se, flags); + account_entity_enqueue(cfs_rq, se); - if (flags & ENQUEUE_WAKEUP) - place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; check_schedstat_required(); - update_stats_enqueue(cfs_rq, se, flags); - check_spread(cfs_rq, se); + update_stats_enqueue_fair(cfs_rq, se, flags); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - /* - * When bandwidth control is enabled, cfs might have been removed - * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionnally. - */ - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) - list_add_leaf_cfs_rq(cfs_rq); - - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last != se) - break; + if (!throttled_hierarchy(cfs_rq)) { + list_add_leaf_cfs_rq(cfs_rq); + } else { +#ifdef CONFIG_CFS_BANDWIDTH + struct rq *rq = rq_of(cfs_rq); - cfs_rq->last = NULL; + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +#endif + } } } @@ -4282,68 +5376,122 @@ static void __clear_buddies_next(struct sched_entity *se) } } -static void __clear_buddies_skip(struct sched_entity *se) +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->next == se) + __clear_buddies_next(se); +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void set_delayed(struct sched_entity *se) { + se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip != se) - break; - cfs_rq->skip = NULL; + cfs_rq->h_nr_runnable--; + if (cfs_rq_throttled(cfs_rq)) + break; } } -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void clear_delayed(struct sched_entity *se) { - if (cfs_rq->last == se) - __clear_buddies_last(se); + se->sched_delayed = 0; - if (cfs_rq->next == se) - __clear_buddies_next(se); + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - __clear_buddies_skip(se); + cfs_rq->h_nr_runnable++; + if (cfs_rq_throttled(cfs_rq)) + break; + } } -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + clear_delayed(se); + if (sched_feat(DELAY_ZERO) && se->vlag > 0) + se->vlag = 0; +} -static void +static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - /* - * Update run-time statistics of the 'current'. - */ + bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; + update_curr(cfs_rq); + clear_buddies(cfs_rq, se); + + if (flags & DEQUEUE_DELAYED) { + SCHED_WARN_ON(!se->sched_delayed); + } else { + bool delay = sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, excempt them. + */ + if (flags & DEQUEUE_SPECIAL) + delay = false; + + SCHED_WARN_ON(delay && se->sched_delayed); + + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq, se, 0); + set_delayed(se); + return false; + } + } + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue_fair(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); + update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { + se->deadline -= se->vruntime; + se->rel_deadline = 1; + } if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - /* - * Normalize after update_curr(); which will also have moved - * min_vruntime if @se is the one holding it back. But before doing - * update_min_vruntime() again, which will discount @se's position and - * can move min_vruntime forward still more. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); @@ -4353,55 +5501,25 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; + if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; + if (cfs_rq->nr_queued == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + return true; } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4409,31 +5527,39 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); + /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ + se->vlag = se->deadline; } update_stats_curr_start(cfs_rq, se); + SCHED_WARN_ON(cfs_rq->curr); cfs_rq->curr = se; /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); /* * Pick the next process, keeping these things in mind, in this order: @@ -4443,53 +5569,28 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { - struct sched_entity *left = __pick_first_entity(cfs_rq); struct sched_entity *se; /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. - */ - if (!left || (curr && entity_before(curr, left))) - left = curr; - - se = left; /* ideally we run the leftmost entity */ - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. + * Picking the ->next buddy will affect latency but not fairness. */ - if (cfs_rq->skip == se) { - struct sched_entity *second; - - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; } - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - se = cfs_rq->next; - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + se = pick_eevdf(cfs_rq); + if (se->sched_delayed) { + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* - * Prefer last buddy, try to return the CPU to a preempted task. + * Must not reference @se again, see __block_task(). */ - se = cfs_rq->last; + return NULL; } - - clear_buddies(cfs_rq, se); - return se; } @@ -4507,15 +5608,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); - if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); + update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } + SCHED_WARN_ON(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -4539,19 +5639,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } @@ -4611,8 +5702,20 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - if (cfs_b->quota != RUNTIME_INF) - cfs_b->runtime = cfs_b->quota; + s64 runtime; + + if (unlikely(cfs_b->quota == RUNTIME_INF)) + return; + + cfs_b->runtime += cfs_b->quota; + runtime = cfs_b->runtime_snap - cfs_b->runtime; + if (runtime > 0) { + cfs_b->burst_time += runtime; + cfs_b->nr_burst++; + } + + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); + cfs_b->runtime_snap = cfs_b->runtime; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4723,12 +5826,23 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; - /* Add cfs_rq with already running entity in the list */ - if (cfs_rq->nr_running >= 1) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + + cfs_rq->throttled_clock_self = 0; + + if (SCHED_WARN_ON((s64)delta < 0)) + delta = 0; + + cfs_rq->throttled_clock_self_time += delta; + } } return 0; @@ -4741,8 +5855,12 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); + + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -4754,7 +5872,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -4784,18 +5903,33 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); + int flags; + /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) goto done; - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + /* + * Abuse SPECIAL to avoid delayed dequeue in this instance. + * This avoids teaching dequeue_entities() about throttled + * entities and keeps things relatively simple. + */ + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; + if (se->sched_delayed) + flags |= DEQUEUE_DELAYED; + dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -4813,20 +5947,29 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; + + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); + /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -4835,7 +5978,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4844,93 +5988,196 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + if (cfs_rq->throttled_clock) { + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + cfs_rq->throttled_clock = 0; + } list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { - if (se->on_rq) + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); } + /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: + assert_list_leaf_cfs_rq(rq); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_queued) + resched_curr(rq); +} + +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) +{ + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + /* - * The cfs_rq_throttled() breaks in the above iteration can result in - * incomplete leaf list maintenance, resulting in triggering the - * assertion below. + * Iterating over the list can trigger several call to + * update_rq_clock() in unthrottle_cfs_rq(). + * Do it once and skip the potential next ones. */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + update_rq_clock(rq); + rq_clock_start_loop_update(rq); - if (list_add_leaf_cfs_rq(cfs_rq)) - break; + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. + */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); } - assert_list_leaf_cfs_rq(rq); + rcu_read_unlock(); - /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_curr(rq); + rq_clock_stop_loop_update(rq); + rq_unlock(rq, &rf); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { - struct cfs_rq *cfs_rq; + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq, *tmp; + struct rq_flags rf; + struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -4944,16 +6191,44 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu) { + unthrottle_cfs_rq_async(cfs_rq); + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); + } - if (!remaining) - break; + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + + rq_lock_irqsave(rq, &rf); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + rcu_read_unlock(); + + return throttled; } /* @@ -4973,6 +6248,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->nr_periods += overrun; + /* Refill extra burst quota even if cfs_b->idle */ + __refill_cfs_bandwidth_runtime(cfs_b); + /* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred @@ -4980,8 +6258,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u if (cfs_b->idle && !throttled) goto out_deactivate; - __refill_cfs_bandwidth_runtime(cfs_b); - if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -4997,10 +6273,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -5034,7 +6308,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; + s64 remaining; /* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) @@ -5042,7 +6316,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) /* is a quota refresh about to occur? */ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) + if (remaining < (s64)min_expire) return 1; return 0; @@ -5095,7 +6369,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -5133,7 +6407,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { @@ -5168,7 +6442,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -5231,6 +6505,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (new < max_cfs_quota_period) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; + cfs_b->burst *= 2; pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", @@ -5256,16 +6531,22 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) { raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->burst = 0; + cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; + + /* Add a random offset so that timers interleave */ + hrtimer_set_expires(&cfs_b->period_timer, + get_random_u32_below(cfs_b->period)); hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->slack_started = false; @@ -5275,6 +6556,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5291,12 +6573,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -5306,12 +6614,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * bits doesn't do much. */ -/* cpu online calback */ +/* cpu online callback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5330,7 +6638,18 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); + + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + + /* + * The rq clock has already been updated in the + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5340,29 +6659,68 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) continue; /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; - /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); -} -#else /* CONFIG_CFS_BANDWIDTH */ + rq_clock_stop_loop_update(rq); +} -static inline bool cfs_bandwidth_used(void) +bool cfs_task_bw_constrained(struct task_struct *p) { + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + if (!cfs_bandwidth_used()) + return false; + + if (cfs_rq->runtime_enabled || + tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) + return true; + return false; } +#ifdef CONFIG_NO_HZ_FULL +/* called from pick_next_task_fair() */ +static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq); + + if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (rq->nr_running != 1) + return; + + /* + * We know there is only one task runnable and we've just picked it. The + * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will + * be otherwise able to stop the tick. Just need to check if we are using + * bandwidth control. + */ + if (cfs_task_bw_constrained(p)) + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#endif + +#else /* CONFIG_CFS_BANDWIDTH */ + static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5385,9 +6743,8 @@ static inline int throttled_lb_pair(struct task_group *tg, return 0; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - #ifdef CONFIG_FAIR_GROUP_SCHED +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif @@ -5398,9 +6755,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - +#ifdef CONFIG_CGROUP_SCHED +bool cfs_task_bw_constrained(struct task_struct *p) +{ + return false; +} +#endif #endif /* CONFIG_CFS_BANDWIDTH */ +#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) +static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} +#endif + /************************************************** * CFS operations on tasks: */ @@ -5409,17 +6775,16 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -5434,13 +6799,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -5454,28 +6818,55 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + /* Return true only if the utilization doesn't fit CPU's capacity */ + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); +} + +/* + * overutilized value make sense only if EAS is enabled + */ +static inline bool is_rd_overutilized(struct root_domain *rd) +{ + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void update_overutilized_status(struct rq *rq) +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for load balancing decisions only + * if energy aware scheduler is being used + */ + + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) + set_rd_overutilized(rq->rd, 1); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -5486,6 +6877,37 @@ static int sched_idle_cpu(int cpu) } #endif +static void +requeue_delayed_entity(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * se->sched_delayed should imply: se->on_rq == 1. + * Because a delayed entity is one that is still on + * the runqueue competing until elegibility. + */ + SCHED_WARN_ON(!se->sched_delayed); + SCHED_WARN_ON(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { + update_entity_lag(cfs_rq, se); + if (se->vlag > 0) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; + } + } + + update_load_avg(cfs_rq, se, 0); + clear_delayed(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5496,8 +6918,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + u64 slice = 0; /* * The code below (indirectly) updates schedutil which looks at @@ -5505,7 +6930,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - util_est_enqueue(&rq->cfs, p); + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { + requeue_delayed_entity(se); + return; + } /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -5515,14 +6946,35 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + if (task_new && se->sched_delayed) + h_nr_runnable = 0; + for_each_sched_entity(se) { - if (se->on_rq) + if (se->on_rq) { + if (se->sched_delayed) + requeue_delayed_entity(se); break; + } cfs_rq = cfs_rq_of(se); + + /* + * Basically set the slice of group entries to the min_slice of + * their respective cfs_rq. This ensures the group can service + * its entities in the desired time-frame. + */ + if (slice) { + se->slice = slice; + se->custom_slice = 1; + } enqueue_entity(cfs_rq, se, flags); + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5538,19 +6990,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; + } - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); } /* At this point se is NULL and we are at root level*/ @@ -5571,24 +7030,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * and the following generally works well enough in practice. */ if (!task_new) - update_overutilized_status(rq); + check_update_overutilized_status(rq); enqueue_throttle: - if (cfs_bandwidth_used()) { - /* - * When bandwidth control is enabled; the cfs_rq_throttled() - * breaks in the above iteration can result in incomplete - * leaf list maintenance, resulting in triggering the assertion - * below. - */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(cfs_rq)) - break; - } - } - assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -5597,31 +7041,63 @@ enqueue_throttle: static void set_next_buddy(struct sched_entity *se); /* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() + * failing half-way through and resume the dequeue later. + * + * Returns: + * -1 - dequeue delayed + * 0 - dequeue throttled + * 1 - dequeue complete */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + bool task_sleep = flags & DEQUEUE_SLEEP; + bool task_delayed = flags & DEQUEUE_DELAYED; + struct task_struct *p = NULL; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; + struct cfs_rq *cfs_rq; + u64 slice = 0; + + if (entity_is_task(se)) { + p = task_of(se); + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; + } else { + cfs_rq = group_cfs_rq(se); + slice = cfs_rq_min_slice(cfs_rq); + } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (!dequeue_entity(cfs_rq, se, flags)) { + if (p && &p->se == se) + return -1; + + break; + } + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; + return 0; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + slice = cfs_rq_min_slice(cfs_rq); + /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); /* @@ -5633,6 +7109,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } flags |= DEQUEUE_SLEEP; + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); } for_each_sched_entity(se) { @@ -5642,32 +7119,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - + return 0; } - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, 1); + sub_nr_running(rq, h_nr_queued); + + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; -dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + if (p && task_delayed) { + SCHED_WARN_ON(!task_sleep); + SCHED_WARN_ON(p->on_rq != 1); + + /* Fix-up what dequeue_task_fair() skipped */ + hrtick_update(rq); + + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ + __block_task(rq, p); + } + + return 1; +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); + return true; } #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); #ifdef CONFIG_NO_HZ_COMMON @@ -5675,6 +7196,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; @@ -5885,23 +7407,23 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (target == nr_cpumask_bits) + schedstat_inc(p->stats.nr_wakeups_affine_attempts); + if (target != this_cpu) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5916,11 +7438,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i; if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -5953,7 +7479,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -5978,13 +7504,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; @@ -6006,6 +7532,15 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu, struct task_struct *p) +{ + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6019,7 +7554,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -6027,7 +7562,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6043,7 +7578,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6064,40 +7599,31 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; + bool idle = true; + int cpu; - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } @@ -6108,12 +7634,14 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t { int cpu; - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) + continue; + /* + * Check if the CPU is in the LLC scheduling domain of @target. + * Due to isolcpus, there is no guarantee that all the siblings are in the domain. + */ + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6124,9 +7652,18 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; +} + +static inline bool test_idle_cores(int cpu) +{ + return false; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core, p); } static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) @@ -6141,52 +7678,68 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - struct sched_domain *this_sd; - u64 avg_cost, avg_idle; - u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; - - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { + /* because !--nr is the condition to stop scan */ + nr = READ_ONCE(sd_share->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } } - time = cpu_clock(this); + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + for_each_cpu_wrap(cpu, cpus, target + 1) { + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; - for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (has_idle_core) + set_idle_cores(target, false); - return cpu; + return idle_cpu; } /* @@ -6197,36 +7750,62 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long task_util, best_cap = 0; + unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (fits_capacity(task_util, cpu_cap)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = get_actual_cpu_capacity(cpu); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } return best_cpu; } -static inline bool asym_fits_capacity(int task_util, int cpu) +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) { - if (static_branch_unlikely(&sched_asym_cpucapacity)) - return fits_capacity(task_util, capacity_of(cpu)); + if (sched_asym_cpucap_active()) + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); return true; } @@ -6236,21 +7815,29 @@ static inline bool asym_fits_capacity(int task_util, int cpu) */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool has_idle_core = false; struct sched_domain *sd; - unsigned long task_util; - int i, recent_used_cpu; + unsigned long task_util, util_min, util_max; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); } + /* + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* @@ -6258,8 +7845,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -6270,32 +7863,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * pattern is IO completions. */ if (is_per_cpu_kthread(current) && + in_task() && prev == smp_processor_id() && - this_rq()->nr_running <= 1) { + this_rq()->nr_running <= 1 && + asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev; } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && - asym_fits_capacity(task_util, recent_used_cpu)) { - /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: - */ - p->recent_used_cpu = prev; - return recent_used_cpu; + cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -6315,71 +7912,148 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target); - i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (!has_idle_core && cpus_share_cache(prev, target)) { + i = select_idle_smt(p, sd, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } - i = select_idle_smt(p, sd, target); + i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } /** - * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for + * @p: task for which the CPU utilization should be predicted or NULL + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * @boost: 1 to enable boosting, otherwise 0 + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). + * CPU contention for CFS tasks can be detected by CPU runnable > CPU + * utilization. Boosting is implemented in cpu_util() so that internal + * users (e.g. EAS) can use it next to external users (e.g. schedutil), + * latter via cpu_util_cfs_boost(). + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Boosted) (estimated) utilization for the specified CPU. + */ +static unsigned long +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { - struct cfs_rq *cfs_rq; - unsigned int util; + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long runnable; - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + util = max(util, runnable); + } + + /* + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. + */ + if (p && task_cpu(p) == cpu && dst_cpu != cpu) + lsub_positive(&util, task_util(p)); + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + unsigned long util_est; + + util_est = READ_ONCE(cfs_rq->avg.util_est); + + /* + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + else if (p && unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + util = max(util, util_est); + } - return min_t(unsigned long, util, capacity_orig_of(cpu)); + return min(util, arch_scale_cpu_capacity(cpu)); +} + +unsigned long cpu_util_cfs(int cpu) +{ + return cpu_util(cpu, NULL, -1, 0); +} + +unsigned long cpu_util_cfs_boost(int cpu) +{ + return cpu_util(cpu, NULL, -1, 1); } /* @@ -6397,168 +8071,255 @@ static inline unsigned long cpu_util(int cpu) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { - struct cfs_rq *cfs_rq; - unsigned int util; - /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util(cpu); + p = NULL; - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); + return cpu_util(cpu, p, -1, 0); +} - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the IRQ utilization. + * + * The DL bandwidth number OTOH is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max) +{ + unsigned long util, irq, scale; + struct rq *rq = cpu_rq(cpu); + + scale = arch_scale_cpu_capacity(cpu); /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); + irq = cpu_util_irq(rq); + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + if (min) { /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); - util = max(util, estimated); + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); } /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * the cpu_util call. + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = util_cfs + cpu_util_rt(rq); + util += cpu_util_dl(rq); + + /* + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + + if (util >= scale) + return scale; + + /* + * There is still idle time; further improve the number by using the + * IRQ metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, scale); + util += irq; + + return min(scale, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } /* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; + +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. + */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) { - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); - /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. - */ - if (task_cpu(p) == cpu && dst_cpu != cpu) - sub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) - util += task_util(p); + if (unlikely(irq >= max_cap)) + busy_time = max_cap; + else + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); - if (sched_feat(UTIL_EST)) { - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + eenv->task_busy_time = busy_time; +} - /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. - */ - if (dst_cpu == cpu) - util_est += _task_util_est(p); +/* + * Compute the perf_domain (PD) busy time for compute_energy(). Based on the + * utilization for each @pd_cpus, it however doesn't take into account + * clamping since the ratio (utilization / cpu_capacity) is already enough to + * scale the EM reported power consumption at the (eventually clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util()) and must be calculated + * separately (see eenv_task_busy_time). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util()) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. + */ +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) +{ + unsigned long busy_time = 0; + int cpu; - util = max(util, util_est); + for_each_cpu(cpu, pd_cpus) { + unsigned long util = cpu_util(cpu, p, -1, 0); + + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } - return min(util, capacity_orig_of(cpu)); + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); } /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @eenv->cpus. This utilization can't + * exceed @eenv->cpu_cap. */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) { - struct cpumask *pd_mask = perf_domain_span(pd); - unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util = 0, sum_util = 0; + unsigned long max_util = 0; int cpu; - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); - struct task_struct *tsk = cpu == dst_cpu ? p : NULL; - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); + max_util = max(max_util, eff_util); } - return em_cpu_energy(pd->em_pd, max_util, sum_util); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd would + * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task + * contribution is ignored. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; + + if (dst_cpu >= 0) + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); + + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* @@ -6593,7 +8354,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -6602,17 +8363,23 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) - goto fail; + if (!pd) + goto unlock; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6622,87 +8389,163 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto fail; + goto unlock; + + target = prev_cpu; sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; - unsigned long base_energy_pd; + unsigned long util_min = p_util_min, util_max = p_util_max; + unsigned long cpu_cap, cpu_actual_cap, util; + long prev_spare_cap = -1, max_spare_cap = -1; + unsigned long rq_util_min, rq_util_max; + unsigned long cur_delta, base_energy; int max_spare_cap_cpu = -1; + int fits, max_fits = -1; - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + if (cpumask_empty(cpus)) + continue; + + /* Account external pressure for the energy estimation */ + cpu = cpumask_first(cpus); + cpu_actual_cap = get_actual_cpu_capacity(cpu); + + eenv.cpu_cap = cpu_actual_cap; + eenv.pd_cap = 0; + + for_each_cpu(cpu, cpus) { + struct rq *rq = cpu_rq(cpu); + + eenv.pd_cap += cpu_actual_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util_next(cpu, p, cpu); + util = cpu_util(cpu, p, cpu, 0); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap; - lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (!fits_capacity(util, cpu_cap)) + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. I.e.: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; - /* Always use prev_cpu as a candidate. */ + lsub_positive(&cpu_cap, util); + if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); + /* Always use prev_cpu as a candidate. */ + prev_spare_cap = cpu_cap; + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { + /* + * Find the CPU with the maximum spare capacity + * among the remaining CPUs in the performance + * domain. + */ + max_spare_cap = cpu_cap; + max_spare_cap_cpu = cpu; + max_fits = fits; } + } + + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) + continue; + + eenv_pd_busy_time(&eenv, cpus, p); + /* Compute the 'base' energy of the pd, without @p */ + base_energy = compute_energy(&eenv, pd, cpus, p, -1); + + /* Evaluate the energy impact of using prev_cpu. */ + if (prev_spare_cap > -1) { + prev_delta = compute_energy(&eenv, pd, cpus, p, + prev_cpu); + /* CPU utilization has changed */ + if (prev_delta < base_energy) + goto unlock; + prev_delta -= base_energy; + prev_actual_cap = cpu_actual_cap; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; /* - * Find the CPU with the maximum spare capacity in - * the performance domain + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. */ - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; - max_spare_cap_cpu = cpu; - } - } + if ((max_fits < 0) && + (cpu_actual_cap <= best_actual_cap)) + continue; - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { - cur_delta = compute_energy(p, max_spare_cap_cpu, pd); - cur_delta -= base_energy_pd; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) + goto unlock; + cur_delta -= base_energy; + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_actual_cap = cpu_actual_cap; } } -unlock: rcu_read_unlock(); - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. - */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) + target = best_energy_cpu; - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; - - return prev_cpu; + return target; -fail: +unlock: rcu_read_unlock(); - return -1; + return target; } /* @@ -6714,8 +8557,6 @@ fail: * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) @@ -6728,10 +8569,18 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; + /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); if (wake_flags & WF_TTWU) { record_wakee(p); - if (sched_energy_enabled()) { + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -6756,6 +8605,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. + */ if (tmp->flags & sd_flag) sd = tmp; else if (!want_affine) @@ -6764,21 +8618,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6786,181 +8635,126 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { - /* - * As blocked tasks retain absolute vruntime the migration needs to - * deal with this by subtracting the old and adding the new - * min_vruntime -- the latter is done by enqueue_entity() when placing - * the task on the new runqueue. - */ - if (p->state == TASK_WAKING) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - } + struct sched_entity *se = &p->se; - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_held(&task_rq(p)->lock); - detach_entity_cfs_rq(&p->se); + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); - } else { /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the cfs_rq last_update_time + * and update sched_avg to improve the PELT continuity after + * migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } /* Tell new CPU we are migrated */ - p->se.avg.last_update_time = 0; - - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; + se->avg.last_update_time = 0; update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) { - remove_entity_load_avg(&p->se); -} - -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (rq->nr_running) - return 1; + struct sched_entity *se = &p->se; - return newidle_balance(rq, rf) != 0; -} -#endif /* CONFIG_SMP */ + if (se->sched_delayed) { + struct rq_flags rf; + struct rq *rq; -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; + rq = task_rq_lock(p, &rf); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + task_rq_unlock(rq, p, &rf); + } - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - return calc_delta_fair(gran, se); + remove_entity_load_avg(se); } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * + * Set the max capacity the task is allowed to run at for misfit detection. */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +static void set_task_max_allowed_capacity(struct task_struct *p) { - s64 gran, vdiff = curr->vruntime - se->vruntime; + struct asym_cap_data *entry; - if (vdiff <= 0) - return -1; + if (!sched_asym_cpucap_active()) + return; - gran = wakeup_gran(se); - if (vdiff > gran) - return 1; + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; - return 0; + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); } -static void set_last_buddy(struct sched_entity *se) +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - cfs_rq_of(se)->last = se; - } +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} +#endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; -} - /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; - int next_buddy_marked = 0; + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { set_next_buddy(pse); - next_buddy_marked = 1; } /* @@ -6973,122 +8767,120 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) + if (!sched_feat(WAKEUP_PREEMPTION)) + return; + + find_matching_se(&se, &pse); + WARN_ON_ONCE(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle entity in favor of a non-idle entity (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) goto preempt; + if (cse_is_idle != pse_is_idle) + return; /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): + * BATCH and IDLE tasks do not preempt others. */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy))) return; - find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); - BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. - */ - if (!next_buddy_marked) - set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* + * If @p has a shorter slice than current and @p is eligible, override + * current's slice protection in order to allow preemption. + * + * Note that even if @p does not turn out to be the most eligible + * task at this moment, current's slice protection will be lost. + */ + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) + se->vlag = se->deadline + 1; + + /* + * If @p has become the most eligible task, force preemption. + */ + if (pick_eevdf(cfs_rq) == pse) goto preempt; - } return; preempt: - resched_curr(rq); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; + resched_curr_lazy(rq); +} + +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_queued) + return NULL; + + do { + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq); - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + + se = pick_next_entity(rq, cfs_rq); + if (!se) + goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); } +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; again: - if (!sched_fair_runnable(rq)) + p = pick_task_fair(rq); + if (!p) goto idle; + se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; + __put_prev_set_next_dl_server(rq, prev, p); + /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. - */ - - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq, curr); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* + * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { struct sched_entity *pse = &prev->se; + struct cfs_rq *cfs_rq; while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7106,47 +8898,25 @@ again: put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); - } - goto done; -simple: -#endif - if (prev) - put_prev_task(rq, prev); - - do { - se = pick_next_entity(cfs_rq, NULL); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + __set_next_task_fair(rq, p, true); + } - p = task_of(se); + return p; -done: __maybe_unused; -#ifdef CONFIG_SMP - /* - * Move the next running task to the front of - * the list, so our cfs_tasks list becomes MRU - * one. - */ - list_move(&p->se.group_node, &rq->cfs_tasks); +simple: #endif - - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - update_misfit_status(p, rq); - + put_prev_set_next_task(rq, prev, p); return p; idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -7165,15 +8935,34 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq) +static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +{ + return pick_next_task_fair(rq, prev, NULL); +} + +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_queued; +} + +static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) +{ + return pick_task_fair(dl_se->rq); +} + +void fair_server_init(struct rq *rq) { - return pick_next_task_fair(rq, NULL, NULL); + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); } /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -7186,8 +8975,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple - * - * The magic of dealing with the ->skip buddy is in pick_next_entity. */ static void yield_task_fair(struct rq *rq) { @@ -7203,21 +8990,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost. - */ - rq_clock_skip_update(rq); - } + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq); - set_skip_buddy(se); + se->deadline += calc_delta_fair(se->slice, se); } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -7228,7 +9013,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -7286,7 +9071,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of CPUs in + * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: @@ -7375,11 +9160,16 @@ enum group_type { */ group_fully_busy, /* - * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity - * and must be migrated to a more powerful CPU. + * One task doesn't fit with CPU's capacity and must be migrated to a + * more powerful CPU. */ group_misfit_task, /* + * Balance SMT group that's fully busy. Can benefit from migration + * a task on SMT with busy sibling to another CPU on idle core. + */ + group_smt_balance, + /* * SD_ASYM_PACKING only: One local CPU with higher capacity is available, * and the task should be migrated to it instead of running on the * current CPU. @@ -7408,8 +9198,7 @@ enum migration_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_NOHZ_STATS 0x10 -#define LBF_NOHZ_AGAIN 0x20 +#define LBF_ACTIVE_LB 0x10 struct lb_env { struct sched_domain *sd; @@ -7445,7 +9234,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); if (p->sched_class != &fair_sched_class) return 0; @@ -7461,12 +9250,19 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) + (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0; @@ -7477,43 +9273,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. */ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -7524,41 +9320,85 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif /* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + + /* Disregard percpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7567,10 +9407,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have - * already computed one in current iteration. + * Avoid computing new_dst_cpu + * - for NEWLY_IDLE + * - if we have already computed one in current iteration + * - if it's an active balance */ - if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || + env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ @@ -7585,34 +9428,37 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } - /* Record that we found atleast one task that could run on dst_cpu */ + /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + if (task_on_cpu(env->src_rq, p)) { + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. - */ - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); - - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); - } + * 1) active balance + * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. + */ + if (env->flags & LBF_ACTIVE_LB) + return 1; + + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; + + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -7621,7 +9467,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); @@ -7637,7 +9489,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -7658,8 +9510,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -7673,7 +9523,16 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + + /* + * Source run queue has been emptied by another CPU, clear + * LBF_ALL_PINNED flag as we will not test any task. + */ + if (env->src_rq->nr_running <= 1) { + env->flags &= ~LBF_ALL_PINNED; + return 0; + } if (env->imbalance <= 0) return 0; @@ -7683,11 +9542,9 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; /* We've more or less seen every task there is, call it quits */ if (env->loop > env->loop_max) @@ -7695,11 +9552,13 @@ static int detach_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -7724,8 +9583,7 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - - if ((load >> env->sd->nr_balance_failed) > env->imbalance) + if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7734,7 +9592,7 @@ static int detach_tasks(struct lb_env *env) case migrate_util: util = task_util_est(p); - if (util > env->imbalance) + if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= util; @@ -7746,7 +9604,7 @@ static int detach_tasks(struct lb_env *env) case migrate_misfit: /* This is not a misfit task */ - if (task_fits_capacity(p, capacity_of(env->src_cpu))) + if (task_fits_cpu(p, env->src_cpu)) goto next; env->imbalance = 0; @@ -7777,6 +9635,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -7795,11 +9656,11 @@ next: */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -7853,81 +9714,56 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) static inline bool others_have_blocked(struct rq *rq) { - if (READ_ONCE(rq->avg_rt.util_avg)) + if (cpu_util_rt(rq)) return true; - if (READ_ONCE(rq->avg_dl.util_avg)) + if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ - if (READ_ONCE(rq->avg_irq.util_avg)) + if (cpu_util_irq(rq)) return true; -#endif return false; } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_blocked_load_tick(struct rq *rq) { - rq->last_blocked_load_update_tick = jiffies; + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); +} +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ if (!has_blocked) rq->has_blocked_load = 0; } #else static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif static bool __update_blocked_others(struct rq *rq, bool *done) { - const struct sched_class *curr_class; - u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; - bool decayed; + bool updated; /* * update_load_avg() can call cpufreq_update_util(). Make sure that RT, * DL and IRQ signals have been updated before updating CFS. */ - curr_class = rq->curr->sched_class; - - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - - decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | - update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | - update_irq_load_avg(rq, 0); + updated = update_other_load_avgs(rq); if (others_have_blocked(rq)) *done = false; - return decayed; + return updated; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) - return false; - - return true; -} - static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -7944,6 +9780,9 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); + if (cfs_rq->nr_queued == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); + if (cfs_rq == &rq->cfs) decayed = true; } @@ -7951,7 +9790,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -8033,13 +9872,14 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); struct rq_flags rf; rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); @@ -8051,24 +9891,25 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -8076,19 +9917,18 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) @@ -8114,8 +9954,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -8127,12 +9967,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ - used = READ_ONCE(rq->avg_rt.util_avg); - used += READ_ONCE(rq->avg_dl.util_avg); - used += thermal_load_avg(rq); + used = cpu_util_rt(rq); + used += cpu_util_dl(rq); if (unlikely(used >= max)) return 1; @@ -8147,8 +9984,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; @@ -8224,19 +10059,13 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -8260,7 +10089,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -8276,7 +10105,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. - * We consider that a group has spare capacity if the * number of task is + * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into @@ -8327,26 +10156,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) return false; } -/* - * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity than sched_group ref. - */ -static inline bool -group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); -} - -/* - * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity_orig than sched_group ref. - */ -static inline bool -group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); -} - static inline enum group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, @@ -8361,6 +10170,9 @@ group_type group_classify(unsigned int imbalance_pct, if (sgs->group_asym_packing) return group_asym_packing; + if (sgs->group_smt_balance) + return group_smt_balance; + if (sgs->group_misfit_task_load) return group_misfit_task; @@ -8370,70 +10182,180 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } -static bool update_nohz_stats(struct rq *rq, bool force) +/** + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing + * @cpu: A CPU + * + * Always use CPU priority when balancing load between SMT siblings. When + * balancing load between cores, it is not sufficient that @cpu is idle. Only + * use CPU priority if the whole core is idle. + * + * Returns: True if the priority of @cpu must be followed. False otherwise. + */ +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned int cpu = rq->cpu; + if (!(sd->flags & SD_ASYM_PACKING)) + return false; - if (!rq->has_blocked_load) + if (!sched_smt_active()) + return true; + + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); +} + +static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu) +{ + /* + * First check if @dst_cpu can do asym_packing load balance. Only do it + * if it has higher priority than @src_cpu. + */ + return sched_use_asym_prio(sd, dst_cpu) && + sched_asym_prefer(dst_cpu, src_cpu); +} + +/** + * sched_group_asym - Check if the destination CPU can do asym_packing balance + * @env: The load balancing environment + * @sgs: Load-balancing statistics of the candidate busiest group + * @group: The candidate busiest group + * + * @env::dst_cpu can do asym_packing if it has higher priority than the + * preferred CPU of @group. + * + * Return: true if @env::dst_cpu can do with asym_packing load balance. False + * otherwise. + */ +static inline bool +sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) +{ + /* + * CPU priorities do not make sense for SMT cores with more than one + * busy sibling. + */ + if ((group->flags & SD_SHARE_CPUCAPACITY) && + (sgs->group_weight - sgs->idle_cpus != 1)) return false; - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); +} + +/* One group has more than one SMT CPU while the other group does not */ +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, + struct sched_group *sg2) +{ + if (!sg1 || !sg2) return false; - if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) - return true; + return (sg1->flags & SD_SHARE_CPUCAPACITY) != + (sg2->flags & SD_SHARE_CPUCAPACITY); +} - update_blocked_averages(cpu); +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (!env->idle) + return false; + + /* + * For SMT source group, it is better to move a task + * to a CPU that doesn't have multiple tasks sharing its CPU capacity. + * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY + * will not be on. + */ + if (group->flags & SD_SHARE_CPUCAPACITY && + sgs->sum_h_nr_running > 1) + return true; - return rq->has_blocked_load; -#else return false; -#endif +} + +static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, + struct sg_lb_stats *local) +{ + int ncores_busiest, ncores_local; + long imbalance; + + if (!env->idle || !busiest->sum_nr_running) + return 0; + + ncores_busiest = sds->busiest->cores; + ncores_local = sds->local->cores; + + if (ncores_busiest == ncores_local) { + imbalance = busiest->sum_nr_running; + lsub_positive(&imbalance, local->sum_nr_running); + return imbalance; + } + + /* Balance such that nr_running/ncores ratio are same on both groups */ + imbalance = ncores_local * busiest->sum_nr_running; + lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); + /* Normalize imbalance and do rounding on normalization */ + imbalance = 2 * imbalance + ncores_local + ncores_busiest; + imbalance /= ncores_local + ncores_busiest; + + /* Take advantage of resource in an empty sched group */ + if (imbalance <= 1 && local->sum_nr_running == 0 && + busiest->sum_nr_running > 1) + imbalance = 2; + + return imbalance; +} + +static inline bool +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) +{ + /* + * When there is more than 1 task, the group_overloaded case already + * takes care of cpu with reduced capacity + */ + if (rq->cfs.h_nr_runnable != 1) + return false; + + return check_cpu_capacity(rq, sd); } /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + unsigned long load = cpu_load(rq); - if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) - env->flags |= LBF_NOHZ_AGAIN; - - sgs->group_load += cpu_load(rq); - sgs->group_util += cpu_util(i); + sgs->group_load += load; + sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_status |= SG_OVERLOAD; - if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -8443,29 +10365,46 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) + *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - /* Check for a misfit task on the cpu */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) { - sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + if (sd_flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *sg_overloaded = 1; + } + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { + /* Check for a task running on a CPU with reduced capacity */ + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; } } - /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - } - sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ + if (!local_group && env->idle && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) + sgs->group_asym_packing = 1; + + /* Check for loaded SMT group to be balanced to dst CPU */ + if (!local_group && smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ @@ -8504,8 +10443,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ - if (sgs->group_type == group_misfit_task && - (!group_smaller_max_cpu_capacity(sg, sds->local) || + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type == group_misfit_task) && + (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -8523,9 +10463,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, switch (sgs->group_type) { case group_overloaded: /* Select the overloaded group with highest avg_load. */ - if (sgs->avg_load <= busiest->avg_load) - return false; - break; + return sgs->avg_load > busiest->avg_load; case group_imbalanced: /* @@ -8536,18 +10474,24 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) - return false; - break; + return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); case group_misfit_task: /* * If we have more than one misfit sg go with the biggest * misfit. */ - if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) - return false; - break; + return sgs->group_misfit_task_load > busiest->group_misfit_task_load; + + case group_smt_balance: + /* + * Check if we have spare CPUs on either SMT group to + * choose has spare or fully busy handling. + */ + if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) + goto has_spare; + + fallthrough; case group_fully_busy: /* @@ -8558,15 +10502,40 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we - * select the 1st one. + * select the 1st one, except if @sg is composed of SMT + * siblings. */ - if (sgs->avg_load <= busiest->avg_load) + + if (sgs->avg_load < busiest->avg_load) return false; + + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. + * If @sg happens to also be SMT, either choice is good. + */ + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) + return false; + } + break; case group_has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Do not pick sg with SMT CPUs over sg with pure CPUs, + * as we do not want to pull task off SMT core with one task + * and make the core idle. + */ + if (smt_vs_nonsmt_groups(sds->busiest, sg)) { + if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) + return false; + else + return true; + } +has_spare: + + /* + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -8589,7 +10558,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) && - (group_smaller_min_cpu_capacity(sds->local, sg))) + (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) return false; return true; @@ -8664,10 +10633,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p) * be computed and tested before calling idle_cpu_without(). */ -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -8688,6 +10655,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, memset(sgs, 0, sizeof(*sgs)); + /* Assume that task can't fit any CPU of the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY) + sgs->group_misfit_task_load = 1; + for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -8696,7 +10667,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -8707,12 +10678,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } + /* Check if task fits in the CPU */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load && + task_fits_cpu(p, i)) + sgs->group_misfit_task_load = 0; - /* Check if task fits in the group */ - if (sd->flags & SD_ASYM_CPUCAPACITY && - !task_fits_capacity(p, group->sgc->max_capacity)) { - sgs->group_misfit_task_load = 1; } sgs->group_capacity = group->sgc->capacity; @@ -8757,6 +10728,7 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those types are not used in the slow wakeup path */ return false; @@ -8783,23 +10755,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. - * This is an approximation as the number of running tasks may not be - * related to the number of busy CPUs due to sched_setaffinity. - */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) -{ - return (dst_running < (dst_weight >> 2)); -} - -/* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -8818,6 +10780,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) p->cpus_ptr)) continue; + /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); @@ -8894,6 +10860,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those type are not used in the slow wakeup path */ return NULL; @@ -8904,7 +10871,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) break; case group_has_spare: +#ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { + int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* @@ -8917,16 +10886,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; -#endif +#endif /* CONFIG_NUMA_BALANCING */ /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed while accounting for the possibility the + * task is pinned to a subset of CPUs. If there is a + * real need of migration, periodic load balance will * take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (p->nr_cpus_allowed != NR_CPUS) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + + cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + } + + imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); + if (!adjust_numa_imbalance(imbalance, + local_sgs.sum_nr_running + 1, + imb_numa_nr)) { return NULL; + } } +#endif /* CONFIG_NUMA */ /* * Select group with highest number of idle CPUs. We could also @@ -8942,6 +10926,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; } +static void update_idle_cpu_scan(struct lb_env *env, + unsigned long sum_util) +{ + struct sched_domain_shared *sd_share; + int llc_weight, pct; + u64 x, y, tmp; + /* + * Update the number of CPUs to scan in LLC domain, which could + * be used as a hint in select_idle_cpu(). The update of sd_share + * could be expensive because it is within a shared cache line. + * So the write of this hint only occurs during periodic load + * balancing, rather than CPU_NEWLY_IDLE, because the latter + * can fire way more frequently than the former. + */ + if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) + return; + + llc_weight = per_cpu(sd_llc_size, env->dst_cpu); + if (env->sd->span_weight != llc_weight) + return; + + sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + if (!sd_share) + return; + + /* + * The number of CPUs to search drops as sum_util increases, when + * sum_util hits 85% or above, the scan stops. + * The reason to choose 85% as the threshold is because this is the + * imbalance_pct(117) when a LLC sched group is overloaded. + * + * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] + * and y'= y / SCHED_CAPACITY_SCALE + * + * x is the ratio of sum_util compared to the CPU capacity: + * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) + * y' is the ratio of CPUs to be scanned in the LLC domain, + * and the number of CPUs to scan is calculated by: + * + * nr_scan = llc_weight * y' [2] + * + * When x hits the threshold of overloaded, AKA, when + * x = 100 / pct, y drops to 0. According to [1], + * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 + * + * Scale x by SCHED_CAPACITY_SCALE: + * x' = sum_util / llc_weight; [3] + * + * and finally [1] becomes: + * y = SCHED_CAPACITY_SCALE - + * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] + * + */ + /* equation [3] */ + x = sum_util; + do_div(x, llc_weight); + + /* equation [4] */ + pct = env->sd->imbalance_pct; + tmp = x * x * pct * pct; + do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); + tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); + y = SCHED_CAPACITY_SCALE - tmp; + + /* equation [2] */ + y *= llc_weight; + do_div(y, SCHED_CAPACITY_SCALE); + if ((int)y != sd_share->nr_idle_scan) + WRITE_ONCE(sd_share->nr_idle_scan, (int)y); +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -8950,16 +11005,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int sg_status = 0; - -#ifdef CONFIG_NO_HZ_COMMON - if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) - env->flags |= LBF_NOHZ_STATS; -#endif + unsigned long sum_util = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -8975,73 +11025,44 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, &sg_status); - - if (local_group) - goto next_group; - + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); - if (update_sd_pick_busiest(env, sds, sg, sgs)) { + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } -next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; - -#ifdef CONFIG_NO_HZ_COMMON - if ((env->flags & LBF_NOHZ_AGAIN) && - cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { + /* + * Indicate that the child domain of the busiest group prefers tasks + * go to a child's sibling domains first. NB the flags of a sched group + * are those of the child domain. + */ + if (sds->busiest) + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); - WRITE_ONCE(nohz.next_blocked, - jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); - } -#endif if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } -} - -#define NUMA_IMBALANCE_MIN 2 - -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) -{ - if (!allow_numa_imbalance(dst_running, dst_weight)) - return imbalance; - - /* - * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the destination is lightly loaded. - */ - if (imbalance <= NUMA_IMBALANCE_MIN) - return 0; - return imbalance; + update_idle_cpu_scan(env, sum_util); } /** @@ -9058,9 +11079,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_type == group_misfit_task) { - /* Set imbalance to allow misfit tasks to be balanced. */ - env->migration_type = migrate_misfit; - env->imbalance = 1; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + } else { + /* + * Set load imbalance to allow moving task from cpu + * with reduced capacity. + */ + env->migration_type = migrate_load; + env->imbalance = busiest->group_misfit_task_load; + } return; } @@ -9074,6 +11104,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } + if (busiest->group_type == group_smt_balance) { + /* Reduce number of tasks sharing CPU capacity */ + env->migration_type = migrate_task; + env->imbalance = 1; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -9092,7 +11129,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (local->group_type == group_has_spare) { if ((busiest->group_type > group_fully_busy) && - !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { + !(env->sd->flags & SD_SHARE_LLC)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity @@ -9112,7 +11149,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -9121,30 +11158,34 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff >> 1; + env->imbalance = sibling_imbalance(env, sds, busiest, local); } else { /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - - busiest->idle_cpus) >> 1); + env->imbalance = max_t(long, 0, + (local->idle_cpus - busiest->idle_cpus)); } +#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, + env->sd->imb_numa_nr); } +#endif + + /* Number of tasks to move to restore balance */ + env->imbalance >>= 1; return; } @@ -9162,8 +11203,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9172,6 +11211,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; + + /* + * If the local group is more loaded than the average system + * load, don't try to pull any tasks. + */ + if (local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return; + } + } /* @@ -9189,7 +11241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -9197,7 +11249,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -9212,17 +11264,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -9235,24 +11286,20 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -9265,6 +11312,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -9304,22 +11352,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. + */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) + sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already * busy, let another idle CPU try to pull task. */ goto out_balanced; + } + + if (busiest->group_type == group_smt_balance && + smt_vs_nonsmt_groups(sds.local, sds.busiest)) { + /* Let non SMT CPU pull from SMT CPU sharing with sibling */ + goto force_balance; + } if (busiest->group_weight > 1 && - local->idle_cpus <= (busiest->idle_cpus + 1)) + local->idle_cpus <= (busiest->idle_cpus + 1)) { /* * If the busiest group is not overloaded * and there is no imbalance between this and busiest @@ -9330,12 +11388,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * there is more than 1 CPU per group. */ goto out_balanced; + } - if (busiest->sum_h_nr_running == 1) + if (busiest->sum_h_nr_running == 1) { /* * busiest doesn't have any tasks waiting to run */ goto out_balanced; + } } force_balance: @@ -9349,9 +11409,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -9389,8 +11449,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + nr_running = rq->cfs.h_nr_runnable; + if (!nr_running) + continue; + capacity = capacity_of(i); - nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9399,10 +11462,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && - capacity_of(env->dst_cpu) < capacity && + !capacity_greater(capacity_of(env->dst_cpu), capacity) && nr_running == 1) continue; + /* + * Make sure we only pull tasks from a CPU of lower priority + * when balancing between SMT siblings. + * + * If balancing between cores, let lower priority CPUs help + * SMT cores with more than one busy sibling. + */ + if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) + continue; + switch (env->migration_type) { case migrate_load: /* @@ -9436,7 +11509,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util(cpu_of(rq)); + util = cpu_util_cfs_boost(i); /* * Don't try to pull utilization from a CPU with one @@ -9487,30 +11560,55 @@ static inline bool asym_active_balance(struct lb_env *env) { /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + * ASYM_PACKING needs to force migrate tasks from busy but lower + * priority CPUs in order to pack all tasks in the highest priority + * CPUs. When done between cores, do it only if the whole core if the + * whole core is idle. + * + * If @env::src_cpu is an SMT core with busy siblings, let + * the lower priority @env::dst_cpu help it. Do not follow + * CPU priority. */ - return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && - (env->src_rq->cfs.h_nr_running == 1)) { + if (env->idle && + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -9522,22 +11620,13 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { + struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask); struct sched_group *sg = env->sd->groups; - int cpu; + int cpu, idle_smt = -1; /* * Ensure the balancing environment is consistent; can happen @@ -9549,28 +11638,83 @@ static int should_we_balance(struct lb_env *env) /* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. + * + * However, we bail out if we already have tasks or a wakeup pending, + * to optimize wakeup latency. */ - if (env->idle == CPU_NEWLY_IDLE) + if (env->idle == CPU_NEWLY_IDLE) { + if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) + return 0; return 1; + } + cpumask_copy(swb_cpus, group_balance_mask(sg)); /* Try to find first idle CPU */ - for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + for_each_cpu_and(cpu, swb_cpus, env->cpus) { if (!idle_cpu(cpu)) continue; - /* Are we the first idle CPU? */ + /* + * Don't balance to idle SMT in busy core right away when + * balancing cores, but remember the first idle SMT CPU for + * later consideration. Find CPU on an idle core first. + */ + if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (idle_smt == -1) + idle_smt = cpu; + /* + * If the core is not idle, and first SMT sibling which is + * idle has been found, then its not needed to check other + * SMT siblings for idleness: + */ +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); +#endif + continue; + } + + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; + /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -9580,14 +11724,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -9603,34 +11746,35 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9679,7 +11823,7 @@ more_balance: * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to - * given_cpu) causing exceess load to be moved to given_cpu. + * given_cpu) causing excess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. @@ -9693,7 +11837,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -9725,7 +11869,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -9739,14 +11883,18 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags); /* * Don't kick the active_load_balance_cpu_stop, @@ -9754,12 +11902,13 @@ more_balance: * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - env.flags |= LBF_ALL_PINNED; + raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9770,32 +11919,23 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } - - /* We've kicked active balancing, force task migration. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + preempt_enable(); } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -9827,12 +11967,17 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval - * skyrocketting in a short amount of time. Skip the balance_interval + * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ @@ -9920,7 +12065,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -9937,13 +12082,7 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, - /* - * can_migrate_task() doesn't need to compute new_dst_cpu - * for active balancing. Since we have CPU_IDLE, but no - * @dst_grpmask we need to make that test go away with lying - * about DST_PINNED. - */ - .flags = LBF_DST_PINNED, + .flags = LBF_ACTIVE_LB, }; schedstat_inc(sd->alb_count); @@ -9971,10 +12110,23 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -9982,13 +12134,37 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. + */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -10005,14 +12181,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -10030,25 +12201,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; @@ -10070,22 +12241,9 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) { + if (likely(update_next_balance)) rq->next_balance = next_balance; -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } } static inline int on_null_domain(struct rq *rq) @@ -10095,34 +12253,37 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set - * anywhere yet. */ - static inline int find_new_ilb(void) { - int ilb; + const struct cpumask *hk_mask; + int ilb_cpu; + + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - if (ilb == smp_processor_id()) + if (ilb_cpu == smp_processor_id()) continue; - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). + * + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -10136,8 +12297,14 @@ static void kick_ilb(unsigned int flags) nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); + if (ilb_cpu < 0) + return; - if (ilb_cpu >= nr_cpu_ids) + /* + * Don't bother if no new NOHZ balance work items for ilb_cpu, + * i.e. all bits in flags are already set in ilb_cpu. + */ + if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) return; /* @@ -10150,7 +12317,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. */ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -10179,7 +12346,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -10192,7 +12359,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; } @@ -10201,12 +12368,11 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10217,10 +12383,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. + * + * When balancing between cores, all the SMT siblings of the + * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; + if (sched_asym(sd, i, cpu)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10232,8 +12401,8 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; + if (check_misfit_status(rq)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -10251,22 +12420,25 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } unlock: rcu_read_unlock(); out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + if (flags) kick_ilb(flags); } @@ -10331,10 +12503,6 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -10363,29 +12531,45 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked - * store. + * and @needs_update stores. */ smp_mb__after_atomic(); set_cpu_sd_state_idle(cpu); + WRITE_ONCE(nohz.needs_update, 1); out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } +static bool update_nohz_stats(struct rq *rq) +{ + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) + return true; + + sched_balance_update_blocked_averages(cpu); + + return rq->has_blocked_load; +} + /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. - * The function returns false if the loop has stopped before running - * through all idle CPUs. */ -static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -10394,7 +12578,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int update_next_balance = 0; int this_cpu = this_rq->cpu; int balance_cpu; - int ret = false; struct rq *rq; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10402,12 +12585,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, /* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. + * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. */ - WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -10415,8 +12603,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + /* + * Start with the next CPU after this_cpu so we will end with this_cpu and let a + * chance for other idle cpu to pull load. + */ + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { + if (!idle_cpu(balance_cpu)) continue; /* @@ -10424,14 +12616,18 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) { - has_blocked_load = true; + if (!idle_cpu(this_cpu) && need_resched()) { + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); goto abort; } rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq, true); + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10445,7 +12641,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -10462,32 +12658,19 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, if (likely(update_next_balance)) nohz.next_balance = next_balance; - /* Newly idle CPU doesn't need an update */ - if (idle != CPU_NEWLY_IDLE) { - update_blocked_averages(this_cpu); - has_blocked_load |= this_rq->has_blocked_load; - } - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(this_rq, CPU_IDLE); - - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); - - /* The full idle balance loop has been done */ - ret = true; + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - - return ret; } /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -10501,21 +12684,43 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } -static void nohz_newidle_balance(struct rq *this_rq) +/* + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / IRQ off phase of the local + * cpu about to enter idle, because it can take a long time. + */ +void nohz_run_idle_balance(int cpu) { - int this_cpu = this_rq->cpu; + unsigned int flags; + + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) - return; + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); +} + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) @@ -10526,16 +12731,11 @@ static void nohz_newidle_balance(struct rq *this_rq) time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; - raw_spin_unlock(&this_rq->lock); /* - * This CPU is going to be idle and blocked load of idle CPUs - * need to be updated. Run the ilb locally as it is a good - * candidate for ilb instead of waking up another idle CPU. - * Kick an normal ilb if we failed to do the update. + * Set the need to trigger ILB in order to update blocked load + * before entering idle state. */ - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) - kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); + atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } #else /* !CONFIG_NO_HZ_COMMON */ @@ -10550,7 +12750,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -10558,18 +12758,28 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; + u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; - u64 curr_cost = 0; update_misfit_status(NULL, this_rq); + + /* + * There is a task waiting to run. No need to search for one. + * Return 0; the task will be enqueued when switching to idle. + */ + if (this_rq->ttwu_pending) + return 0; + /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -10587,82 +12797,83 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + + if (!get_rd_overloaded(this_rq->rd) || + (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); - nohz_newidle_balance(this_rq); - goto out; } + rcu_read_unlock(); - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); + + t0 = sched_clock_cpu(this_cpu); + sched_balance_update_blocked_averages(this_cpu); - update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; + u64 domain_cost; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; - } if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; + t0 = t1; } - update_next_balance(sd, &next_balance); - /* * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; -out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + pulled_task = -1; + +out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - if (pulled_task) this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); rq_repin_lock(this_rq, rf); @@ -10670,19 +12881,21 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local sched_tick() for periodic load balancing + * + * - indirectly from a remote sched_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -10691,17 +12904,20 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_update_blocked_averages(this_rq->cpu); + sched_balance_domains(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10723,10 +12939,140 @@ static void rq_offline_fair(struct rq *rq) /* Ensure any throttled groups are reachable by pick_next_task */ unthrottle_offline_cfs_rqs(rq); + + /* Ensure that we remove rq contribution to group share: */ + clear_tg_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this CPU and the forced idle CPU. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle CPU has at least + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the CPU. + */ + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + struct rq *rq = task_rq(a); + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} + +static int task_is_throttled_fair(struct task_struct *p, int cpu) +{ + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq = task_group(p)->cfs_rq[cpu]; +#else + cfs_rq = &cpu_rq(cpu)->cfs; +#endif + return throttled_hierarchy(cfs_rq); +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -10749,7 +13095,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); } /* @@ -10759,33 +13107,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) { - update_curr(cfs_rq); - se->vruntime = curr->vruntime; - } - place_entity(cfs_rq, se, 1); - - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_curr(rq); - } - - se->vruntime -= cfs_rq->min_vruntime; - rq_unlock(rq, &rf); + set_task_max_allowed_capacity(p); } /* @@ -10798,7 +13120,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -10806,39 +13128,11 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); -} - -static inline bool vruntime_normalized(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - - /* - * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, - * the dequeue_entity(.flags=0) will already have normalized the - * vruntime. - */ - if (p->on_rq) - return true; - - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - A forked child which is waiting for being woken up by - * wake_up_new_task(). - * - A task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - */ - if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) - return true; - - return false; + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10848,7 +13142,13 @@ static inline bool vruntime_normalized(struct task_struct *p) */ static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ se = se->parent; @@ -10856,10 +13156,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(cfs_rq, se, UPDATE_TG); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } #else @@ -10870,6 +13173,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); @@ -10881,14 +13195,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -10899,16 +13205,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -10916,12 +13212,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -10931,27 +13223,26 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { + SCHED_WARN_ON(p->se.sched_delayed); + attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current_donor(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; @@ -10964,6 +13255,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) list_move(&se->group_node, &rq->cfs_tasks); } #endif + if (!first) + return; + + SCHED_WARN_ON(se->sched_delayed); + + if (hrtick_enabled_fair(rq)) + hrtick_start_fair(rq, p); + + update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); +} + +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_entity *se = &p->se; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10972,60 +13284,43 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } + + __set_next_task_fair(rq, p, first); } void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -11052,7 +13347,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -11060,7 +13355,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; @@ -11098,26 +13393,35 @@ void online_fair_sched_group(struct task_group *tg) void unregister_fair_sched_group(struct task_group *tg) { - unsigned long flags; - struct rq *rq; int cpu; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(cpu) { - if (tg->se[cpu]) - remove_entity_load_avg(tg->se[cpu]); + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct rq *rq = cpu_rq(cpu); + + if (se) { + if (se->sched_delayed) { + guard(rq_lock_irqsave)(rq); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + list_del_leaf_cfs_rq(cfs_rq); + } + remove_entity_load_avg(se); + } /* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added. */ - if (!tg->cfs_rq[cpu]->on_list) - continue; - - rq = cpu_rq(cpu); - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + if (cfs_rq->on_list) { + guard(rq_lock_irqsave)(rq); + list_del_leaf_cfs_rq(cfs_rq); + } } } @@ -11154,10 +13458,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11166,9 +13472,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11186,22 +13491,87 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: - mutex_unlock(&shares_mutex); return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +int sched_group_set_shares(struct task_group *tg, unsigned long shares) { - return 1; + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; } -void online_fair_sched_group(struct task_group *tg) { } +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->h_nr_idle += idle_task_delta; -void unregister_fair_sched_group(struct task_group *tg) { } + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + + mutex_unlock(&shares_mutex); + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -11216,7 +13586,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + rr_interval = NS_TO_JIFFIES(se->slice); return rr_interval; } @@ -11231,8 +13601,9 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .wakeup_preempt = check_preempt_wakeup_fair, + .pick_task = pick_task_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, @@ -11246,12 +13617,13 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, @@ -11264,6 +13636,10 @@ DEFINE_SCHED_CLASS(fair) = { .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_fair, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif @@ -11308,111 +13684,27 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); - -#ifdef CONFIG_NO_HZ_COMMON - nohz.next_balance = jiffies; - nohz.next_blocked = jiffies; - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); -#endif -#endif /* SMP */ - -} + int i; -/* - * Helper functions to facilitate extracting info from tracepoints. - */ + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), + GFP_KERNEL, cpu_to_node(i)); -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); #endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; } - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; +#ifdef CONFIG_NO_HZ_COMMON + nohz.next_balance = jiffies; + nohz.next_blocked = jiffies; + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); +#endif /* SMP */ -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; } -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); |