summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c2227
1 files changed, 1271 insertions, 956 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbdca89c677f..da46c3164537 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,6 +37,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>
+#include <linux/sched/prio.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -51,6 +52,8 @@
#include <asm/switch_to.h>
+#include <uapi/linux/sched/types.h>
+
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
@@ -71,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_base_slice = 750000ULL;
-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -85,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str)
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
-#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
@@ -108,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu)
* (default: ~5%)
*/
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -130,7 +131,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_fair_sysctls[] = {
+static const struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -159,7 +160,7 @@ static int __init sched_fair_sysctl_init(void)
return 0;
}
late_initcall(sched_fair_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -396,7 +397,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+ WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue */
@@ -468,7 +469,7 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -514,7 +515,7 @@ static int se_is_idle(struct sched_entity *se)
return task_has_idle_policy(task_of(se));
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -523,7 +524,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@@ -532,7 +533,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@@ -553,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -605,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -670,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
/*
@@ -689,21 +690,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- vlag = avruntime - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
- return clamp(vlag, -limit, limit);
-}
+ WARN_ON_ONCE(!se->on_rq);
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- SCHED_WARN_ON(!se->on_rq);
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+ se->vlag = clamp(vlag, -limit, limit);
}
/*
@@ -736,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -744,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
-{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
-
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
-
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
+ avg_vruntime_update(cfs_rq, delta);
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -852,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -863,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -886,6 +856,47 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we use the shortest slice of the enqueued
+ * entities to set the protected period.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
+ */
+static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = normalized_sysctl_sched_base_slice;
+ u64 vprot = se->deadline;
+
+ if (sched_feat(RUN_TO_PARITY))
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ slice = min(slice, se->slice);
+ if (slice != se->slice)
+ vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+ se->vprot = vprot;
+}
+
+static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = cfs_rq_min_slice(cfs_rq);
+
+ se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
+}
+
+static inline bool protect_slice(struct sched_entity *se)
+{
+ return ((s64)(se->vprot - se->vruntime) > 0);
+}
+
+static inline void cancel_protect_slice(struct sched_entity *se)
+{
+ if (protect_slice(se))
+ se->vprot = se->vruntime;
+}
+
+/*
* Earliest Eligible Virtual Deadline First
*
* In order to provide latency guarantees for different request sizes
@@ -904,7 +915,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -915,17 +926,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
* We can safely skip eligibility check if there is only one entity
* in this cfs_rq, saving some cycles.
*/
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
+ /*
+ * Picking the ->next buddy will affect latency but not fairness.
+ */
+ if (sched_feat(PICK_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
+ return cfs_rq->next;
+ }
+
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- /*
- * Once selected, run a task until it either becomes non-eligible or
- * until it gets a new slice. See the HACK in set_next_entity().
- */
- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ if (curr && protect && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -969,7 +986,11 @@ found:
return best;
}
-#ifdef CONFIG_SCHED_DEBUG
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ return __pick_eevdf(cfs_rq, true);
+}
+
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -983,7 +1004,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
@@ -995,8 +1015,6 @@ int sched_update_scaling(void)
return 0;
}
-#endif
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -1029,7 +1047,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#include "pelt.h"
-#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -1119,34 +1136,40 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
-#else /* !CONFIG_SMP */
-void init_entity_runnable_average(struct sched_entity *se)
-{
-}
-void post_init_entity_util_avg(struct task_struct *p)
-{
-}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
s64 delta_exec;
- delta_exec = now - curr->exec_start;
+ delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return delta_exec;
- curr->exec_start = now;
- curr->sum_exec_runtime += delta_exec;
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
+
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
+
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
+ }
if (schedstat_enabled()) {
struct sched_statistics *stats;
- stats = __schedstats_from_se(curr);
+ stats = __schedstats_from_se(se);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
@@ -1154,60 +1177,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
return delta_exec;
}
-static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
-{
- trace_sched_stat_runtime(p, delta_exec);
- account_group_exec_runtime(p, delta_exec);
- cgroup_account_cputime(p, delta_exec);
- if (p->dl_server)
- dl_server_update(p->dl_server, delta_exec);
-}
-
-static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (curr->vlag == curr->deadline)
- return false;
-
- return !entity_eligible(cfs_rq, curr);
-}
-
-static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
- struct sched_entity *pse, struct sched_entity *se)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (pse->slice >= se->slice)
- return false;
-
- if (!entity_eligible(cfs_rq, pse))
- return false;
-
- if (entity_before(pse, se))
- return true;
-
- if (!entity_eligible(cfs_rq, se))
- return true;
-
- return false;
-}
+static void set_next_buddy(struct sched_entity *se);
/*
* Used by other classes to account runtime.
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *donor = rq->donor;
- s64 delta_exec;
-
- delta_exec = update_curr_se(rq, &donor->se);
- if (likely(delta_exec > 0))
- update_curr_task(donor, delta_exec);
-
- return delta_exec;
+ return update_se(rq, &rq->donor->se);
}
/*
@@ -1215,6 +1192,12 @@ s64 update_curr_common(struct rq *rq)
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+ * Note: cfs_rq->curr corresponds to the task picked to
+ * run (ie: rq->donor.se) which due to proxy-exec may
+ * not necessarily be the actual task running
+ * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
@@ -1223,34 +1206,33 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq, curr);
+ delta_exec = update_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
- struct task_struct *p = task_of(curr);
-
- update_curr_task(p, delta_exec);
-
/*
- * Any fair task that runs outside of fair_server should
- * account against fair_server such that it can account for
- * this time and possibly avoid running this period.
+ * If the fair_server is active, we need to account for the
+ * fair_server time whether or not the task is running on
+ * behalf of fair_server or not:
+ * - If the task is running on behalf of fair_server, we need
+ * to limit its time based on the assigned runtime.
+ * - Fair task that runs outside of fair_server should account
+ * against fair_server such that it can account for this time
+ * and possibly avoid running this period.
*/
- if (p->dl_server != &rq->fair_server)
- dl_server_update(&rq->fair_server, delta_exec);
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return;
- if (resched || did_preempt_short(cfs_rq, curr)) {
+ if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
@@ -1497,7 +1479,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
* by the PTE scanner and NUMA hinting faults should be trapped based
* on resident pages
*/
- nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
rss = get_mm_rss(p->mm);
if (!rss)
rss = nr_scan_pages;
@@ -1925,17 +1907,18 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
struct pglist_data *pgdat;
unsigned long rate_limit;
unsigned int latency, th, def_th;
+ long nr = folio_nr_pages(folio);
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat)) {
/* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0;
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
return true;
}
def_th = sysctl_numa_balancing_hot_threshold;
- rate_limit = sysctl_numa_balancing_promote_rate_limit << \
- (20 - PAGE_SHIFT);
+ rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
@@ -1943,8 +1926,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
if (latency >= th)
return false;
- return !numa_promotion_rate_limit(pgdat, rate_limit,
- folio_nr_pages(folio));
+ return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
@@ -2099,12 +2081,12 @@ static inline int numa_idle_core(int idle_core, int cpu)
return idle_core;
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline int numa_idle_core(int idle_core, int cpu)
{
return idle_core;
}
-#endif
+#endif /* !CONFIG_SCHED_SMT */
/*
* Gather all necessary information to make NUMA balancing placement
@@ -2128,7 +2110,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_running;
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -2258,7 +2240,8 @@ static bool task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
- if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+ if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
+ !cur->mm))
cur = NULL;
/*
@@ -3300,7 +3283,7 @@ static void task_numa_work(struct callback_head *work)
bool vma_pids_skipped;
bool vma_pids_forced = false;
- SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work;
/*
@@ -3314,6 +3297,15 @@ static void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+ * Memory is pinned to only one NUMA node via cpuset.mems, naturally
+ * no page can be migrated.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
@@ -3399,11 +3391,17 @@ retry_pids:
/* Initialise new per-VMA NUMAB state. */
if (!vma->numab_state) {
- vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
- GFP_KERNEL);
- if (!vma->numab_state)
+ struct vma_numab_state *ptr;
+
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
continue;
+ if (cmpxchg(&vma->numab_state, NULL, ptr)) {
+ kfree(ptr);
+ continue;
+ }
+
vma->numab_state->start_scan_seq = mm->numa_scan_seq;
vma->numab_state->next_scan = now +
@@ -3528,7 +3526,7 @@ out:
}
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+void init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
@@ -3642,7 +3640,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
+
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
@@ -3659,38 +3658,30 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
-#endif
- cfs_rq->nr_running++;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running++;
+ cfs_rq->nr_queued++;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
-#endif
- cfs_rq->nr_running--;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running--;
+ cfs_rq->nr_queued--;
}
/*
@@ -3741,7 +3732,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
-#ifdef CONFIG_SMP
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3758,169 +3748,50 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-#else
-static inline void
-enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-#endif
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
- unsigned long weight)
-{
- unsigned long old_weight = se->load.weight;
- s64 vlag, vslice;
-
- /*
- * VRUNTIME
- * --------
- *
- * COROLLARY #1: The virtual runtime of the entity needs to be
- * adjusted if re-weight at !0-lag point.
- *
- * Proof: For contradiction assume this is not true, so we can
- * re-weight without changing vruntime at !0-lag point.
- *
- * Weight VRuntime Avg-VRuntime
- * before w v V
- * after w' v' V'
- *
- * Since lag needs to be preserved through re-weight:
- *
- * lag = (V - v)*w = (V'- v')*w', where v = v'
- * ==> V' = (V - v)*w/w' + v (1)
- *
- * Let W be the total weight of the entities before reweight,
- * since V' is the new weighted average of entities:
- *
- * V' = (WV + w'v - wv) / (W + w' - w) (2)
- *
- * by using (1) & (2) we obtain:
- *
- * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
- * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
- * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
- * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
- *
- * Since we are doing at !0-lag point which means V != v, we
- * can simplify (3):
- *
- * ==> W / (W + w' - w) = w / w'
- * ==> Ww' = Ww + ww' - ww
- * ==> W * (w' - w) = w * (w' - w)
- * ==> W = w (re-weight indicates w' != w)
- *
- * So the cfs_rq contains only one entity, hence vruntime of
- * the entity @v should always equal to the cfs_rq's weighted
- * average vruntime @V, which means we will always re-weight
- * at 0-lag point, thus breach assumption. Proof completed.
- *
- *
- * COROLLARY #2: Re-weight does NOT affect weighted average
- * vruntime of all the entities.
- *
- * Proof: According to corollary #1, Eq. (1) should be:
- *
- * (V - v)*w = (V' - v')*w'
- * ==> v' = V' - (V - v)*w/w' (4)
- *
- * According to the weighted average formula, we have:
- *
- * V' = (WV - wv + w'v') / (W - w + w')
- * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
- * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
- * = (WV + w'V' - Vw) / (W - w + w')
- *
- * ==> V'*(W - w + w') = WV + w'V' - Vw
- * ==> V' * (W - w) = (W - w) * V (5)
- *
- * If the entity is the only one in the cfs_rq, then reweight
- * always occurs at 0-lag point, so V won't change. Or else
- * there are other entities, hence W != w, then Eq. (5) turns
- * into V' = V. So V won't change in either case, proof done.
- *
- *
- * So according to corollary #1 & #2, the effect of re-weight
- * on vruntime should be:
- *
- * v' = V' - (V - v) * w / w' (4)
- * = V - (V - v) * w / w'
- * = V - vl * w / w'
- * = V - vl'
- */
- if (avruntime != se->vruntime) {
- vlag = entity_lag(avruntime, se);
- vlag = div_s64(vlag * old_weight, weight);
- se->vruntime = avruntime - vlag;
- }
-
- /*
- * DEADLINE
- * --------
- *
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- *
- * d' = v' + (d - v)*w/w'
- * = V' - (V - v)*w/w' + (d - v)*w/w'
- * = V - (V - v)*w/w' + (d - v)*w/w'
- * = V + (d - V)*w/w'
- */
- vslice = (s64)(se->deadline - avruntime);
- vslice = div_s64(vslice * old_weight, weight);
- se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
- u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- avruntime = avg_vruntime(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (se->on_rq) {
- reweight_eevdf(se, avruntime, weight);
- } else {
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- }
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
-#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
-#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
+ place_entity(cfs_rq, se, 0);
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
+ cfs_rq->nr_queued++;
}
}
@@ -3938,7 +3809,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
@@ -4045,7 +3915,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-#endif /* CONFIG_SMP */
/*
* Recomputes the group entity based on the current state of its group
@@ -4056,26 +3925,23 @@ static void update_cfs_group(struct sched_entity *se)
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- if (!gcfs_rq)
- return;
-
- if (throttled_hierarchy(gcfs_rq))
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
-#ifndef CONFIG_SMP
- shares = READ_ONCE(gcfs_rq->tg->shares);
-#else
shares = calc_group_shares(gcfs_rq);
-#endif
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_cfs_group(struct sched_entity *se)
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
@@ -4100,7 +3966,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
}
-#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
if (sa->load_sum)
@@ -4117,7 +3982,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa)
* Make sure that rounding and/or propagation of PELT values never
* break this.
*/
- SCHED_WARN_ON(sa->load_avg ||
+ WARN_ON_ONCE(sa->load_avg ||
sa->util_avg ||
sa->runnable_avg);
@@ -4142,15 +4007,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
{
struct cfs_rq *prev_cfs_rq;
struct list_head *prev;
+ struct rq *rq = rq_of(cfs_rq);
if (cfs_rq->on_list) {
prev = cfs_rq->leaf_cfs_rq_list.prev;
} else {
- struct rq *rq = rq_of(cfs_rq);
-
prev = rq->tmp_alone_branch;
}
+ if (prev == &rq->leaf_cfs_rq_list)
+ return false;
+
prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
return (prev_cfs_rq->tg->parent == cfs_rq->tg);
@@ -4167,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (child_cfs_rq_on_list(cfs_rq))
return false;
+ if (cfs_rq->tg_load_avg_contrib)
+ return false;
+
return true;
}
@@ -4550,7 +4420,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
return true;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
@@ -4563,7 +4433,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_NO_HZ_COMMON
static inline void migrate_se_pelt_lag(struct sched_entity *se)
@@ -4644,9 +4514,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
__update_load_avg_blocked_se(now, se);
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static void migrate_se_pelt_lag(struct sched_entity *se) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
@@ -5014,13 +4884,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
goto done;
/*
- * To avoid overestimation of actual task utilization, skip updates if
- * we cannot grant there is idle time in this CPU.
- */
- if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
- return;
-
- /*
* To avoid underestimate of task utilization, skip updates of EWMA if
* we cannot grant that thread got all CPU time it wanted.
*/
@@ -5220,48 +5083,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
-#else /* CONFIG_SMP */
-
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- return !cfs_rq->nr_running;
-}
-
-#define UPDATE_TG 0x0
-#define SKIP_AGE_LOAD 0x0
-#define DO_ATTACH 0x0
-#define DO_DETACH 0x0
-
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
+void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
- cfs_rq_util_change(cfs_rq, 0);
-}
-
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
-
-static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+ struct sched_entity *se = &p->se;
-static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
-{
- return 0;
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ se->custom_slice = 1;
+ se->slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ se->custom_slice = 0;
+ se->slice = sysctl_sched_base_slice;
+ }
}
-static inline void
-util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
-static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
-#endif /* CONFIG_SMP */
-
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -5280,7 +5117,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5313,7 +5150,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
* = (W*V + w_i*(V - vl_i)) / (W + w_i)
* = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
* = V - w_i*vl_i / (W + w_i)
*
* And the actual lag after adding an entity with vl_i is:
@@ -5350,7 +5187,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
- if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ if (se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -5373,8 +5210,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-static inline bool cfs_bandwidth_used(void);
-
static void
requeue_delayed_entity(struct sched_entity *se);
@@ -5396,7 +5231,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5429,20 +5264,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
struct rq *rq = rq_of(cfs_rq);
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
+#endif
}
}
@@ -5465,9 +5298,48 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since dequeue_entities()
+ * will account it for blocked tasks.
+ */
+ if (!entity_is_task(se))
+ return;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_runnable--;
+ }
+}
+
+static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since a dequeue has
+ * already accounted for it or an enqueue of a task
+ * below it will account for it in enqueue_task_fair().
+ */
+ if (!entity_is_task(se))
+ return;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_runnable++;
+ }
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
@@ -5476,33 +5348,32 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
+ int action = UPDATE_TG;
update_curr(cfs_rq);
+ clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
- SCHED_WARN_ON(!se->sched_delayed);
+ WARN_ON_ONCE(!se->sched_delayed);
} else {
bool delay = sleep;
/*
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
- SCHED_WARN_ON(delay && se->sched_delayed);
+ WARN_ON_ONCE(delay && se->sched_delayed);
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 1;
+ set_delayed(se);
return false;
}
}
- int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
@@ -5510,7 +5381,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -5520,8 +5391,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_dequeue_fair(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
-
update_entity_lag(cfs_rq, se);
if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
se->deadline -= se->vruntime;
@@ -5538,20 +5407,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5571,15 +5441,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- /*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
- */
- se->vlag = se->deadline;
+
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
- SCHED_WARN_ON(cfs_rq->curr);
+ WARN_ON_ONCE(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5612,17 +5479,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
static struct sched_entity *
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
- /*
- * Enabling NEXT_BUDDY will affect latency but not fairness.
- */
- if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
- /* ->next will never be delayed */
- SCHED_WARN_ON(cfs_rq->next->sched_delayed);
- return cfs_rq->next;
- }
+ struct sched_entity *se;
- struct sched_entity *se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -5654,7 +5513,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
- SCHED_WARN_ON(cfs_rq->curr != prev);
+ WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5708,7 +5567,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -5716,16 +5575,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* CONFIG_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
- return 100000000ULL;
-}
+#endif /* !CONFIG_JUMP_LABEL */
static inline u64 sched_cfs_bandwidth_slice(void)
{
@@ -5835,74 +5685,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
}
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+ * cause @p's group_node to be incorectly re-insterted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * In the above race case, @p current cfs_rq is in the same rq as
+ * its previous cfs_rq because sched_move_task() only moves a task
+ * to a different group from the same rq, so we can use its current
+ * cfs_rq to derive rq and test if the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* we can't take the fast path, do an actual enqueue*/
+ p->throttled = false;
+ return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (--cfs_rq->throttle_count)
+ return 0;
+
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttle_count++)
+ return 0;
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_running)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5910,9 +5939,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5935,77 +5962,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, task_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
- dl_server_stop(&rq->fair_server);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- SCHED_WARN_ON(cfs_rq->throttled_clock);
- if (cfs_rq->nr_running)
- cfs_rq->throttled_clock = rq_clock(rq);
+ WARN_ON_ONCE(cfs_rq->throttled_clock);
return true;
}
@@ -6013,11 +5980,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long task_delta, idle_task_delta;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
- se = cfs_rq->tg->se[cpu_of(rq)];
+ /*
+ * It's possible we are called with runtime_remaining < 0 due to things
+ * like async unthrottled us with a positive runtime_remaining but other
+ * still running entities consumed those runtime before we reached here.
+ *
+ * We can't unthrottle this cfs_rq without any runtime remaining because
+ * any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
+ * which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
cfs_rq->throttled = 0;
@@ -6045,67 +6020,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
- }
-
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
-
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_running && rq->cfs.h_nr_running)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
+ if (rq->curr == rq->idle && rq->cfs.nr_queued)
resched_curr(rq);
}
-#ifdef CONFIG_SMP
static void __cfsb_csd_unthrottle(void *arg)
{
struct cfs_rq *cursor, *tmp;
@@ -6156,7 +6079,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
}
/* Already enqueued */
- if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
return;
first = list_empty(&rq->cfsb_csd_list);
@@ -6164,18 +6087,12 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}
-#else
-static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
-{
- unthrottle_cfs_rq(cfs_rq);
-}
-#endif
static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));
- if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;
@@ -6211,7 +6128,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
goto next;
/* By the above checks, this should never be true */
- SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
@@ -6232,7 +6149,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
* We currently only expect to be unthrottling
* a single cfs_rq locally.
*/
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
list_add_tail(&cfs_rq->throttled_csd_list,
&local_unthrottle);
}
@@ -6257,7 +6174,7 @@ next:
rq_unlock_irqrestore(rq, &rf);
}
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
rcu_read_unlock();
@@ -6402,7 +6319,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -6476,6 +6393,16 @@ static void sync_throttle(struct task_group *tg, int cpu)
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
+
+ /*
+ * It is not enough to sync the "pelt_clock_throttled" indicator
+ * with the parent cfs_rq when the hierarchy is not queued.
+ * Always join a throttled hierarchy with PELT clock throttled
+ * and leaf it to the first enqueue, or distribution to
+ * unthrottle the PELT clock.
+ */
+ if (cfs_rq->throttle_count)
+ cfs_rq->pelt_clock_throttled = 1;
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6507,8 +6434,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-extern const u64 max_cfs_quota_period;
-
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -6535,7 +6460,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
* to fail.
*/
new = old * 2;
- if (new < max_cfs_quota_period) {
+ if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
@@ -6569,19 +6494,19 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->period = us_to_ktime(default_bw_period_us());
cfs_b->burst = 0;
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
- cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
/* Add a random offset so that timers interleave */
hrtimer_set_expires(&cfs_b->period_timer,
get_random_u32_below(cfs_b->period));
- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
cfs_b->slack_started = false;
}
@@ -6590,6 +6515,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6625,7 +6551,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* guaranteed at this point that no additional cfs_rq of this group can
* join a CSD list.
*/
-#ifdef CONFIG_SMP
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
unsigned long flags;
@@ -6637,7 +6562,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
__cfsb_csd_unthrottle(rq);
local_irq_restore(flags);
}
-#endif
}
/*
@@ -6673,6 +6597,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ // Do not unthrottle for an active CPU
+ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
+ return;
+
/*
* The rq clock has already been updated in the
* set_rq_offline(), so we should skip updating
@@ -6688,18 +6616,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
continue;
/*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = 1;
- /*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ if (!cfs_rq_throttled(cfs_rq))
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
@@ -6744,33 +6674,37 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#endif
+#endif /* CONFIG_NO_HZ_FULL */
-#else /* CONFIG_CFS_BANDWIDTH */
-
-static inline bool cfs_bandwidth_used(void)
-{
- return false;
-}
+#else /* !CONFIG_CFS_BANDWIDTH: */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
@@ -6793,7 +6727,7 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
#endif
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* !CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
@@ -6808,9 +6742,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- SCHED_WARN_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
u64 slice = se->slice;
s64 delta = slice - ran;
@@ -6838,7 +6772,7 @@ static void hrtick_update(struct rq *rq)
hrtick_start_fair(rq, donor);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
@@ -6847,9 +6781,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static inline void hrtick_update(struct rq *rq)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
unsigned long rq_util_min, rq_util_max;
@@ -6891,23 +6824,18 @@ static inline void check_update_overutilized_status(struct rq *rq)
if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
set_rd_overutilized(rq->rd, 1);
}
-#else
-static inline void check_update_overutilized_status(struct rq *rq) { }
-#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
-#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
-#endif
static void
requeue_delayed_entity(struct sched_entity *se)
@@ -6919,25 +6847,25 @@ requeue_delayed_entity(struct sched_entity *se)
* Because a delayed entity is one that is still on
* the runqueue competing until elegibility.
*/
- SCHED_WARN_ON(!se->sched_delayed);
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->sched_delayed);
+ WARN_ON_ONCE(!se->on_rq);
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
if (se->vlag > 0) {
- cfs_rq->nr_running--;
+ cfs_rq->nr_queued--;
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->vlag = 0;
place_entity(cfs_rq, se, 0);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
- cfs_rq->nr_running++;
+ cfs_rq->nr_queued++;
}
}
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 0;
+ clear_delayed(se);
}
/*
@@ -6950,18 +6878,22 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_idle = task_has_idle_policy(p);
+ int h_nr_runnable = 1;
int task_new = !(flags & ENQUEUE_WAKEUP);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
@@ -6977,6 +6909,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+ if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;
+
for_each_sched_entity(se) {
if (se->on_rq) {
if (se->sched_delayed)
@@ -6997,15 +6932,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
+ h_nr_idle = 1;
flags = ENQUEUE_WAKEUP;
}
@@ -7018,25 +6950,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
+ h_nr_idle = 1;
}
- if (!rq_h_nr_running && rq->cfs.h_nr_running) {
- /* Account for idle runtime */
- if (!rq->nr_running)
- dl_server_update_idle_time(rq, rq->curr);
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
- }
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
@@ -7058,14 +6985,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
* Basically dequeue_task_fair(), except it can deal with dequeue_entity()
* failing half-way through and resume the dequeue later.
@@ -7078,22 +7002,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_running = rq->cfs.h_nr_running;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
- int idle_h_nr_running = 0;
- int h_nr_running = 0;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
if (entity_is_task(se)) {
p = task_of(se);
- h_nr_running = 1;
- idle_h_nr_running = task_has_idle_policy(p);
- } else {
- cfs_rq = group_cfs_rq(se);
- slice = cfs_rq_min_slice(cfs_rq);
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
}
for_each_sched_entity(se) {
@@ -7103,18 +7027,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (p && &p->se == se)
return -1;
+ slice = cfs_rq_min_slice(cfs_rq);
break;
}
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7126,7 +7051,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ if (task_sleep && se)
set_next_buddy(se);
break;
}
@@ -7142,31 +7067,30 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
- sub_nr_running(rq, h_nr_running);
-
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
- dl_server_stop(&rq->fair_server);
+ sub_nr_running(rq, h_nr_queued);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
if (p && task_delayed) {
- SCHED_WARN_ON(!task_sleep);
- SCHED_WARN_ON(p->on_rq != 1);
+ WARN_ON_ONCE(!task_sleep);
+ WARN_ON_ONCE(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
@@ -7189,7 +7113,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
+ if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
@@ -7204,7 +7133,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
return true;
}
-#ifdef CONFIG_SMP
+static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
+{
+ return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
+}
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7365,8 +7297,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
- if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return this_cpu;
+ if (sync) {
+ struct rq *rq = cpu_rq(this_cpu);
+
+ if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
+ return this_cpu;
+ }
if (available_idle_cpu(prev_cpu))
return prev_cpu;
@@ -7671,7 +7607,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
}
-#else /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT: */
static inline void set_idle_cores(int cpu, int val)
{
@@ -7692,7 +7628,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
return -1;
}
-#endif /* CONFIG_SCHED_SMT */
+#endif /* !CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
@@ -8729,22 +8665,10 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
set_task_max_allowed_capacity(p);
}
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- if (sched_fair_runnable(rq))
- return 1;
-
- return sched_balance_newidle(rq, rf) != 0;
-}
-#else
-static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
-#endif /* CONFIG_SMP */
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
+ if (WARN_ON_ONCE(!se->on_rq))
return;
if (se_is_idle(se))
return;
@@ -8752,11 +8676,77 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+enum preempt_wakeup_action {
+ PREEMPT_WAKEUP_NONE, /* No preemption. */
+ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
+ PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
+ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
+};
+
+static inline bool
+set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ /*
+ * Keep existing buddy if the deadline is sooner than pse.
+ * The older buddy may be cache cold and completely unrelated
+ * to the current wakeup but that is unpredictable where as
+ * obeying the deadline is more in line with EEVDF objectives.
+ */
+ if (cfs_rq->next && entity_before(cfs_rq->next, pse))
+ return false;
+
+ set_next_buddy(pse);
+ return true;
+}
+
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ u64 threshold, delta;
+
+ /*
+ * WF_SYNC without WF_TTWU is not expected so warn if it happens even
+ * though it is likely harmless.
+ */
+ WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+
+ threshold = sysctl_sched_migration_cost;
+ delta = rq_clock_task(rq) - se->exec_start;
+ if ((s64)delta < 0)
+ delta = 0;
+
+ /*
+ * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
+ * could run on other CPUs. Reduce the threshold before preemption is
+ * allowed to an arbitrary lower value as it is more likely (but not
+ * guaranteed) the waker requires the wakee to finish.
+ */
+ if (wake_flags & WF_RQ_SELECTED)
+ threshold >>= 2;
+
+ /*
+ * As WF_SYNC is not strictly obeyed, allow some runtime for batch
+ * wakeups to be issued.
+ */
+ if (entity_before(pse, se) && delta >= threshold)
+ return PREEMPT_WAKEUP_RESCHED;
+
+ return PREEMPT_WAKEUP_NONE;
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
@@ -8771,13 +8761,9 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
- if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
- set_next_buddy(pse);
- }
-
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
@@ -8804,8 +8790,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
- if (cse_is_idle && !pse_is_idle)
+ if (cse_is_idle && !pse_is_idle) {
+ /*
+ * When non-idle entity preempt an idle entity,
+ * don't give idle entity slice protection.
+ */
+ preempt_action = PREEMPT_WAKEUP_SHORT;
goto preempt;
+ }
+
if (cse_is_idle != pse_is_idle)
return;
@@ -8820,42 +8813,94 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
/*
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
- *
- * Note that even if @p does not turn out to be the most eligible
- * task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
- se->vlag = se->deadline + 1;
+ if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
+ preempt_action = PREEMPT_WAKEUP_SHORT;
+ goto pick;
+ }
+
+ /*
+ * Ignore wakee preemption on WF_FORK as it is less likely that
+ * there is shared data as exec often follow fork. Do not
+ * preempt for tasks that are sched_delayed as it would violate
+ * EEVDF to forcibly queue an ineligible task.
+ */
+ if ((wake_flags & WF_FORK) || pse->sched_delayed)
+ return;
+
+ /*
+ * If @p potentially is completing work required by current then
+ * consider preemption.
+ *
+ * Reschedule if waker is no longer eligible. */
+ if (in_task() && !entity_eligible(cfs_rq, se)) {
+ preempt_action = PREEMPT_WAKEUP_RESCHED;
+ goto preempt;
+ }
+ /* Prefer picking wakee soon if appropriate. */
+ if (sched_feat(NEXT_BUDDY) &&
+ set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
+
+ /*
+ * Decide whether to obey WF_SYNC hint for a new buddy. Old
+ * buddies are ignored as they may not be relevant to the
+ * waker and less likely to be cache hot.
+ */
+ if (wake_flags & WF_SYNC)
+ preempt_action = preempt_sync(rq, wake_flags, pse, se);
+ }
+
+ switch (preempt_action) {
+ case PREEMPT_WAKEUP_NONE:
+ return;
+ case PREEMPT_WAKEUP_RESCHED:
+ goto preempt;
+ case PREEMPT_WAKEUP_SHORT:
+ fallthrough;
+ case PREEMPT_WAKEUP_PICK:
+ break;
+ }
+
+pick:
/*
* If @p has become the most eligible task, force preemption.
*/
- if (pick_eevdf(cfs_rq) == pse)
+ if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
goto preempt;
+ if (sched_feat(RUN_TO_PARITY))
+ update_protect_slice(cfs_rq, se);
+
return;
preempt:
+ if (preempt_action == PREEMPT_WAKEUP_SHORT)
+ cancel_protect_slice(se);
+
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
- if (!cfs_rq->nr_running)
+ if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8863,7 +8908,10 @@ again:
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8877,7 +8925,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- p = pick_task_fair(rq);
+ p = pick_task_fair(rq, rf);
if (!p)
goto idle;
se = &p->se;
@@ -8926,26 +8974,26 @@ again:
return p;
simple:
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
put_prev_set_next_task(rq, prev, p);
return p;
idle:
- if (!rf)
- return NULL;
-
- new_tasks = sched_balance_newidle(rq, rf);
+ if (rf) {
+ new_tasks = sched_balance_newidle(rq, rf);
- /*
- * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- * must re-start the pick_next_entity() loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
+ /*
+ * Because sched_balance_newidle() releases (and re-acquires)
+ * rq->lock, it is possible for any higher priority task to
+ * appear. In that case we must re-start the pick_next_entity()
+ * loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
- if (new_tasks > 0)
- goto again;
+ if (new_tasks > 0)
+ goto again;
+ }
/*
* rq is about to be idle, check if we need to update the
@@ -8956,19 +9004,10 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
- return pick_next_task_fair(rq, prev, NULL);
-}
-
-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
- return !!dl_se->rq->cfs.nr_running;
-}
-
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
-{
- return pick_task_fair(dl_se->rq);
+ return pick_task_fair(dl_se->rq, rf);
}
void fair_server_init(struct rq *rq)
@@ -8977,7 +9016,7 @@ void fair_server_init(struct rq *rq)
init_dl_entity(dl_se);
- dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+ dl_server_init(dl_se, rq, fair_server_pick_task);
}
/*
@@ -8999,7 +9038,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
*/
static void yield_task_fair(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr = rq->donor;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
@@ -9023,15 +9062,26 @@ static void yield_task_fair(struct rq *rq)
*/
rq_clock_skip_update(rq);
- se->deadline += calc_delta_fair(se->slice, se);
+ /*
+ * Forfeit the remaining vruntime, only if the entity is eligible. This
+ * condition is necessary because in core scheduling we prefer to run
+ * ineligible tasks rather than force idling. If this happens we may
+ * end up in a loop where the core scheduler picks the yielding task,
+ * which yields immediately again; without the condition the vruntime
+ * ends up quickly running away.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ se->vruntime = se->deadline;
+ se->deadline += calc_delta_fair(se->slice, se);
+ }
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
/* Tell the scheduler that we'd really like se to run next. */
@@ -9042,7 +9092,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
return true;
}
-#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
@@ -9294,43 +9343,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality i.e migration preferred.
*/
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
- return -1;
+ return 0;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return -1;
+ return 0;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return -1;
+ return 0;
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
- return -1;
+ return 0;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return 0;
+ return -1;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
- return -1;
+ return 0;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
@@ -9341,16 +9390,40 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
dst_weight = task_weight(p, dst_nid, dist);
}
- return dst_weight < src_weight;
+ return src_weight - dst_weight;
}
-#else
-static inline int migrate_degrades_locality(struct task_struct *p,
+#else /* !CONFIG_NUMA_BALANCING: */
+static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return -1;
+ return 0;
}
+#endif /* !CONFIG_NUMA_BALANCING */
+
+/*
+ * Check whether the task is ineligible on the destination cpu
+ *
+ * When the PLACE_LAG scheduling feature is enabled and
+ * dst_cfs_rq->nr_queued is greater than 1, if the task
+ * is ineligible, it will also be ineligible when
+ * it is migrated to the destination cpu.
+ */
+static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
+{
+ struct cfs_rq *dst_cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+#else
+ dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
#endif
+ if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ !entity_eligible(task_cfs_rq(p), &p->se))
+ return 1;
+
+ return 0;
+}
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -9358,24 +9431,44 @@ static inline int migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot;
+ long degrades, hot;
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ */
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;
+
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
+ /*
+ * We want to prioritize the migration of eligible tasks.
+ * For ineligible tasks we soft-limit them and only allow
+ * them to migrate when nr_balance_failed is non-zero to
+ * avoid load-balancing trying very hard to balance the load.
*/
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (!env->sd->nr_balance_failed &&
+ task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
return 0;
/* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
+ if (task_is_blocked(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -9398,12 +9491,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
+ cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
+
+ if (cpu < nr_cpu_ids) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
}
return 0;
@@ -9412,7 +9504,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_on_cpu(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -9427,16 +9520,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->flags & LBF_ACTIVE_LB)
return 1;
- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;
- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->stats.nr_forced_migrations);
- }
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}
@@ -9451,6 +9543,15 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }
+
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9611,6 +9712,9 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
+
list_move(&p->se.group_node, tasks);
}
@@ -9712,12 +9816,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
if (!has_blocked)
rq->has_blocked_load = 0;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -9753,7 +9857,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
@@ -9826,7 +9930,7 @@ static unsigned long task_h_load(struct task_struct *p)
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -9843,7 +9947,7 @@ static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void sched_balance_update_blocked_averages(int cpu)
{
@@ -9988,9 +10092,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -10003,7 +10107,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
@@ -10210,7 +10314,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
(sgs->group_weight - sgs->idle_cpus != 1))
return false;
- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
}
/* One group has more than one SMT CPU while the other group does not */
@@ -10285,7 +10389,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_running != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;
return check_cpu_capacity(rq, sd);
@@ -10307,7 +10411,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *sg_overloaded,
bool *sg_overutilized)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, sd_flags = env->sd->flags;
+ bool balancing_at_rd = !env->sd->parent;
memset(sgs, 0, sizeof(*sgs));
@@ -10320,21 +10425,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
- if (nr_running > 1)
- *sg_overloaded = 1;
-
if (cpu_overutilized(i))
*sg_overutilized = 1;
-#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10344,10 +10442,21 @@ static inline void update_sg_lb_stats(struct lb_env *env,
continue;
}
+ /* Overload indicator is only updated at root domain */
+ if (balancing_at_rd && nr_running > 1)
+ *sg_overloaded = 1;
+
+#ifdef CONFIG_NUMA_BALANCING
+ /* Only fbq_classify_group() uses this to classify NUMA groups */
+ if (sd_flags & SD_NUMA) {
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+ }
+#endif
if (local_group)
continue;
- if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ if (sd_flags & SD_ASYM_CPUCAPACITY) {
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -10442,7 +10551,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
+ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
+ READ_ONCE(sg->asym_prefer_cpu));
case group_misfit_task:
/*
@@ -10550,7 +10660,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
return remote;
return all;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
@@ -10560,7 +10670,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
@@ -10627,7 +10737,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
- for_each_cpu(i, sched_group_span(group)) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -10635,7 +10745,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11417,7 +11527,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- nr_running = rq->cfs.h_nr_running;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;
@@ -11576,7 +11686,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -11656,6 +11766,43 @@ static int should_we_balance(struct lb_env *env)
return group_balance_cpu(sg) == env->dst_cpu;
}
+static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
+}
+
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -11681,6 +11828,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ bool need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -11692,6 +11840,14 @@ redo:
goto out_balanced;
}
+ if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
+ int zero = 0;
+ if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
+ goto out_balanced;
+
+ need_unlock = true;
+ }
+
group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
@@ -11706,7 +11862,7 @@ redo:
WARN_ON_ONCE(busiest == env.dst_rq);
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -11932,6 +12088,9 @@ out_one_pinned:
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
+ if (need_unlock)
+ atomic_set_release(&sched_balance_running, 0);
+
return ld_moved;
}
@@ -12057,21 +12216,6 @@ out_unlock:
}
/*
- * This flag serializes load-balancing passes over large domains
- * (above the NODE topology level) - only one load-balancing instance
- * may run at a time, to reduce overhead on very large systems with
- * lots of CPUs and large NUMA distances.
- *
- * - Note that load-balancing passes triggered while another one
- * is executing are skipped and not re-tried.
- *
- * - Also note that this does not serialize rebalance_domains()
- * execution, as non-SD_SERIALIZE domains will still be
- * load-balanced in parallel.
- */
-static atomic_t sched_balance_running = ATOMIC_INIT(0);
-
-/*
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
@@ -12080,24 +12224,43 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+{
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
+ if (cost)
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
*/
sd->max_newidle_lb_cost = cost;
- sd->last_decay_max_lb_cost = jiffies;
- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ sd->last_decay_max_lb_cost = now;
+
+ } else if (time_after(now, next_decay)) {
/*
* Decay the newidle max times by ~1% per second to ensure that
* it is not outdated and the current max cost is actually
* shorter.
*/
sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
- sd->last_decay_max_lb_cost = jiffies;
-
+ sd->last_decay_max_lb_cost = now;
return true;
}
@@ -12120,7 +12283,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
+ int need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
@@ -12129,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12144,13 +12307,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
}
interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
- goto out;
- }
-
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
@@ -12164,9 +12320,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
- if (need_serialize)
- atomic_set_release(&sched_balance_running, 0);
-out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
@@ -12204,16 +12357,13 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- *
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
- * anywhere yet.
*/
static inline int find_new_ilb(void)
{
const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
@@ -12231,7 +12381,8 @@ static inline int find_new_ilb(void)
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
* SMP function call (IPI).
*
- * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -12319,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12409,7 +12560,7 @@ unlock:
void nohz_balance_exit_idle(struct rq *rq)
{
- SCHED_WARN_ON(rq != this_rq());
+ WARN_ON_ONCE(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
@@ -12445,16 +12596,12 @@ void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- SCHED_WARN_ON(cpu != smp_processor_id());
+ WARN_ON_ONCE(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
return;
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
- return;
-
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@@ -12532,7 +12679,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
int balance_cpu;
struct rq *rq;
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
@@ -12568,7 +12715,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* work being done for other CPUs. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
+ if (!idle_cpu(this_cpu) && need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
if (flags & NOHZ_NEXT_KICK)
@@ -12674,13 +12821,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
- /*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
- */
- if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
- return;
-
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -12697,7 +12837,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
@@ -12706,7 +12846,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* sched_balance_newidle is called by schedule() if this_cpu is about to become
@@ -12758,18 +12898,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();
+ rq_modified_clear(this_rq);
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12785,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12792,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
@@ -12817,11 +12980,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -12842,9 +13005,9 @@ out:
/*
* This softirq handler is triggered via SCHED_SOFTIRQ from two places:
*
- * - directly from the local scheduler_tick() for periodic load balancing
+ * - directly from the local sched_tick() for periodic load balancing
*
- * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
* through the SMP cross-call nohz_csd_func()
*/
static __latent_entropy void sched_balance_softirq(void)
@@ -12903,8 +13066,6 @@ static void rq_offline_fair(struct rq *rq)
clear_tg_offline_cfs_rqs(rq);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_SCHED_CORE
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
@@ -12935,13 +13096,176 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * Consider any infeasible weight scenario. Take for instance two tasks,
+ * each bound to their respective sibling, one with weight 1 and one with
+ * weight 2. Then the lower weight task will run ahead of the higher weight
+ * task without bound.
+ *
+ * This utterly destroys the concept of a shared time base.
+ *
+ * Remember; all this is about a proportionally fair scheduling, where each
+ * tasks receives:
+ *
+ * w_i
+ * dt_i = ---------- dt (1)
+ * \Sum_j w_j
+ *
+ * which we do by tracking a virtual time, s_i:
+ *
+ * 1
+ * s_i = --- d[t]_i (2)
+ * w_i
+ *
+ * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
+ * The immediate corollary is that the ideal schedule S, where (2) to use
+ * an infinitesimal delta, is:
+ *
+ * 1
+ * S = ---------- dt (3)
+ * \Sum_i w_i
+ *
+ * From which we can define the lag, or deviation from the ideal, as:
+ *
+ * lag(i) = S - s_i (4)
+ *
+ * And since the one and only purpose is to approximate S, we get that:
+ *
+ * \Sum_i w_i lag(i) := 0 (5)
+ *
+ * If this were not so, we no longer converge to S, and we can no longer
+ * claim our scheduler has any of the properties we derive from S. This is
+ * exactly what you did above, you broke it!
+ *
+ *
+ * Let's continue for a while though; to see if there is anything useful to
+ * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
+ *
+ * \Sum_i w_i s_i
+ * S = -------------- (6)
+ * \Sum_i w_i
+ *
+ * Which gives us a way to compute S, given our s_i. Now, if you've read
+ * our code, you know that we do not in fact do this, the reason for this
+ * is two-fold. Firstly, computing S in that way requires a 64bit division
+ * for every time we'd use it (see 12), and secondly, this only describes
+ * the steady-state, it doesn't handle dynamics.
+ *
+ * Anyway, in (6): s_i -> x + (s_i - x), to get:
+ *
+ * \Sum_i w_i (s_i - x)
+ * S - x = -------------------- (7)
+ * \Sum_i w_i
+ *
+ * Which shows that S and s_i transform alike (which makes perfect sense
+ * given that S is basically the (weighted) average of s_i).
+ *
+ * So the thing to remember is that the above is strictly UP. It is
+ * possible to generalize to multiple runqueues -- however it gets really
+ * yuck when you have to add affinity support, as illustrated by our very
+ * first counter-example.
+ *
+ * Luckily I think we can avoid needing a full multi-queue variant for
+ * core-scheduling (or load-balancing). The crucial observation is that we
+ * only actually need this comparison in the presence of forced-idle; only
+ * then do we need to tell if the stalled rq has higher priority over the
+ * other.
+ *
+ * [XXX assumes SMT2; better consider the more general case, I suspect
+ * it'll work out because our comparison is always between 2 rqs and the
+ * answer is only interesting if one of them is forced-idle]
+ *
+ * And (under assumption of SMT2) when there is forced-idle, there is only
+ * a single queue, so everything works like normal.
+ *
+ * Let, for our runqueue 'k':
+ *
+ * T_k = \Sum_i w_i s_i
+ * W_k = \Sum_i w_i ; for all i of k (8)
+ *
+ * Then we can write (6) like:
+ *
+ * T_k
+ * S_k = --- (9)
+ * W_k
+ *
+ * From which immediately follows that:
+ *
+ * T_k + T_l
+ * S_k+l = --------- (10)
+ * W_k + W_l
+ *
+ * On which we can define a combined lag:
+ *
+ * lag_k+l(i) := S_k+l - s_i (11)
+ *
+ * And that gives us the tools to compare tasks across a combined runqueue.
+ *
+ *
+ * Combined this gives the following:
+ *
+ * a) when a runqueue enters force-idle, sync it against it's sibling rq(s)
+ * using (7); this only requires storing single 'time'-stamps.
+ *
+ * b) when comparing tasks between 2 runqueues of which one is forced-idle,
+ * compare the combined lag, per (11).
+ *
+ * Now, of course cgroups (I so hate them) make this more interesting in
+ * that a) seems to suggest we need to iterate all cgroup on a CPU at such
+ * boundaries, but I think we can avoid that. The force-idle is for the
+ * whole CPU, all it's rqs. So we can mark it in the root and lazily
+ * propagate downward on demand.
+ */
+
+/*
+ * So this sync is basically a relative reset of S to 0.
+ *
+ * So with 2 queues, when one goes idle, we drop them both to 0 and one
+ * then increases due to not being idle, and the idle one builds up lag to
+ * get re-elected. So far so simple, right?
+ *
+ * When there's 3, we can have the situation where 2 run and one is idle,
+ * we sync to 0 and let the idle one build up lag to get re-election. Now
+ * suppose another one also drops idle. At this point dropping all to 0
+ * again would destroy the built-up lag from the queue that was already
+ * idle, not good.
+ *
+ * So instead of syncing everything, we can:
+ *
+ * less := !((s64)(s_a - s_b) <= 0)
+ *
+ * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
+ * == v_a - (v_b - S_a + S_b)
+ *
+ * IOW, we can recast the (lag) comparison to a one-sided difference.
+ * So if then, instead of syncing the whole queue, sync the idle queue
+ * against the active queue with S_a + S_b at the point where we sync.
+ *
+ * (XXX consider the implication of living in a cyclic group: N / 2^n N)
+ *
+ * This gives us means of syncing single queues against the active queue,
+ * and for already idle queues to preserve their build-up lag.
+ *
+ * Of course, then we get the situation where there's 2 active and one
+ * going idle, who do we pick to sync against? Theory would have us sync
+ * against the combined S, but as we've already demonstrated, there is no
+ * such thing in infeasible weight scenarios.
+ *
+ * One thing I've considered; and this is where that core_active rudiment
+ * came from, is having active queues sync up between themselves after
+ * every tick. This limits the observed divergence due to the work
+ * conservancy.
+ *
+ * On top of that, we can improve upon things by employing (10) here.
+ */
+
+/*
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -12955,7 +13279,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -12979,7 +13303,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
struct cfs_rq *cfs_rqb;
s64 delta;
- SCHED_WARN_ON(task_rq(b)->core != rq->core);
+ WARN_ON_ONCE(task_rq(b)->core != rq->core);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -13001,18 +13325,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
cfs_rqa = &task_rq(a)->cfs;
cfs_rqb = &task_rq(b)->cfs;
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13028,9 +13352,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
#endif
return throttled_hierarchy(cfs_rq);
}
-#else
+#else /* !CONFIG_SCHED_CORE: */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
-#endif
+#endif /* !CONFIG_SCHED_CORE */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -13074,12 +13398,15 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
- if (rq->cfs.nr_running == 1)
+ if (p->prio == oldprio)
+ return;
+
+ if (rq->cfs.nr_queued == 1)
return;
/*
@@ -13090,8 +13417,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13103,10 +13431,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- if (!throttled_hierarchy(cfs_rq))
+ /*
+ * If a task gets attached to this cfs_rq and before being queued,
+ * it gets migrated to another CPU due to reasons like affinity
+ * change, make sure this cfs_rq stays on leaf cfs_rq list to have
+ * that removed load decayed or it can cause faireness problem.
+ */
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
@@ -13117,22 +13448,20 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- if (!throttled_hierarchy(cfs_rq))
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
+
+ assert_list_leaf_cfs_rq(rq_of(cfs_rq));
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_SMP
/*
* In case the task sched_avg hasn't been attached:
* - A forked task which hasn't been woken up by wake_up_new_task().
@@ -13141,7 +13470,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
*/
if (!se->avg.last_update_time)
return;
-#endif
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
@@ -13175,6 +13503,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13182,7 +13516,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
attach_task_cfs_rq(p);
@@ -13205,7 +13539,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
{
struct sched_entity *se = &p->se;
-#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13213,11 +13546,10 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
*/
list_move(&se->group_node, &rq->cfs_tasks);
}
-#endif
if (!first)
return;
- SCHED_WARN_ON(se->sched_delayed);
+ WARN_ON_ONCE(se->sched_delayed);
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
@@ -13250,10 +13582,8 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifdef CONFIG_SMP
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13268,10 +13598,8 @@ static void task_change_group_fair(struct task_struct *p)
detach_task_cfs_rq(p);
-#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
-#endif
set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
@@ -13489,7 +13817,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
- struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -13500,16 +13828,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
- if (se->on_rq) {
- parent_cfs_rq = cfs_rq_of(se);
- if (cfs_rq_is_idle(grp_cfs_rq))
- parent_cfs_rq->idle_nr_running++;
- else
- parent_cfs_rq->idle_nr_running--;
- }
-
- idle_task_delta = grp_cfs_rq->h_nr_running -
- grp_cfs_rq->idle_h_nr_running;
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
@@ -13519,7 +13839,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (!se->on_rq)
break;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ cfs_rq->h_nr_idle += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
@@ -13563,6 +13883,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -13571,12 +13893,10 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = check_preempt_wakeup_fair,
.pick_task = pick_task_fair,
- .pick_next_task = __pick_next_task_fair,
+ .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
-#ifdef CONFIG_SMP
- .balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13585,13 +13905,13 @@ DEFINE_SCHED_CLASS(fair) = {
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_fair,
-#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
@@ -13612,7 +13932,6 @@ DEFINE_SCHED_CLASS(fair) = {
#endif
};
-#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq, *pos;
@@ -13646,11 +13965,9 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
-#ifdef CONFIG_SMP
int i;
for_each_possible_cpu(i) {
@@ -13672,6 +13989,4 @@ __init void init_sched_fair_class(void)
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
-#endif /* SMP */
-
}