Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 2
-rw-r--r-- | kernel/sched/auto_group.c | 6
-rw-r--r-- | kernel/sched/auto_group.h | 2
-rw-r--r-- | kernel/sched/core.c | 92
-rw-r--r-- | kernel/sched/cputime.c | 2
-rw-r--r-- | kernel/sched/deadline.c | 2
-rw-r--r-- | kernel/sched/fair.c | 276
-rw-r--r-- | kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) | 236
-rw-r--r-- | kernel/sched/rt.c | 2
-rw-r--r-- | kernel/sched/sched.h | 10
-rw-r--r-- | kernel/sched/stats.h | 15
-rw-r--r-- | kernel/sched/wait.c | 4
12 files changed, 346 insertions, 303 deletions
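Most of the mechanical churn below is the ACCESS_ONCE() -> READ_ONCE() conversion (core.c, cputime.c, deadline.c, fair.c, rt.c, sched.h, wait.c), plus a new WRITE_ONCE() in reset_ptenuma_scan(). As a minimal sketch of the intended semantics -- assuming a plain volatile-cast definition, which is a hypothetical user-space approximation and not the real <linux/compiler.h> macros (those go through size-switching helpers so they also work on small aggregates):

/*
 * Hypothetical user-space demo, not kernel code: approximate the
 * READ_ONCE()/WRITE_ONCE() semantics -- exactly one untorn load/store,
 * no compiler caching or re-fetching of the value.
 */
#include <stdio.h>

#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

static int numa_scan_seq;	/* stands in for p->mm->numa_scan_seq */

/*
 * Same pattern as reset_ptenuma_scan() in the fair.c hunk below: the field
 * is updated without exclusive access, so read and write it in single,
 * non-optimizable accesses.
 */
static void bump_scan_seq(void)
{
	WRITE_ONCE(numa_scan_seq, READ_ONCE(numa_scan_seq) + 1);
}

int main(void)
{
	bump_scan_seq();
	printf("numa_scan_seq = %d\n", READ_ONCE(numa_scan_seq));
	return 0;
}

The conversion is as much documentation as code generation: READ_ONCE()/WRITE_ONCE() mark every lockless access explicitly and, unlike the old ACCESS_ONCE(), also behave correctly on non-scalar types.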
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be87024875..67687973ce80 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index eae160dd669d..750ed601ddf7 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -1,5 +1,3 @@ -#ifdef CONFIG_SCHED_AUTOGROUP - #include "sched.h" #include <linux/proc_fs.h> @@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + if (!READ_ONCE(sysctl_sched_autogroup_enabled)) goto out; for_each_thread(p, t) @@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ - -#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..890c95f2587a 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 123673291ffb..20b858f2db22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_list(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. 
+ */ + wake_up_process(task); + put_task_struct(task); + } +} + /* * resched_curr - mark rq's current task 'to be rescheduled now'. * @@ -2397,9 +2443,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2497,6 +2543,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2525,7 +2572,7 @@ void scheduler_tick(void) u64 scheduler_tick_max_deferment(void) { struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + unsigned long next, now = READ_ONCE(jiffies); next = rq->last_sched_tick + HZ; @@ -2726,9 +2773,7 @@ again: * - return from syscall or exception to user-space * - return from interrupt-handler to user-space * - * WARNING: all callers must re-check need_resched() afterward and reschedule - * accordingly in case an event triggered the need for rescheduling (such as - * an interrupt waking up a task) while preemption was disabled in __schedule(). + * WARNING: must be called with preemption disabled! */ static void __sched __schedule(void) { @@ -2737,7 +2782,6 @@ static void __sched __schedule(void) struct rq *rq; int cpu; - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_note_context_switch(); @@ -2801,8 +2845,6 @@ static void __sched __schedule(void) raw_spin_unlock_irq(&rq->lock); post_schedule(rq); - - sched_preempt_enable_no_resched(); } static inline void sched_submit_work(struct task_struct *tsk) @@ -2823,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { + preempt_disable(); __schedule(); + sched_preempt_enable_no_resched(); } while (need_resched()); } EXPORT_SYMBOL(schedule); @@ -2862,15 +2906,14 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_active_enter(); __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + preempt_active_exit(); /* * Check again in case we missed a preemption opportunity * between schedule and now. */ - barrier(); } while (need_resched()); } @@ -2917,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) return; do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_active_enter(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -2927,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) __schedule(); exception_exit(prev_ctx); - __preempt_count_sub(PREEMPT_ACTIVE); - barrier(); + preempt_active_exit(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_context); @@ -2952,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - __preempt_count_add(PREEMPT_ACTIVE); + preempt_active_enter(); local_irq_enable(); __schedule(); local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. 
- */ - barrier(); + preempt_active_exit(); } while (need_resched()); exception_exit(prev_state); @@ -5314,7 +5350,7 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; -static void __cpuinit set_cpu_rq_start_time(void) +static void set_cpu_rq_start_time(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -7734,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; - rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1ee600c..f5a64ffad176 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new) { cputime_t old; - while (new > (old = ACCESS_ONCE(*counter))) + while (new > (old = READ_ONCE(*counter))) cmpxchg_cputime(counter, old, new); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e95145088fd..890ce951c717 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If we are dealing with a -deadline task, we must diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffeaa4105e48..0d4632f7799b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ -static int get_update_sysctl_factor(void) +static unsigned int get_update_sysctl_factor(void) { - unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int factor = get_update_sysctl_factor(); + unsigned int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { - unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; @@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p) u64 runtime, period; spinlock_t *group_lock = NULL; - seq = ACCESS_ONCE(p->mm->numa_scan_seq); + /* + * The p->mm->numa_scan_seq field gets updated without + * exclusive access. 
Use READ_ONCE() here to ensure + * that the field is read in a single access: + */ + seq = READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; @@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, } rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + tsk = READ_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) goto no_join; @@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) static void reset_ptenuma_scan(struct task_struct *p) { - ACCESS_ONCE(p->mm->numa_scan_seq)++; + /* + * We only did a read acquisition of the mmap sem, so + * p->mm->numa_scan_seq is written to without exclusive access + * and the update is not guaranteed to be atomic. That's not + * much of an issue though, since this is just used for + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not + * expensive, to avoid any form of compiler optimizations: + */ + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); p->mm->numa_scan_offset = 0; } @@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. 
+ */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. 
+ */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = this_rq->cfs.runnable_load_avg; + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { @@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); + unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env) } #ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ +/* + * Returns true if the destination node is the preferred node. + * Needs to match fbq_classify_rq(): if there is a runnable task + * that is not on its preferred node, we should identify it. + */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || @@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is already in the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return false; - - /* Task is moving into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return true; - - return group_faults(p, dst_nid) > group_faults(p, src_nid); - } - /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) return true; - return task_faults(p, dst_nid) > task_faults(p, src_nid); + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return false; + + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); + } + + return dst_faults > src_faults; } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_faults, dst_faults; int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - if (numa_group) { - /* Task is moving within/into the group's interleave set. */ - if (node_isset(dst_nid, numa_group->active_nodes)) - return false; + /* Migrating away from the preferred node is bad. */ + if (src_nid == p->numa_preferred_nid) + return true; - /* Task is moving out of the group's interleave set. */ - if (node_isset(src_nid, numa_group->active_nodes)) - return true; + /* Encourage migration to the preferred node. 
*/ + if (dst_nid == p->numa_preferred_nid) + return false; - return group_faults(p, dst_nid) < group_faults(p, src_nid); + if (numa_group) { + src_faults = group_faults(p, src_nid); + dst_faults = group_faults(p, dst_nid); + } else { + src_faults = task_faults(p, src_nid); + dst_faults = task_faults(p, dst_nid); } - /* Migrating away from the preferred node is always bad. */ - if (src_nid == p->numa_preferred_nid) - return true; - - return task_faults(p, dst_nid) < task_faults(p, src_nid); + return dst_faults < src_faults; } #else @@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu) * Since we're reading these variables without serialization make sure * we read them once before doing sanity checks on them. */ - age_stamp = ACCESS_ONCE(rq->age_stamp); - avg = ACCESS_ONCE(rq->rt_avg); + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); delta = __rq_clock_broken(rq) - age_stamp; if (unlikely(delta < 0)) diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c index 8ecd552fe4f2..ef7159012cf3 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/loadavg.c @@ -1,7 +1,9 @@ /* - * kernel/sched/proc.c + * kernel/sched/loadavg.c * - * Kernel load calculations, forked from sched/core.c + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. */ #include <linux/export.h> @@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq) long nr_active, delta = 0; nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -186,6 +188,7 @@ void calc_load_enter_idle(void) delta = calc_load_fold_active(this_rq); if (delta) { int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); } } @@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { unsigned long result = 1UL << frac_bits; - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; } return result; @@ -285,7 +290,6 @@ static unsigned long calc_load_n(unsigned long load, unsigned long exp, unsigned long active, unsigned int n) { - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } @@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { } /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. */ void calc_global_load(unsigned long ticks) { @@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks) } /* - * Called from update_cpu_load() to periodically update this CPU's + * Called from scheduler_tick() to periodically update this CPU's * active count. 
*/ -static void calc_load_account_active(struct rq *this_rq) +void calc_global_load_tick(struct rq *this_rq) { long delta; @@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq) this_rq->calc_load_update += LOAD_FREQ; } - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. 
- */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
- */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 575da76a3874..560d2fa623c3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If the current task on @p's runqueue is an RT task, then diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e129993958..d85455539d5c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline u64 __rq_clock_broken(struct rq *rq) { - return ACCESS_ONCE(rq->clock); + return READ_ONCE(rq->clock); } static inline u64 rq_clock(struct rq *rq) @@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - extern void init_task_runnable_average(struct task_struct *p); static inline void add_nr_running(struct rq *rq, unsigned count) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4ab704339656..077ebbd5e10f 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + /* Check if cputimer isn't running. This is accessed without locking. 
*/ + if (!READ_ONCE(cputimer->running)) return false; /* @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.utime); } /** @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->cputime_atomic.stime); } /** @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.sum_exec_runtime += ns; - raw_spin_unlock(&cputimer->lock); + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 852143a79f36..2ccec988d6b7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) @@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) |
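The cpu_load[] decay machinery that moves from proc.c into fair.c above is self-contained enough to exercise on its own. Below is a stand-alone sketch (hypothetical user-space demo reusing the 128-point table and the decay_load_missed() logic from the fair.c hunk; main() and the printed example are additions):

/*
 * Decay a cpu_load[] sample as if `missed' idle ticks had passed, walking
 * the set bits of `missed' so the cost is O(log n) multiply+shift steps
 * instead of n of them.
 */
#include <stdio.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
	{ 0, 8, 32, 64, 128 };

/* Row = load idx, column j = degradation for 2^j ticks, on a 128 scale. */
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

static unsigned long
decay_load_missed(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)			/* (1/2)^n is just a shift */
		return load >> missed;

	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* idx 2 decays by (3/4)^n per tick; 8 missed ticks use table entry 12 */
	printf("%lu\n", decay_load_missed(1024, 8, 2));	/* prints 96 */
	return 0;
}

Row 2, column 3 of the table is 12, so a load of 1024 at idx 2 after 8 missed ticks decays to 1024 * 12 / 128 = 96, approximating the exact (3/4)^8 factor (~0.10) with one multiply and one shift per set bit of the tick count.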