diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-11 18:21:38 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-11 18:21:38 -0800 |
| commit | f57d54bab696133fae569c5f01352249c36fc74f (patch) | |
| tree | 8ebe3c6deaf95c424c86843c3d290fbf2a9e80d2 /kernel/sched/cputime.c | |
| parent | da830e589a45f0c42eef6f3cbd07275f8893f181 (diff) | |
| parent | c1ad41f1f7270c1956da13fa8fd59d8d5929d56e (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The biggest change affects group scheduling: we now track the runnable
average on a per-task entity basis, allowing a smoother, exponential
decay average based load/weight estimation instead of the previous
binary on-the-runqueue/off-the-runqueue load weight method.
This will inevitably disturb workloads that were in some sort of
borderline balancing state or unstable equilibrium, so an eye has to
be kept on regressions.
For that reason the new load average is only limited to group
scheduling (shares distribution) at the moment (which was also hurting
the most from the prior, crude weight calculation and whose scheduling
quality wins most from this change) - but we plan to extend this to
regular SMP balancing as well in the future, which will simplify and
speed up things a bit.
Other changes involve ongoing preparatory work to extend NOHZ to the
scheduler as well, eventually allowing completely irq-free user-space
execution."
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
cputime: Comment cputime's adjusting code
cputime: Consolidate cputime adjustment code
cputime: Rename thread_group_times to thread_group_cputime_adjusted
cputime: Move thread_group_cputime() to sched code
vtime: Warn if irqs aren't disabled on system time accounting APIs
vtime: No need to disable irqs on vtime_account()
vtime: Consolidate a bit the ctx switch code
vtime: Explicitly account pending user time on process tick
vtime: Remove the underscore prefix invasion
sched/autogroup: Fix crash on reboot when autogroup is disabled
cputime: Separate irqtime accounting from generic vtime
cputime: Specialize irq vtime hooks
kvm: Directly account vtime to system on guest switch
vtime: Make vtime_account_system() irqsafe
vtime: Gather vtime declarations to their own header file
sched: Describe CFS load-balancer
sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
sched: Make __update_entity_runnable_avg() fast
sched: Update_cfs_shares at period edge
...
Diffstat (limited to 'kernel/sched/cputime.c')
| -rw-r--r-- | kernel/sched/cputime.c | 131 |
1 files changed, 95 insertions, 36 deletions
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a6..293b202fcf79 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq); static int irqtime_account_hi_update(void) { @@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) return false; } +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { - unsigned long flags; - - local_irq_save(flags); - if (in_interrupt() || !is_idle_task(tsk)) vtime_account_system(tsk); else vtime_account_idle(tsk); - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + + utime = curr->utime; + total = utime + curr->stime; /* - * Use CFS's precise accounting: + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); - *ut = p->prev_utime; - *st = p->prev_stime; + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* * Must be called with siglock held. */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif |
