Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/clock.c   |   3
-rw-r--r--   kernel/sched/core.c    |  72
-rw-r--r--   kernel/sched/cpuacct.c |   6
-rw-r--r--   kernel/sched/cputime.c |  16
-rw-r--r--   kernel/sched/debug.c   |   3
-rw-r--r--   kernel/sched/idle.c    | 150
-rw-r--r--   kernel/sched/sched.h   |  10
-rw-r--r--   kernel/sched/stats.c   |   2
8 files changed, 162 insertions, 100 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b30a2924ef14..3ef6451e972e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
 #include <linux/sched.h>
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
+#include <linux/compiler.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
  * This is default implementation.
  * Architectures and sub-architectures can override this.
  */
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
 {
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a47902c687ae..268a45ea238c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/compiler.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -432,7 +433,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 	if (rq == this_rq()) {
 		__hrtick_restart(rq);
 	} else if (!rq->hrtick_csd_pending) {
-		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 		rq->hrtick_csd_pending = 1;
 	}
 }
@@ -555,12 +556,15 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
 {
 	int cpu = smp_processor_id();
 	int i;
 	struct sched_domain *sd;
 
+	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+		return cpu;
+
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
@@ -823,19 +827,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		u64 st;
-
 		steal = paravirt_steal_clock(cpu_of(rq));
 		steal -= rq->prev_steal_time_rq;
 
 		if (unlikely(steal > delta))
 			steal = delta;
 
-		st = steal_ticks(steal);
-		steal = st * TICK_NSEC;
-
 		rq->prev_steal_time_rq += steal;
-
 		delta -= steal;
 	}
 #endif
@@ -2848,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 }
 EXPORT_SYMBOL(default_wake_function);
 
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
-	unsigned long flags;
-	wait_queue_t wait;
-
-	init_waitqueue_entry(&wait, current);
-
-	__set_current_state(state);
-
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, &wait);
-	spin_unlock(&q->lock);
-	timeout = schedule_timeout(timeout);
-	spin_lock_irq(&q->lock);
-	__remove_wait_queue(q, &wait);
-	spin_unlock_irqrestore(&q->lock, flags);
-
-	return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
-	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
-	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
 #ifdef CONFIG_RT_MUTEXES
 
 /*
@@ -6501,7 +6453,7 @@ static cpumask_var_t fallback_doms;
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  */
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
 {
 	return 0;
 }
@@ -7233,7 +7185,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
-	tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+	tg = container_of(task_css_check(tsk, cpu_cgrp_id,
 				lockdep_is_held(&tsk->sighand->siglock)),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
@@ -7660,7 +7612,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
 	struct task_struct *task;
 
-	cgroup_taskset_for_each(task, css, tset) {
+	cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
@@ -7678,7 +7630,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 {
 	struct task_struct *task;
 
-	cgroup_taskset_for_each(task, css, tset)
+	cgroup_taskset_for_each(task, tset)
 		sched_move_task(task);
 }
 
@@ -8017,8 +7969,7 @@ static struct cftype cpu_files[] = {
 	{ }	/* terminate */
 };
 
-struct cgroup_subsys cpu_cgroup_subsys = {
-	.name		= "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_free	= cpu_cgroup_css_free,
 	.css_online	= cpu_cgroup_css_online,
@@ -8026,7 +7977,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
-	.subsys_id	= cpu_cgroup_subsys_id,
 	.base_cftypes	= cpu_files,
 	.early_init	= 1,
 };
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 /* return cpu accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
-	return css_ca(task_css(tsk, cpuacct_subsys_id));
+	return css_ca(task_css(tsk, cpuacct_cgrp_id));
 }
 
 static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
 	rcu_read_unlock();
 }
 
-struct cgroup_subsys cpuacct_subsys = {
-	.name		= "cpuacct",
+struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.css_alloc	= cpuacct_css_alloc,
 	.css_free	= cpuacct_css_free,
-	.subsys_id	= cpuacct_subsys_id,
 	.base_cftypes	= files,
 	.early_init	= 1,
 };
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 58624a65f124..a95097cb4591 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
-		u64 steal, st = 0;
+		u64 steal;
+		cputime_t steal_ct;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
-		st = steal_ticks(steal);
-		this_rq()->prev_steal_time += st * TICK_NSEC;
+		/*
+		 * cputime_t may be less precise than nsecs (eg: if it's
+		 * based on jiffies). Lets cast the result to cputime
+		 * granularity and account the rest on the next rounds.
+		 */
+		steal_ct = nsecs_to_cputime(steal);
+		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
 
-		account_steal_time(st);
-		return st;
+		account_steal_time(steal_ct);
+		return steal_ct;
 	}
 #endif
 	return false;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
 	if (autogroup_path(tg, group_path, PATH_MAX))
 		return group_path;
 
-	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
-	return group_path;
+	return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
 }
 
 #endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b7976a127178..8f4390a079c7 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -63,6 +63,136 @@ void __weak arch_cpu_idle(void)
 	local_irq_enable();
 }
 
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
+ */
+static int cpuidle_idle_call(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int next_state, entered_state, ret;
+	bool broadcast;
+
+	/*
+	 * Check if the idle task must be rescheduled. If it is the
+	 * case, exit the function after re-enabling the local irq and
+	 * set again the polling flag
+	 */
+	if (current_clr_polling_and_test()) {
+		local_irq_enable();
+		__current_set_polling();
+		return 0;
+	}
+
+	/*
+	 * During the idle period, stop measuring the disabled irqs
+	 * critical sections latencies
+	 */
+	stop_critical_timings();
+
+	/*
+	 * Tell the RCU framework we are entering an idle section,
+	 * so no more rcu read side critical sections and one more
+	 * step to the grace period
+	 */
+	rcu_idle_enter();
+
+	/*
+	 * Check if the cpuidle framework is ready, otherwise fallback
+	 * to the default arch specific idle method
+	 */
+	ret = cpuidle_enabled(drv, dev);
+
+	if (!ret) {
+		/*
+		 * Ask the governor to choose an idle state it thinks
+		 * it is convenient to go to. There is *always* a
+		 * convenient idle state
+		 */
+		next_state = cpuidle_select(drv, dev);
+
+		/*
+		 * The idle task must be scheduled, it is pointless to
+		 * go to idle, just update no idle residency and get
+		 * out of this function
+		 */
+		if (current_clr_polling_and_test()) {
+			dev->last_residency = 0;
+			entered_state = next_state;
+			local_irq_enable();
+		} else {
+			broadcast = !!(drv->states[next_state].flags &
+				       CPUIDLE_FLAG_TIMER_STOP);
+
+			if (broadcast)
+				/*
+				 * Tell the time framework to switch
+				 * to a broadcast timer because our
+				 * local timer will be shutdown. If a
+				 * local timer is used from another
+				 * cpu as a broadcast timer, this call
+				 * may fail if it is not available
+				 */
+				ret = clockevents_notify(
+					CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
+					&dev->cpu);
+
+			if (!ret) {
+				trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+				/*
+				 * Enter the idle state previously
+				 * returned by the governor
+				 * decision. This function will block
+				 * until an interrupt occurs and will
+				 * take care of re-enabling the local
+				 * interrupts
+				 */
+				entered_state = cpuidle_enter(drv, dev,
+							      next_state);
+
+				trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
						       dev->cpu);
+
+				if (broadcast)
+					clockevents_notify(
+						CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
+						&dev->cpu);
+
+				/*
+				 * Give the governor an opportunity to reflect on the
+				 * outcome
+				 */
+				cpuidle_reflect(dev, entered_state);
+			}
+		}
+	}
+
+	/*
+	 * We can't use the cpuidle framework, let's use the default
+	 * idle routine
+	 */
+	if (ret)
+		arch_cpu_idle();
+
+	__current_set_polling();
+
+	/*
+	 * It is up to the idle functions to enable back the local
+	 * interrupt
+	 */
+	if (WARN_ON_ONCE(irqs_disabled()))
+		local_irq_enable();
+
+	rcu_idle_exit();
+	start_critical_timings();
+
+	return 0;
+}
+
 /*
  * Generic idle loop implementation
  */
@@ -90,23 +220,11 @@ static void cpu_idle_loop(void)
 			 * know that the IPI is going to arrive right
 			 * away
 			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+			if (cpu_idle_force_poll || tick_check_broadcast_expired())
 				cpu_idle_poll();
-			} else {
-				if (!current_clr_polling_and_test()) {
-					stop_critical_timings();
-					rcu_idle_enter();
-					if (cpuidle_idle_call())
-						arch_cpu_idle();
-					if (WARN_ON_ONCE(irqs_disabled()))
-						local_irq_enable();
-					rcu_idle_exit();
-					start_critical_timings();
-				} else {
-					local_irq_enable();
-				}
-				__current_set_polling();
-			}
+			else
+				cpuidle_idle_call();
+
 			arch_cpu_idle_exit();
 		}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f2de7a175620..c9007f28d3a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1216,16 +1216,6 @@ extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);
 
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-	if (unlikely(steal > NSEC_PER_SEC))
-		return div_u64(steal, TICK_NSEC);
-
-	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
 static inline void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
 	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
 	return 0;
 }
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);