diff options
Diffstat (limited to 'kernel/sched/clock.c')
| -rw-r--r-- | kernel/sched/clock.c | 394 |
1 files changed, 263 insertions, 131 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * sched_clock for unstable cpu clocks + * sched_clock() for unstable CPU clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> @@ -11,7 +12,7 @@ * Guillaume Chazarain <guichaz@gmail.com> * * - * What: + * What this file implements: * * cpu_clock(i) provides a fast (execution time) high resolution * clock with bounded drift between CPUs. The value of cpu_clock(i) @@ -26,10 +27,11 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) - * local_clock() -- is cpu_clock() on the current cpu. + * local_clock() -- is cpu_clock() on the current CPU. * - * How: + * sched_clock_cpu(i) + * + * How it is implemented: * * The implementation either uses sched_clock() when * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the @@ -39,7 +41,7 @@ * Otherwise it tries to create a semi stable clock from a mixture of other * clocks, including: * - * - GTOD (clock monotomic) + * - GTOD (clock monotonic) * - sched_clock() * - explicit idle events * @@ -50,39 +52,41 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ -#include <linux/spinlock.h> -#include <linux/hardirq.h> -#include <linux/export.h> -#include <linux/percpu.h> -#include <linux/ktime.h> -#include <linux/sched.h> + +#include <linux/sched/clock.h> +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +/* + * We must start with !__sched_clock_stable because the unstable -> stable + * transition is accurate, while the stable -> unstable transition is not. + * + * Similarly we start with __sched_clock_stable_early, thereby assuming we + * will become stable, such that there's only a single 1 -> 0 transition. + */ +static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); +static int __sched_clock_stable_early = 1; + +/* + * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset + */ +__read_mostly u64 __sched_clock_offset; +static __read_mostly u64 __gtod_offset; struct sched_clock_data { u64 tick_raw; @@ -92,42 +96,163 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { - return &__get_cpu_var(sched_clock_data); + return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -void sched_clock_init(void) +notrace int sched_clock_stable(void) +{ + return static_branch_likely(&__sched_clock_stable); +} + +notrace static void __scd_stamp(struct sched_clock_data *scd) { - u64 ktime_now = ktime_to_ns(ktime_get()); + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + +notrace static void __set_sched_clock_stable(void) +{ + struct sched_clock_data *scd; + + /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); + /* + * Attempt to make the (initial) unstable->stable transition continuous. + */ + __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); + + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + + static_branch_enable(&__sched_clock_stable); + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); +} + +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ +notrace static void __sched_clock_work(struct work_struct *work) +{ + struct sched_clock_data *scd; int cpu; - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + + static_branch_disable(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __sched_clock_work); + +notrace static void __clear_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + return; + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); + schedule_work(&sched_clock_work); +} + +notrace void clear_sched_clock_stable(void) +{ + __sched_clock_stable_early = 0; + + smp_mb(); /* matches sched_clock_init_late() */ + + if (static_key_count(&sched_clock_running.key) == 2) + __clear_sched_clock_stable(); +} + +notrace static void __sched_clock_gtod_offset(void) +{ + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; +} + +void __init sched_clock_init(void) +{ + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. + */ + local_irq_disable(); + __sched_clock_gtod_offset(); + local_irq_enable(); + + static_branch_inc(&sched_clock_running); +} +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) +{ + static_branch_inc(&sched_clock_running); + /* + * Ensure that it is impossible to not do a static_key update. + * + * Either {set,clear}_sched_clock_stable() must see sched_clock_running + * and do the update, or we must see their __sched_clock_stable_early + * and do the update, or both. + */ + smp_mb(); /* matches {set,clear}_sched_clock_stable() */ - sched_clock_running = 1; + if (__sched_clock_stable_early) + __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -138,13 +263,13 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { - u64 now, clock, old_clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; again: - now = sched_clock(); + now = sched_clock_noinstr(); delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; @@ -157,20 +282,46 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, old_clock); - max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + gtod = scd->tick_gtod + __gtod_offset; + clock = gtod + delta; + min_clock = wrap_max(gtod, old_clock); + max_clock = wrap_max(old_clock, gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock_noinstr(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock_noinstr() + __sched_clock_offset; + + if (!static_branch_likely(&sched_clock_running)) + return sched_clock_noinstr(); + + clock = sched_clock_local(this_scd()); + + return clock; +} + +u64 local_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = local_clock_noinstr(); + preempt_enable_notrace(); + return now; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -185,21 +336,21 @@ again: * cmpxchg64 below only protects one readout. * * We must reread via sched_clock_local() in the retry case on - * 32bit as an NMI could use sched_clock_local() via the + * 32-bit kernels as an NMI could use sched_clock_local() via the * tracer and hit between the readout of - * the low32bit and the high 32bit portion. + * the low 32-bit and the high 32-bit portion. */ this_clock = sched_clock_local(my_scd); /* - * We must enforce atomic readout on 32bit, otherwise the - * update on the remote cpu can hit inbetween the readout of - * the low32bit and the high 32bit portion. + * We must enforce atomic readout on 32-bit, otherwise the + * update on the remote CPU can hit in between the readout of + * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); #else /* - * On 64bit the read of [my]scd->clock is atomic versus the - * update, so we can avoid the above 32bit dance. + * On 64-bit kernels the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32-bit dance. */ sched_clock_local(my_scd); again: @@ -226,7 +377,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; @@ -237,140 +388,121 @@ again: * * See cpu_clock(). */ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); + if (sched_clock_stable()) + return sched_clock() + __sched_clock_offset; - if (sched_clock_stable) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); - if (unlikely(!sched_clock_running)) - return 0ull; - + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable_notrace(); return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; - u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); - - scd->tick_raw = now; - scd->tick_gtod = now_gtod; + __scd_stamp(scd); sched_clock_local(scd); } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - sched_clock_cpu(smp_processor_id()); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +notrace void sched_clock_tick_stable(void) { - if (timekeeping_suspended) + if (!sched_clock_stable()) return; - sched_clock_tick(); - touch_softlockup_watchdog(); + /* + * Called under watchdog_lock. + * + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. + */ + local_irq_disable(); + __sched_clock_gtod_offset(); + local_irq_enable(); } -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### + * We are going deep-idle (IRQs are disabled): */ -u64 cpu_clock(int cpu) +notrace void sched_clock_idle_sleep_event(void) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); - - return clock; + sched_clock_cpu(smp_processor_id()); } +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). + * We just idled; resync with ktime. */ -u64 local_clock(void) +notrace void sched_clock_idle_wakeup_event(void) { - u64 clock; unsigned long flags; + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) + return; + local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); + sched_clock_tick(); local_irq_restore(flags); - - return clock; } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ -void sched_clock_init(void) +void __init sched_clock_init(void) { - sched_clock_running = 1; + static_branch_inc(&sched_clock_running); + local_irq_disable(); + generic_sched_clock_init(); + local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); } -u64 cpu_clock(int cpu) -{ - return sched_clock_cpu(cpu); -} +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -u64 local_clock(void) +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +notrace u64 __weak running_clock(void) { - return sched_clock_cpu(0); + return local_clock(); } - -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); |
