summaryrefslogtreecommitdiff
path: root/kernel/time/clocksource.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time/clocksource.c')
-rw-r--r--kernel/time/clocksource.c757
1 files changed, 608 insertions, 149 deletions
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..a1890a073196 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1,26 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * linux/kernel/time/clocksource.c
- *
* This file contains the functions which manage clocksource drivers.
*
* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * TODO WishList:
- * o Allow clocksource drivers to be unregistered
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -32,10 +14,24 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
+static void clocksource_enqueue(struct clocksource *cs);
+
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
+{
+ u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
+
+ if (likely(delta < cs->max_cycles))
+ return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+ return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
@@ -56,7 +52,7 @@
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
* multiplied with the calculated mult factor. Larger ranges may
- * reduce the conversion accuracy by chosing smaller mult and shift
+ * reduce the conversion accuracy by choosing smaller mult and shift
* factors.
*/
void
@@ -94,6 +90,8 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
/*[Clocksource internal variables]---------
* curr_clocksource:
* currently selected clocksource.
+ * suspend_clocksource:
+ * used to calculate the suspend time.
* clocksource_list:
* linked list with the registered clocksources
* clocksource_mutex:
@@ -102,10 +100,54 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
* Name of the user-specified clocksource.
*/
static struct clocksource *curr_clocksource;
+static struct clocksource *suspend_clocksource;
static LIST_HEAD(clocksource_list);
static DEFINE_MUTEX(clocksource_mutex);
static char override_name[CS_NAME_LEN];
static int finished_booting;
+static u64 suspend_start;
+
+/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
+
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
+ *
+ * In other words, by default, even if a clocksource is extremely
+ * precise (for example, with a sub-nanosecond period), the maximum
+ * permissible skew between the clocksource watchdog and the clocksource
+ * under test is not permitted to go below the 500ppm minimum defined
+ * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
+ * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
+ */
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
+#endif
+
+/*
+ * Default for maximum permissible skew when cs->uncertainty_margin is
+ * not specified, and the lower bound even when cs->uncertainty_margin
+ * is specified. This is also the default that is used when registering
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
+ * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
+ */
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
@@ -118,33 +160,63 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
-static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
+static inline void clocksource_watchdog_lock(unsigned long *flags)
+{
+ spin_lock_irqsave(&watchdog_lock, *flags);
+}
-/*
- * Interval: 0.5sec Threshold: 0.0625s
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+static inline void clocksource_watchdog_unlock(unsigned long *flags)
+{
+ spin_unlock_irqrestore(&watchdog_lock, *flags);
+}
+
+static int clocksource_watchdog_kthread(void *data);
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
+ * We cannot directly run clocksource_watchdog_kthread() here, because
+ * clocksource_select() calls timekeeping_notify() which uses
+ * stop_machine(). One cannot use stop_machine() from a workqueue() due
+ * lock inversions wrt CPU hotplug.
+ *
+ * Also, we only ever run this work once or twice during the lifetime
+ * of the kernel, so there is no point in creating a more permanent
+ * kthread for this.
+ *
* If kthread_run fails the next watchdog scan over the
* watchdog_list will find the unstable clock again.
*/
kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
}
+static void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+}
+
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
+ /*
+ * If the clocksource is registered clocksource_watchdog_kthread() will
+ * re-rate and re-select.
+ */
+ if (list_empty(&cs->list)) {
+ cs->rating = 0;
+ return;
+ }
+
if (cs->mark_unstable)
cs->mark_unstable(cs);
+ /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -153,10 +225,8 @@ static void __clocksource_unstable(struct clocksource *cs)
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
*
- * This function is called instead of clocksource_change_rating from
- * cpu hotplug code to avoid a deadlock between the clocksource mutex
- * and the cpu hotplug mutex. It defers the update of the clocksource
- * to the watchdog thread.
+ * This function is called by the x86 TSC code to mark clocksources as unstable;
+ * it defers demotion and re-selection to a kthread.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -164,19 +234,202 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_lock_irqsave(&watchdog_lock, flags);
if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
- if (list_empty(&cs->wd_list))
+ if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
list_add(&cs->wd_list, &watchdog_list);
__clocksource_unstable(cs);
}
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(unsigned long data)
+static int verify_n_cpus = 8;
+module_param(verify_n_cpus, int, 0644);
+
+enum wd_read_status {
+ WD_READ_SUCCESS,
+ WD_READ_UNSTABLE,
+ WD_READ_SKIP
+};
+
+static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+{
+ int64_t md = 2 * watchdog->uncertainty_margin;
+ unsigned int nretries, max_retries;
+ int64_t wd_delay, wd_seq_delay;
+ u64 wd_end, wd_end2;
+
+ max_retries = clocksource_get_max_watchdog_retry();
+ for (nretries = 0; nretries <= max_retries; nretries++) {
+ local_irq_disable();
+ *wdnow = watchdog->read(watchdog);
+ *csnow = cs->read(cs);
+ wd_end = watchdog->read(watchdog);
+ wd_end2 = watchdog->read(watchdog);
+ local_irq_enable();
+
+ wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
+ if (wd_delay <= md + cs->uncertainty_margin) {
+ if (nretries > 1 && nretries >= max_retries) {
+ pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
+ smp_processor_id(), watchdog->name, nretries);
+ }
+ return WD_READ_SUCCESS;
+ }
+
+ /*
+ * Now compute delay in consecutive watchdog read to see if
+ * there is too much external interferences that cause
+ * significant delay in reading both clocksource and watchdog.
+ *
+ * If consecutive WD read-back delay > md, report
+ * system busy, reinit the watchdog and skip the current
+ * watchdog test.
+ */
+ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
+ if (wd_seq_delay > md)
+ goto skip_test;
+ }
+
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
+ return WD_READ_UNSTABLE;
+
+skip_test:
+ pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
+ smp_processor_id(), watchdog->name, wd_seq_delay);
+ pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
+ cs->name, wd_delay);
+ return WD_READ_SKIP;
+}
+
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+static cpumask_t cpus_chosen;
+
+static void clocksource_verify_choose_cpus(void)
+{
+ int cpu, i, n = verify_n_cpus;
+
+ if (n < 0 || n >= num_online_cpus()) {
+ /* Check all of the CPUs. */
+ cpumask_copy(&cpus_chosen, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ return;
+ }
+
+ /* If no checking desired, or no other CPU to check, leave. */
+ cpumask_clear(&cpus_chosen);
+ if (n == 0 || num_online_cpus() <= 1)
+ return;
+
+ /* Make sure to select at least one CPU other than the current CPU. */
+ cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+ cpumask_set_cpu(cpu, &cpus_chosen);
+
+ /* Force a sane value for the boot parameter. */
+ if (n > nr_cpu_ids)
+ n = nr_cpu_ids;
+
+ /*
+ * Randomly select the specified number of CPUs. If the same
+ * CPU is selected multiple times, that CPU is checked only once,
+ * and no replacement CPU is selected. This gracefully handles
+ * situations where verify_n_cpus is greater than the number of
+ * CPUs that are currently online.
+ */
+ for (i = 1; i < n; i++) {
+ cpu = cpumask_random(cpu_online_mask);
+ if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ cpumask_set_cpu(cpu, &cpus_chosen);
+ }
+
+ /* Don't verify ourselves. */
+ cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+}
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+ struct clocksource *cs = (struct clocksource *)csin;
+
+ csnow_mid = cs->read(cs);
+}
+
+void clocksource_verify_percpu(struct clocksource *cs)
+{
+ int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+ u64 csnow_begin, csnow_end;
+ int cpu, testcpu;
+ s64 delta;
+
+ if (verify_n_cpus == 0)
+ return;
+ cpumask_clear(&cpus_ahead);
+ cpumask_clear(&cpus_behind);
+ cpus_read_lock();
+ migrate_disable();
+ clocksource_verify_choose_cpus();
+ if (cpumask_empty(&cpus_chosen)) {
+ migrate_enable();
+ cpus_read_unlock();
+ pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+ return;
+ }
+ testcpu = smp_processor_id();
+ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ preempt_disable();
+ for_each_cpu(cpu, &cpus_chosen) {
+ if (cpu == testcpu)
+ continue;
+ csnow_begin = cs->read(cs);
+ smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+ csnow_end = cs->read(cs);
+ delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_behind);
+ delta = (csnow_end - csnow_mid) & cs->mask;
+ if (delta < 0)
+ cpumask_set_cpu(cpu, &cpus_ahead);
+ cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
+ if (cs_nsec > cs_nsec_max)
+ cs_nsec_max = cs_nsec;
+ if (cs_nsec < cs_nsec_min)
+ cs_nsec_min = cs_nsec;
+ }
+ preempt_enable();
+ migrate_enable();
+ cpus_read_unlock();
+ if (!cpumask_empty(&cpus_ahead))
+ pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+ if (!cpumask_empty(&cpus_behind))
+ pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
+ cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+ pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+
+static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;
- u64 csnow, wdnow, cslast, wdlast, delta;
- int64_t wd_nsec, cs_nsec;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
+static void clocksource_watchdog(struct timer_list *unused)
+{
+ int64_t wd_nsec, cs_nsec, interval;
+ u64 csnow, wdnow, cslast, wdlast;
int next_cpu, reset_pending;
+ struct clocksource *cs;
+ enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
+ u32 md;
spin_lock(&watchdog_lock);
if (!watchdog_running)
@@ -193,10 +446,31 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
- local_irq_disable();
- csnow = cs->read(cs);
- wdnow = watchdog->read(watchdog);
- local_irq_enable();
+ read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
+
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
+ continue;
+ }
+
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
@@ -207,12 +481,8 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
- delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
- wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
- watchdog->shift);
-
- delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+ cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
@@ -221,14 +491,50 @@ static void clocksource_watchdog(unsigned long data)
if (atomic_read(&watchdog_reset_pending))
continue;
+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
- if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+ if (abs(cs_nsec - wd_nsec) > md) {
+ s64 cs_wd_msec;
+ s64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, csnow, cslast, cs->mask);
+ pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
+ if (curr_clocksource == cs)
+ pr_warn(" '%s' is current clocksource.\n", cs->name);
+ else if (curr_clocksource)
+ pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
+ else
+ pr_warn(" No current clocksource.\n");
__clocksource_unstable(cs);
continue;
}
@@ -277,11 +583,16 @@ static void clocksource_watchdog(unsigned long data)
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
+
+ /*
+ * Arm timer if not already pending: could race with concurrent
+ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
@@ -290,8 +601,7 @@ static inline void clocksource_start_watchdog(void)
{
if (watchdog_running || !watchdog || list_empty(&watchdog_list))
return;
- init_timer(&watchdog_timer);
- watchdog_timer.function = clocksource_watchdog;
+ timer_setup(&watchdog_timer, clocksource_watchdog, 0);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
@@ -301,18 +611,10 @@ static inline void clocksource_stop_watchdog(void)
{
if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
return;
- del_timer(&watchdog_timer);
+ timer_delete(&watchdog_timer);
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
@@ -320,9 +622,8 @@ static void clocksource_resume_watchdog(void)
static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
+ INIT_LIST_HEAD(&cs->wd_list);
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a clocksource to be watched. */
list_add(&cs->wd_list, &watchdog_list);
@@ -332,7 +633,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
static void clocksource_select_watchdog(bool fallback)
@@ -374,9 +674,6 @@ static void clocksource_select_watchdog(bool fallback)
static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs != watchdog) {
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a watched clocksource. */
@@ -385,21 +682,25 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
clocksource_stop_watchdog();
}
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
static int __clocksource_watchdog_kthread(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
- LIST_HEAD(unstable);
int select = 0;
+ /* Do any required per-CPU skew verification. */
+ if (curr_clocksource &&
+ curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+ curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+ clocksource_verify_percpu(curr_clocksource);
+
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
list_del_init(&cs->wd_list);
- list_add(&cs->wd_list, &unstable);
+ clocksource_change_rating(cs, 0);
select = 1;
}
if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -411,11 +712,6 @@ static int __clocksource_watchdog_kthread(void)
clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
- /* Needs to be done outside of watchdog lock */
- list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
- list_del_init(&cs->wd_list);
- __clocksource_change_rating(cs, 0);
- }
return select;
}
@@ -448,8 +744,141 @@ static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
+static inline void clocksource_watchdog_lock(unsigned long *flags) { }
+static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
+
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
+static bool clocksource_is_suspend(struct clocksource *cs)
+{
+ return cs == suspend_clocksource;
+}
+
+static void __clocksource_suspend_select(struct clocksource *cs)
+{
+ /*
+ * Skip the clocksource which will be stopped in suspend state.
+ */
+ if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
+ return;
+
+ /*
+ * The nonstop clocksource can be selected as the suspend clocksource to
+ * calculate the suspend time, so it should not supply suspend/resume
+ * interfaces to suspend the nonstop clocksource when system suspends.
+ */
+ if (cs->suspend || cs->resume) {
+ pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
+ cs->name);
+ }
+
+ /* Pick the best rating. */
+ if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
+ suspend_clocksource = cs;
+}
+
+/**
+ * clocksource_suspend_select - Select the best clocksource for suspend timing
+ * @fallback: if select a fallback clocksource
+ */
+static void clocksource_suspend_select(bool fallback)
+{
+ struct clocksource *cs, *old_suspend;
+
+ old_suspend = suspend_clocksource;
+ if (fallback)
+ suspend_clocksource = NULL;
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ /* Skip current if we were requested for a fallback. */
+ if (fallback && cs == old_suspend)
+ continue;
+
+ __clocksource_suspend_select(cs);
+ }
+}
+
+/**
+ * clocksource_start_suspend_timing - Start measuring the suspend timing
+ * @cs: current clocksource from timekeeping
+ * @start_cycles: current cycles from timekeeping
+ *
+ * This function will save the start cycle values of suspend timer to calculate
+ * the suspend time when resuming system.
+ *
+ * This function is called late in the suspend process from timekeeping_suspend(),
+ * that means processes are frozen, non-boot cpus and interrupts are disabled
+ * now. It is therefore possible to start the suspend timer without taking the
+ * clocksource mutex.
+ */
+void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
+{
+ if (!suspend_clocksource)
+ return;
+
+ /*
+ * If current clocksource is the suspend timer, we should use the
+ * tkr_mono.cycle_last value as suspend_start to avoid same reading
+ * from suspend timer.
+ */
+ if (clocksource_is_suspend(cs)) {
+ suspend_start = start_cycles;
+ return;
+ }
+
+ if (suspend_clocksource->enable &&
+ suspend_clocksource->enable(suspend_clocksource)) {
+ pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
+ return;
+ }
+
+ suspend_start = suspend_clocksource->read(suspend_clocksource);
+}
+
+/**
+ * clocksource_stop_suspend_timing - Stop measuring the suspend timing
+ * @cs: current clocksource from timekeeping
+ * @cycle_now: current cycles from timekeeping
+ *
+ * This function will calculate the suspend time from suspend timer.
+ *
+ * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
+ *
+ * This function is called early in the resume process from timekeeping_resume(),
+ * that means there is only one cpu, no processes are running and the interrupts
+ * are disabled. It is therefore possible to stop the suspend timer without
+ * taking the clocksource mutex.
+ */
+u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
+{
+ u64 now, nsec = 0;
+
+ if (!suspend_clocksource)
+ return 0;
+
+ /*
+ * If current clocksource is the suspend timer, we should use the
+ * tkr_mono.cycle_last value from timekeeping as current cycle to
+ * avoid same reading from suspend timer.
+ */
+ if (clocksource_is_suspend(cs))
+ now = cycle_now;
+ else
+ now = suspend_clocksource->read(suspend_clocksource);
+
+ if (now > suspend_start)
+ nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
+
+ /*
+ * Disable the suspend timer to save power if current clocksource is
+ * not the suspend timer.
+ */
+ if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
+ suspend_clocksource->disable(suspend_clocksource);
+
+ return nsec;
+}
+
/**
* clocksource_suspend - suspend the clocksource(s)
*/
@@ -559,9 +988,16 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
cs->maxadj, cs->mask,
&cs->max_cycles);
-}
-#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
+ /*
+ * Threshold for detecting negative motion in clocksource_delta().
+ *
+ * Allow for 0.875 of the counter width so that overly long idle
+ * sleeps, which go slightly over mask/2, do not trigger the
+ * negative motion detection.
+ */
+ cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
+}
static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
@@ -595,6 +1031,9 @@ static void __clocksource_select(bool skipcur)
if (!best)
return;
+ if (!strlen(override_name))
+ goto found;
+
/* Check for the override clocksource. */
list_for_each_entry(cs, &clocksource_list, list) {
if (skipcur && cs == curr_clocksource)
@@ -626,6 +1065,7 @@ static void __clocksource_select(bool skipcur)
break;
}
+found:
if (curr_clocksource != best && !timekeeping_notify(best)) {
pr_info("Switched to clocksource %s\n", best->name);
curr_clocksource = best;
@@ -650,12 +1090,6 @@ static void clocksource_select_fallback(void)
__clocksource_select(true);
}
-#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-static inline void clocksource_select(void) { }
-static inline void clocksource_select_fallback(void) { }
-
-#endif
-
/*
* clocksource_done_booting - Called near the end of core bootup
*
@@ -736,6 +1170,31 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
}
+
+ /*
+ * If the uncertainty margin is not specified, calculate it. If
+ * both scale and freq are non-zero, calculate the clock period, but
+ * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
+ * However, if either of scale or freq is zero, be very conservative
+ * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
+ * for the uncertainty margin. Allow stupidly small uncertainty
+ * margins to be specified by the caller for testing purposes,
+ * but warn to discourage production use of this capability.
+ *
+ * Bottom line: The sum of the uncertainty margins of the
+ * watchdog clocksource and the clocksource under test will be at
+ * least 500ppm by default. For more information, please see the
+ * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
+ */
+ if (scale && freq && !cs->uncertainty_margin) {
+ cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+ if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+ cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+ } else if (!cs->uncertainty_margin) {
+ cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ }
+ WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
/*
* Ensure clocksources that have large 'mult' values don't overflow
* when adjusted.
@@ -776,48 +1235,45 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
*/
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
+ unsigned long flags;
+
+ clocksource_arch_init(cs);
+
+ if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+ cs->id = CSID_GENERIC;
+ if (cs->vdso_clock_mode < 0 ||
+ cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
+ pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
+ cs->name, cs->vdso_clock_mode);
+ cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
+ }
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
+
+ clocksource_watchdog_lock(&flags);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
+ clocksource_watchdog_unlock(&flags);
+
clocksource_select();
clocksource_select_watchdog(false);
+ __clocksource_suspend_select(cs);
mutex_unlock(&clocksource_mutex);
return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-static void __clocksource_change_rating(struct clocksource *cs, int rating)
-{
- list_del(&cs->list);
- cs->rating = rating;
- clocksource_enqueue(cs);
-}
-
-/**
- * clocksource_change_rating - Change the rating of a registered clocksource
- * @cs: clocksource to be changed
- * @rating: new rating
- */
-void clocksource_change_rating(struct clocksource *cs, int rating)
-{
- mutex_lock(&clocksource_mutex);
- __clocksource_change_rating(cs, rating);
- clocksource_select();
- clocksource_select_watchdog(false);
- mutex_unlock(&clocksource_mutex);
-}
-EXPORT_SYMBOL(clocksource_change_rating);
-
/*
* Unbind clocksource @cs. Called with clocksource_mutex held
*/
static int clocksource_unbind(struct clocksource *cs)
{
+ unsigned long flags;
+
if (clocksource_is_watchdog(cs)) {
/* Select and try to install a replacement watchdog. */
clocksource_select_watchdog(true);
@@ -831,8 +1287,21 @@ static int clocksource_unbind(struct clocksource *cs)
if (curr_clocksource == cs)
return -EBUSY;
}
+
+ if (clocksource_is_suspend(cs)) {
+ /*
+ * Select and try to install a replacement suspend clocksource.
+ * If no replacement suspend clocksource, we will just let the
+ * clocksource go and have no suspend clocksource.
+ */
+ clocksource_suspend_select(true);
+ }
+
+ clocksource_watchdog_lock(&flags);
clocksource_dequeue_watchdog(cs);
list_del_init(&cs->list);
+ clocksource_watchdog_unlock(&flags);
+
return 0;
}
@@ -854,21 +1323,21 @@ EXPORT_SYMBOL(clocksource_unregister);
#ifdef CONFIG_SYSFS
/**
- * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * current_clocksource_show - sysfs interface for current clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing current clocksource.
*/
-static ssize_t
-sysfs_show_current_clocksources(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t current_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
ssize_t count = 0;
mutex_lock(&clocksource_mutex);
- count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ count = sysfs_emit(buf, "%s\n", curr_clocksource->name);
mutex_unlock(&clocksource_mutex);
return count;
@@ -892,7 +1361,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
}
/**
- * sysfs_override_clocksource - interface for manually overriding clocksource
+ * current_clocksource_store - interface for manually overriding clocksource
* @dev: unused
* @attr: unused
* @buf: name of override clocksource
@@ -901,9 +1370,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
* Takes input from sysfs interface for manually overriding the default
* clocksource selection.
*/
-static ssize_t sysfs_override_clocksource(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t current_clocksource_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
@@ -917,9 +1386,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
return ret;
}
+static DEVICE_ATTR_RW(current_clocksource);
/**
- * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
+ * unbind_clocksource_store - interface for manually unbinding clocksource
* @dev: unused
* @attr: unused
* @buf: unused
@@ -927,7 +1397,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
*
* Takes input from sysfs interface for manually unbinding a clocksource.
*/
-static ssize_t sysfs_unbind_clocksource(struct device *dev,
+static ssize_t unbind_clocksource_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -951,19 +1421,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
return ret ? ret : count;
}
+static DEVICE_ATTR_WO(unbind_clocksource);
/**
- * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * available_clocksource_show - sysfs interface for listing clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing registered clocksources
*/
-static ssize_t
-sysfs_show_available_clocksources(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t available_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct clocksource *src;
ssize_t count = 0;
@@ -987,19 +1457,17 @@ sysfs_show_available_clocksources(struct device *dev,
return count;
}
+static DEVICE_ATTR_RO(available_clocksource);
-/*
- * Sysfs setup bits:
- */
-static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
- sysfs_override_clocksource);
-
-static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
-
-static DEVICE_ATTR(available_clocksource, 0444,
- sysfs_show_available_clocksources, NULL);
+static struct attribute *clocksource_attrs[] = {
+ &dev_attr_current_clocksource.attr,
+ &dev_attr_unbind_clocksource.attr,
+ &dev_attr_available_clocksource.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(clocksource);
-static struct bus_type clocksource_subsys = {
+static const struct bus_type clocksource_subsys = {
.name = "clocksource",
.dev_name = "clocksource",
};
@@ -1007,6 +1475,7 @@ static struct bus_type clocksource_subsys = {
static struct device device_clocksource = {
.id = 0,
.bus = &clocksource_subsys,
+ .groups = clocksource_groups,
};
static int __init init_clocksource_sysfs(void)
@@ -1015,17 +1484,7 @@ static int __init init_clocksource_sysfs(void)
if (!error)
error = device_register(&device_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_current_clocksource);
- if (!error)
- error = device_create_file(&device_clocksource,
- &dev_attr_unbind_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_available_clocksource);
+
return error;
}
@@ -1043,7 +1502,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strlcpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}