Diffstat (limited to 'kernel/time/clocksource.c')
| -rw-r--r-- | kernel/time/clocksource.c | 1092 |
1 file changed, 765 insertions(+), 327 deletions(-)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..a1890a073196 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1,28 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * linux/kernel/time/clocksource.c - * * This file contains the functions which manage clocksource drivers. * * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * TODO WishList: - * o Allow clocksource drivers to be unregistered */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/device.h> #include <linux/clocksource.h> #include <linux/init.h> @@ -30,84 +14,23 @@ #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ #include <linux/tick.h> #include <linux/kthread.h> +#include <linux/prandom.h> +#include <linux/cpu.h> #include "tick-internal.h" +#include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); +static void clocksource_enqueue(struct clocksource *cs); -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) +static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; + u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta); - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. 
- */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } + if (likely(delta < cs->max_cycles)) + return clocksource_cyc2ns(delta, cs->mult, cs->shift); - return nsec; + return mul_u64_u32_shr(delta, cs->mult, cs->shift); } -EXPORT_SYMBOL_GPL(timecounter_cyc2time); /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks @@ -129,7 +52,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is * multiplied with the calculated mult factor. Larger ranges may - * reduce the conversion accuracy by chosing smaller mult and shift + * reduce the conversion accuracy by choosing smaller mult and shift * factors. */ void @@ -162,10 +85,13 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: * currently selected clocksource. + * suspend_clocksource: + * used to calculate the suspend time. * clocksource_list: * linked list with the registered clocksources * clocksource_mutex: @@ -174,10 +100,54 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) * Name of the user-specified clocksource. */ static struct clocksource *curr_clocksource; +static struct clocksource *suspend_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); static char override_name[CS_NAME_LEN]; static int finished_booting; +static u64 suspend_start; + +/* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) + +/* + * Threshold: 0.0312s, when doubled: 0.0625s. + */ +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) + +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as + * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us! + * + * In other words, by default, even if a clocksource is extremely + * precise (for example, with a sub-nanosecond period), the maximum + * permissible skew between the clocksource watchdog and the clocksource + * under test is not permitted to go below the 500ppm minimum defined + * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the + * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. + */ +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#else +#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) +#endif + +/* + * Default for maximum permissible skew when cs->uncertainty_margin is + * not specified, and the lower bound even when cs->uncertainty_margin + * is specified. This is also the default that is used when registering + * clocks with unspecified cs->uncertainty_margin, so this macro is used + * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. 
+ */ +#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -190,48 +160,73 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; +static int64_t watchdog_max_interval; -static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); +static inline void clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} -/* - * Interval: 0.5sec Threshold: 0.0625s - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static inline void clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} + +static int clocksource_watchdog_kthread(void *data); static void clocksource_watchdog_work(struct work_struct *work) { /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * * If kthread_run fails the next watchdog scan over the * watchdog_list will find the unstable clock again. */ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } +static void clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + + /* + * If the clocksource is registered clocksource_watchdog_kthread() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + + if (cs->mark_unstable) + cs->mark_unstable(cs); + + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(&watchdog_work); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - /** * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable * - * This function is called instead of clocksource_change_rating from - * cpu hotplug code to avoid a deadlock between the clocksource mutex - * and the cpu hotplug mutex. It defers the update of the clocksource - * to the watchdog thread. + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a kthread. 
*/ void clocksource_mark_unstable(struct clocksource *cs) { @@ -239,19 +234,202 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_lock_irqsave(&watchdog_lock, flags); if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { - if (list_empty(&cs->wd_list)) + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) list_add(&cs->wd_list, &watchdog_list); __clocksource_unstable(cs); } spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_watchdog(unsigned long data) +static int verify_n_cpus = 8; +module_param(verify_n_cpus, int, 0644); + +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + int64_t md = 2 * watchdog->uncertainty_margin; + unsigned int nretries, max_retries; + int64_t wd_delay, wd_seq_delay; + u64 wd_end, wd_end2; + + max_retries = clocksource_get_max_watchdog_retry(); + for (nretries = 0; nretries <= max_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); + local_irq_enable(); + + wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); + if (wd_delay <= md + cs->uncertainty_margin) { + if (nretries > 1 && nretries >= max_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return WD_READ_SUCCESS; + } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > md, report + * system busy, reinit the watchdog and skip the current + * watchdog test. + */ + wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); + if (wd_seq_delay > md) + goto skip_test; + } + + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", + smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; +} + +static u64 csnow_mid; +static cpumask_t cpus_ahead; +static cpumask_t cpus_behind; +static cpumask_t cpus_chosen; + +static void clocksource_verify_choose_cpus(void) +{ + int cpu, i, n = verify_n_cpus; + + if (n < 0 || n >= num_online_cpus()) { + /* Check all of the CPUs. */ + cpumask_copy(&cpus_chosen, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + return; + } + + /* If no checking desired, or no other CPU to check, leave. */ + cpumask_clear(&cpus_chosen); + if (n == 0 || num_online_cpus() <= 1) + return; + + /* Make sure to select at least one CPU other than the current CPU. */ + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + cpumask_set_cpu(cpu, &cpus_chosen); + + /* Force a sane value for the boot parameter. */ + if (n > nr_cpu_ids) + n = nr_cpu_ids; + + /* + * Randomly select the specified number of CPUs. If the same + * CPU is selected multiple times, that CPU is checked only once, + * and no replacement CPU is selected. 
This gracefully handles + * situations where verify_n_cpus is greater than the number of + * CPUs that are currently online. + */ + for (i = 1; i < n; i++) { + cpu = cpumask_random(cpu_online_mask); + if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) + cpumask_set_cpu(cpu, &cpus_chosen); + } + + /* Don't verify ourselves. */ + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); +} + +static void clocksource_verify_one_cpu(void *csin) +{ + struct clocksource *cs = (struct clocksource *)csin; + + csnow_mid = cs->read(cs); +} + +void clocksource_verify_percpu(struct clocksource *cs) +{ + int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; + u64 csnow_begin, csnow_end; + int cpu, testcpu; + s64 delta; + + if (verify_n_cpus == 0) + return; + cpumask_clear(&cpus_ahead); + cpumask_clear(&cpus_behind); + cpus_read_lock(); + migrate_disable(); + clocksource_verify_choose_cpus(); + if (cpumask_empty(&cpus_chosen)) { + migrate_enable(); + cpus_read_unlock(); + pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + return; + } + testcpu = smp_processor_id(); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); + for_each_cpu(cpu, &cpus_chosen) { + if (cpu == testcpu) + continue; + csnow_begin = cs->read(cs); + smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); + csnow_end = cs->read(cs); + delta = (s64)((csnow_mid - csnow_begin) & cs->mask); + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_behind); + delta = (csnow_end - csnow_mid) & cs->mask; + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_ahead); + cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); + if (cs_nsec > cs_nsec_max) + cs_nsec_max = cs_nsec; + if (cs_nsec < cs_nsec_min) + cs_nsec_min = cs_nsec; + } + preempt_enable(); + migrate_enable(); + cpus_read_unlock(); + if (!cpumask_empty(&cpus_ahead)) + pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_ahead), testcpu, cs->name); + if (!cpumask_empty(&cpus_behind)) + pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_behind), testcpu, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); +} +EXPORT_SYMBOL_GPL(clocksource_verify_percpu); + +static inline void clocksource_reset_watchdog(void) { struct clocksource *cs; - cycle_t csnow, wdnow; - int64_t wd_nsec, cs_nsec; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + + +static void clocksource_watchdog(struct timer_list *unused) +{ + int64_t wd_nsec, cs_nsec, interval; + u64 csnow, wdnow, cslast, wdlast; int next_cpu, reset_pending; + struct clocksource *cs; + enum wd_read_status read_ret; + unsigned long extra_wait = 0; + u32 md; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -268,10 +446,31 @@ static void clocksource_watchdog(unsigned long data) continue; } - local_irq_disable(); - csnow = cs->read(cs); - wdnow = watchdog->read(watchdog); - local_irq_enable(); + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret == WD_READ_UNSTABLE) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } + + /* + * When WD_READ_SKIP is returned, it means the system is likely + * under very heavy load, where the latency of reading + * watchdog/clocksource is very big, and affect the accuracy of + * watchdog check. 
So give system some space and suspend the + * watchdog check for 5 minutes. + */ + if (read_ret == WD_READ_SKIP) { + /* + * As the watchdog timer will be suspended, and + * cs->last could keep unchanged for 5 minutes, reset + * the counters. + */ + clocksource_reset_watchdog(); + extra_wait = HZ * 300; + break; + } /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || @@ -282,23 +481,67 @@ static void clocksource_watchdog(unsigned long data) continue; } - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); - - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); + wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); + cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; cs->cs_last = csnow; cs->wd_last = wdnow; if (atomic_read(&watchdog_reset_pending)) continue; + /* + * The processing of timer softirqs can get delayed (usually + * on account of ksoftirqd not getting to run in a timely + * manner), which causes the watchdog interval to stretch. + * Skew detection may fail for longer watchdog intervals + * on account of fixed margins being used. + * Some clocksources, e.g. acpi_pm, cannot tolerate + * watchdog intervals longer than a few seconds. + */ + interval = max(cs_nsec, wd_nsec); + if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { + if (system_state > SYSTEM_SCHEDULING && + interval > 2 * watchdog_max_interval) { + watchdog_max_interval = interval; + pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", + cs_nsec, wd_nsec); + } + watchdog_timer.expires = jiffies; + continue; + } + /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); + md = cs->uncertainty_margin + watchdog->uncertainty_margin; + if (abs(cs_nsec - wd_nsec) > md) { + s64 cs_wd_msec; + s64 wd_msec; + u32 wd_rem; + + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); + pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, cs_nsec, csnow, cslast, cs->mask); + cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); + wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); + pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", + cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); + if (curr_clocksource == cs) + pr_warn(" '%s' is current clocksource.\n", cs->name); + else if (curr_clocksource) + pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); + else + pr_warn(" No current clocksource.\n"); + __clocksource_unstable(cs); continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { @@ -340,11 +583,16 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. 
*/ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; + add_timer_on(&watchdog_timer, next_cpu); + } out: spin_unlock(&watchdog_lock); } @@ -353,8 +601,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; + timer_setup(&watchdog_timer, clocksource_watchdog, 0); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; @@ -364,18 +611,10 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } -static inline void clocksource_reset_watchdog(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &watchdog_list, wd_list) - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} - static void clocksource_resume_watchdog(void) { atomic_inc(&watchdog_reset_pending); @@ -383,9 +622,8 @@ static void clocksource_resume_watchdog(void) static void clocksource_enqueue_watchdog(struct clocksource *cs) { - unsigned long flags; + INIT_LIST_HEAD(&cs->wd_list); - spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); @@ -394,13 +632,41 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -408,9 +674,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); if (cs != watchdog) { if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a watched clocksource. 
*/ @@ -419,21 +682,25 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) clocksource_stop_watchdog(); } } - spin_unlock_irqrestore(&watchdog_lock, flags); } static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; - LIST_HEAD(unstable); int select = 0; + /* Do any required per-CPU skew verification. */ + if (curr_clocksource && + curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && + curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) + clocksource_verify_percpu(curr_clocksource); + spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - list_add(&cs->wd_list, &unstable); + clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -445,11 +712,6 @@ static int __clocksource_watchdog_kthread(void) clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); - /* Needs to be done outside of watchdog lock */ - list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { - list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); - } return select; } @@ -475,13 +737,148 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } + +static inline void clocksource_watchdog_lock(unsigned long *flags) { } +static inline void clocksource_watchdog_unlock(unsigned long *flags) { } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ +static bool clocksource_is_suspend(struct clocksource *cs) +{ + return cs == suspend_clocksource; +} + +static void __clocksource_suspend_select(struct clocksource *cs) +{ + /* + * Skip the clocksource which will be stopped in suspend state. + */ + if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) + return; + + /* + * The nonstop clocksource can be selected as the suspend clocksource to + * calculate the suspend time, so it should not supply suspend/resume + * interfaces to suspend the nonstop clocksource when system suspends. + */ + if (cs->suspend || cs->resume) { + pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n", + cs->name); + } + + /* Pick the best rating. */ + if (!suspend_clocksource || cs->rating > suspend_clocksource->rating) + suspend_clocksource = cs; +} + +/** + * clocksource_suspend_select - Select the best clocksource for suspend timing + * @fallback: if select a fallback clocksource + */ +static void clocksource_suspend_select(bool fallback) +{ + struct clocksource *cs, *old_suspend; + + old_suspend = suspend_clocksource; + if (fallback) + suspend_clocksource = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* Skip current if we were requested for a fallback. 
*/ + if (fallback && cs == old_suspend) + continue; + + __clocksource_suspend_select(cs); + } +} + +/** + * clocksource_start_suspend_timing - Start measuring the suspend timing + * @cs: current clocksource from timekeeping + * @start_cycles: current cycles from timekeeping + * + * This function will save the start cycle values of suspend timer to calculate + * the suspend time when resuming system. + * + * This function is called late in the suspend process from timekeeping_suspend(), + * that means processes are frozen, non-boot cpus and interrupts are disabled + * now. It is therefore possible to start the suspend timer without taking the + * clocksource mutex. + */ +void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) +{ + if (!suspend_clocksource) + return; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value as suspend_start to avoid same reading + * from suspend timer. + */ + if (clocksource_is_suspend(cs)) { + suspend_start = start_cycles; + return; + } + + if (suspend_clocksource->enable && + suspend_clocksource->enable(suspend_clocksource)) { + pr_warn_once("Failed to enable the non-suspend-able clocksource.\n"); + return; + } + + suspend_start = suspend_clocksource->read(suspend_clocksource); +} + +/** + * clocksource_stop_suspend_timing - Stop measuring the suspend timing + * @cs: current clocksource from timekeeping + * @cycle_now: current cycles from timekeeping + * + * This function will calculate the suspend time from suspend timer. + * + * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource. + * + * This function is called early in the resume process from timekeeping_resume(), + * that means there is only one cpu, no processes are running and the interrupts + * are disabled. It is therefore possible to stop the suspend timer without + * taking the clocksource mutex. + */ +u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) +{ + u64 now, nsec = 0; + + if (!suspend_clocksource) + return 0; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value from timekeeping as current cycle to + * avoid same reading from suspend timer. + */ + if (clocksource_is_suspend(cs)) + now = cycle_now; + else + now = suspend_clocksource->read(suspend_clocksource); + + if (now > suspend_start) + nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now); + + /* + * Disable the suspend timer to save power if current clocksource is + * not the suspend timer. + */ + if (!clocksource_is_suspend(cs) && suspend_clocksource->disable) + suspend_clocksource->disable(suspend_clocksource); + + return nsec; +} + /** * clocksource_suspend - suspend the clocksource(s) */ @@ -537,51 +934,71 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs: Pointer to clocksource + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. 
This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger than what the math used can handle without overflows. */ -static u64 clocksource_max_deferment(struct clocksource *cs) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) - * which is equivalent to the below. - * max_cycles < (2^63)/(cs->mult + cs->maxadj) - * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) - * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) - * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) - * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. + * cyc2ns() function without overflowing a 64-bit result. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); /* * The actual maximum number of cycles we can defer the clocksource is - * determined by the minimum of max_cycles and cs->mask. + * determined by the minimum of max_cycles and mask. * Note: Here we subtract the maxadj to make sure we don't sleep for * too long if there's a large negative adjustment. */ - max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, - cs->shift); + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + + return max_nsecs; +} + +/** + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated + * + */ +static inline void clocksource_update_max_deferment(struct clocksource *cs) +{ + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. + * Threshold for detecting negative motion in clocksource_delta(). + * + * Allow for 0.875 of the counter width so that overly long idle + * sleeps, which go slightly over mask/2, do not trigger the + * negative motion detection. */ - return max_nsecs - (max_nsecs >> 3); + cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3); } -#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET - static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -614,6 +1031,9 @@ static void __clocksource_select(bool skipcur) if (!best) return; + if (!strlen(override_name)) + goto found; + /* Check for the override clocksource. 
*/ list_for_each_entry(cs, &clocksource_list, list) { if (skipcur && cs == curr_clocksource) @@ -627,16 +1047,25 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); - override_name[0] = 0; + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } } else /* Override clocksource can be used. */ best = cs; break; } +found: if (curr_clocksource != best && !timekeeping_notify(best)) { pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; @@ -653,21 +1082,14 @@ static void __clocksource_select(bool skipcur) */ static void clocksource_select(void) { - return __clocksource_select(false); + __clocksource_select(false); } static void clocksource_select_fallback(void) { - return __clocksource_select(true); + __clocksource_select(true); } -#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ - -static inline void clocksource_select(void) { } -static inline void clocksource_select_fallback(void) { } - -#endif - /* * clocksource_done_booting - Called near the end of core bootup * @@ -698,15 +1120,17 @@ static void clocksource_enqueue(struct clocksource *cs) struct list_head *entry = &clocksource_list; struct clocksource *tmp; - list_for_each_entry(tmp, &clocksource_list, list) + list_for_each_entry(tmp, &clocksource_list, list) { /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } list_add(&cs->list, entry); } /** - * __clocksource_updatefreq_scale - Used update clocksource with new freq + * __clocksource_update_freq_scale - Used update clocksource with new freq * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale @@ -714,48 +1138,89 @@ static void clocksource_enqueue(struct clocksource *cs) * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; + /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. We apply the same 12.5% - * margin as we do in clocksource_max_deferment() + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. 
*/ - sec = (cs->mask - (cs->mask >> 3)); - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin + * If the uncertainty margin is not specified, calculate it. If + * both scale and freq are non-zero, calculate the clock period, but + * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. + * However, if either of scale or freq is zero, be very conservative + * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value + * for the uncertainty margin. Allow stupidly small uncertainty + * margins to be specified by the caller for testing purposes, + * but warn to discourage production use of this capability. * + * Bottom line: The sum of the uncertainty margins of the + * watchdog clocksource and the clocksource under test will be at + * least 500ppm by default. For more information, please see the + * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. + */ + if (scale && freq && !cs->uncertainty_margin) { + cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); + if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) + cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; + } else if (!cs->uncertainty_margin) { + cs->uncertainty_margin = WATCHDOG_THRESHOLD; + } + WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); + + /* + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { cs->mult >>= 1; cs->shift--; cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. 
+ */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -770,79 +1235,51 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); */ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { + unsigned long flags; - /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); - - /* Add clocksource to the clcoksource list */ - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(__clocksource_register_scale); - + clocksource_arch_init(cs); -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. - */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) + cs->id = CSID_GENERIC; + if (cs->vdso_clock_mode < 0 || + cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { + pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", + cs->name, cs->vdso_clock_mode); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); + /* Initialize mult/shift and max_idle_ns */ + __clocksource_update_freq_scale(cs, scale, freq); + /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); -static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; + clocksource_watchdog_lock(&flags); clocksource_enqueue(cs); -} + clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - mutex_lock(&clocksource_mutex); - __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); + __clocksource_suspend_select(cs); mutex_unlock(&clocksource_mutex); + return 0; } -EXPORT_SYMBOL(clocksource_change_rating); +EXPORT_SYMBOL_GPL(__clocksource_register_scale); /* * Unbind clocksource @cs. Called with clocksource_mutex held */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + unsigned long flags; + + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. 
*/ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ @@ -850,8 +1287,21 @@ static int clocksource_unbind(struct clocksource *cs) if (curr_clocksource == cs) return -EBUSY; } + + if (clocksource_is_suspend(cs)) { + /* + * Select and try to install a replacement suspend clocksource. + * If no replacement suspend clocksource, we will just let the + * clocksource go and have no suspend clocksource. + */ + clocksource_suspend_select(true); + } + + clocksource_watchdog_lock(&flags); clocksource_dequeue_watchdog(cs); list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + return 0; } @@ -873,27 +1323,27 @@ EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** - * sysfs_show_current_clocksources - sysfs interface for current clocksource + * current_clocksource_show - sysfs interface for current clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. */ -static ssize_t -sysfs_show_current_clocksources(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { ssize_t count = 0; mutex_lock(&clocksource_mutex); - count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + count = sysfs_emit(buf, "%s\n", curr_clocksource->name); mutex_unlock(&clocksource_mutex); return count; } -size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -911,7 +1361,7 @@ size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) } /** - * sysfs_override_clocksource - interface for manually overriding clocksource + * current_clocksource_store - interface for manually overriding clocksource * @dev: unused * @attr: unused * @buf: name of override clocksource @@ -920,11 +1370,11 @@ size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { - size_t ret; + ssize_t ret; mutex_lock(&clocksource_mutex); @@ -936,9 +1386,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +static DEVICE_ATTR_RW(current_clocksource); /** - * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * unbind_clocksource_store - interface for manually unbinding clocksource * @dev: unused * @attr: unused * @buf: unused @@ -946,13 +1397,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, * * Takes input from sysfs interface for manually unbinding a clocksource. */ -static ssize_t sysfs_unbind_clocksource(struct device *dev, +static ssize_t unbind_clocksource_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct clocksource *cs; char name[CS_NAME_LEN]; - size_t ret; + ssize_t ret; ret = sysfs_get_uname(buf, name, count); if (ret < 0) @@ -970,19 +1421,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, return ret ? 
ret : count; } +static DEVICE_ATTR_WO(unbind_clocksource); /** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * available_clocksource_show - sysfs interface for listing clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ -static ssize_t -sysfs_show_available_clocksources(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; @@ -1006,19 +1457,17 @@ sysfs_show_available_clocksources(struct device *dev, return count; } +static DEVICE_ATTR_RO(available_clocksource); -/* - * Sysfs setup bits: - */ -static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); - -static DEVICE_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); -static struct bus_type clocksource_subsys = { +static const struct bus_type clocksource_subsys = { .name = "clocksource", .dev_name = "clocksource", }; @@ -1026,6 +1475,7 @@ static struct bus_type clocksource_subsys = { static struct device device_clocksource = { .id = 0, .bus = &clocksource_subsys, + .groups = clocksource_groups, }; static int __init init_clocksource_sysfs(void) @@ -1034,17 +1484,7 @@ static int __init init_clocksource_sysfs(void) if (!error) error = device_register(&device_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_current_clocksource); - if (!error) - error = device_create_file(&device_clocksource, - &dev_attr_unbind_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_available_clocksource); + return error; } @@ -1062,7 +1502,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strlcpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } @@ -1079,12 +1519,10 @@ __setup("clocksource=", boot_override_clocksource); static int __init boot_override_clock(char* str) { if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); return boot_override_clocksource("acpi_pm"); } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); return boot_override_clocksource(str); } |
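
A few notes on the arithmetic this patch leans on follow, written as minimal userspace sketches rather than kernel code. First, the scaled conversion ns = (cycles * mult) >> shift together with the now-exported clocks_calc_mult_shift(): the sketch below reimplements that mult/shift search in plain C for a hypothetical 24 MHz counter, so the frequency, the 600 s conversion range and the helper name are illustrative assumptions rather than the kernel's build.

/* Userspace sketch of the mult/shift selection behind clocks_calc_mult_shift()
 * and of the scaled conversion ns = (cycles * mult) >> shift.
 * Hypothetical 24 MHz counter; not kernel code. */
#include <stdint.h>
#include <stdio.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* Shrink the accuracy budget by the range (maxsec) we must cover. */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* Take the largest shift whose rounded mult still fits the budget. */
	for (sft = 32; sft > 0; sft--) {
		tmp = ((uint64_t)to << sft) + from / 2;
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;
	uint64_t cycles = 24000000;	/* one second worth of 24 MHz ticks */

	calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
	printf("mult=%u shift=%u -> %llu ns\n", mult, shift,
	       (unsigned long long)((cycles * mult) >> shift));
	return 0;
}

With the values above this prints very close to one billion nanoseconds for one second of ticks, which is the property clocksource_cyc2ns() relies on in the kernel.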
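
The skew test itself, at the heart of clocksource_watchdog(), reduces to comparing the interval as seen by the watched clocksource against the same interval as seen by the watchdog, with the sum of the two uncertainty_margin values as the allowed band. The sketch below shows only that comparison; the struct and all of the readings and margins are invented, not kernel data structures.

/* Hypothetical sketch of the skew check in clocksource_watchdog():
 * the clocksource is marked unstable when the two nanosecond views of
 * the same interval differ by more than the combined margins. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sample {
	int64_t cs_nsec;	/* interval per the clocksource under test */
	int64_t wd_nsec;	/* same interval per the watchdog clocksource */
	uint32_t cs_margin;	/* cs->uncertainty_margin equivalent, ns */
	uint32_t wd_margin;	/* watchdog->uncertainty_margin equivalent, ns */
};

static bool skew_too_large(const struct sample *s)
{
	int64_t md = (int64_t)s->cs_margin + s->wd_margin;
	int64_t delta = s->cs_nsec - s->wd_nsec;

	if (delta < 0)
		delta = -delta;
	return delta > md;
}

int main(void)
{
	/* ~0.5 s interval with 600 us of apparent skew vs. two invented
	 * 125 us margins: well outside the band, so it would be rejected. */
	struct sample s = {
		.cs_nsec = 500600000,
		.wd_nsec = 500000000,
		.cs_margin = 125000,
		.wd_margin = 125000,
	};

	printf("skew %" PRId64 " ns vs. margin %u ns: %s\n",
	       s.cs_nsec - s.wd_nsec, s.cs_margin + s.wd_margin,
	       skew_too_large(&s) ? "mark unstable" : "within margin");
	return 0;
}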
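
Related to that band, __clocksource_update_freq_scale() now derives a default uncertainty_margin from the clock period, NSEC_PER_SEC / (scale * freq), and never lets it drop below 2 * WATCHDOG_MAX_SKEW. A worked example of that clamp follows; the 62 us skew figure and the two frequencies are assumptions (the real bound depends on HZ and the CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option).

/* Worked example of the default uncertainty margin picked in
 * __clocksource_update_freq_scale(); the 62 us skew figure and the
 * example frequencies are assumptions, not the kernel's constants. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC		1000000000ULL
#define ASSUMED_MAX_SKEW_NS	62000ULL	/* placeholder for WATCHDOG_MAX_SKEW */

static uint64_t default_margin(uint64_t scale, uint64_t freq)
{
	uint64_t margin = NSEC_PER_SEC / (scale * freq);	/* one clock period */

	if (margin < 2 * ASSUMED_MAX_SKEW_NS)			/* lower bound */
		margin = 2 * ASSUMED_MAX_SKEW_NS;
	return margin;
}

int main(void)
{
	/* Fast counter: sub-ns period, so the lower bound applies.
	 * Very slow counter: its 500 us period already exceeds the bound. */
	printf("3 GHz -> %llu ns\n", (unsigned long long)default_margin(1, 3000000000ULL));
	printf("2 kHz -> %llu ns\n", (unsigned long long)default_margin(1, 2000));
	return 0;
}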
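
The suspend-time path (clocksource_start_suspend_timing() / clocksource_stop_suspend_timing()) and the new cycles_to_nsec_safe() helper both rest on the same wrap-safe masked delta between two raw counter readings. A stripped-down sketch of that step, with an invented 32-bit, 24 MHz counter:

/* Hypothetical sketch of the wrap-safe "two readings -> nanoseconds"
 * step behind cycles_to_nsec_safe() and the suspend timing helpers;
 * the 32-bit mask and the mult/shift pair are invented values. */
#include <stdint.h>
#include <stdio.h>

struct fake_cs {
	uint64_t mask;		/* counter width */
	uint32_t mult, shift;	/* cycles -> ns scaling factors */
};

static uint64_t cycles_to_nsec(const struct fake_cs *cs, uint64_t start, uint64_t end)
{
	/* Two's-complement subtraction under the mask survives one wrap. */
	uint64_t delta = (end - start) & cs->mask;

	return (delta * cs->mult) >> cs->shift;
}

int main(void)
{
	/* 32-bit counter at 24 MHz; mult/shift roughly encode 1000/24 ns/tick. */
	struct fake_cs cs = { .mask = 0xffffffff, .mult = 699050667, .shift = 24 };
	uint64_t suspend_start = 0xfffff000;	/* read just before a wrap */
	uint64_t resume_now    = 0x00002000;	/* read shortly after the wrap */

	printf("suspended for ~%llu ns\n",
	       (unsigned long long)cycles_to_nsec(&cs, suspend_start, resume_now));
	return 0;
}

The masked delta of 0x3000 cycles comes out to roughly 512 microseconds here, even though the raw end reading is numerically smaller than the start reading.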
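
Finally, clocks_calc_max_nsecs() caps max_cycles at the largest count whose scaled product still fits in 64 bits (and at the counter mask), converts that with the most pessimistic mult, and then returns only half of the result so that delayed timers or misbehaving hardware can be caught before the math overflows. The figures in the sketch below (the mult/shift pair from the 24 MHz example above, an ~11% maxadj, a 32-bit mask) are again only assumptions.

/* Illustration of the overflow cap and the 50% safety margin applied in
 * clocks_calc_max_nsecs(); the mult/shift/maxadj/mask values are made up. */
#include <stdint.h>
#include <stdio.h>

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
			       uint64_t mask, uint64_t *max_cyc)
{
	uint64_t max_cycles, max_nsecs;

	/* Largest cycle count whose product with (mult + maxadj) fits in
	 * 64 bits, further limited by the width of the counter itself. */
	max_cycles = UINT64_MAX / (mult + maxadj);
	if (max_cycles > mask)
		max_cycles = mask;

	/* Convert with the smallest plausible mult, then keep only half. */
	max_nsecs = (max_cycles * (mult - maxadj)) >> shift;
	if (max_cyc)
		*max_cyc = max_cycles;
	return max_nsecs >> 1;
}

int main(void)
{
	uint64_t max_cyc;
	uint64_t max_ns = calc_max_nsecs(699050667, 24, 76895573, 0xffffffff, &max_cyc);

	printf("max_cycles=0x%llx max_idle~=%llu ns\n",
	       (unsigned long long)max_cyc, (unsigned long long)max_ns);
	return 0;
}

For this invented 32-bit, 24 MHz counter, which wraps after roughly 179 seconds, the helper reports on the order of 80 seconds of safe idle time.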
