diff options
Diffstat (limited to 'kernel/time/timekeeping.c')
| -rw-r--r-- | kernel/time/timekeeping.c | 1582 |
1 files changed, 1096 insertions, 486 deletions
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5579ead449f2..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> @@ -25,13 +26,16 @@ #include <linux/audit.h> #include <linux/random.h> +#include <vdso/auxclock.h> + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) -#define TK_MIRROR (1 << 1) -#define TK_CLOCK_WAS_SET (1 << 2) +#define TK_CLOCK_WAS_SET (1 << 1) + +#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ @@ -41,20 +45,49 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -DEFINE_RAW_SPINLOCK(timekeeper_lock); - /* * The most important data for readout fits into a single 64 byte * cache line. 
*/ -static struct { +struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), -}; + struct timekeeper shadow_timekeeper; + raw_spinlock_t lock; +} ____cacheline_aligned; + +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif -static struct timekeeper shadow_timekeeper; +static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -114,6 +147,46 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + +unsigned long timekeeper_lock_irqsave(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&tk_core.lock, flags); + return flags; +} + +void timekeeper_unlock_irqrestore(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&tk_core.lock, flags); +} + +/* + * Multigrain timestamps require tracking the latest fine-grained 
timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. If + * such an event occurs, a timestamp can appear to be earlier than a previous one. + */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -135,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. 
+ */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -146,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -161,13 +259,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec64_to_ktime(tmp); - tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); + WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. @@ -184,7 +284,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. 
- * This isn't necessary to use when holding the timekeeper_lock or doing + * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ @@ -195,106 +295,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - 
tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) -{ - struct timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. - */ - if (unlikely((~delta & mask) < (mask >> 3))) { - tk->underflow_seen = 1; - delta = 0; - } - - /* Cap delta value to the max_cycles values to avoid mult overflows */ - if (unlikely(delta > max)) { - tk->overflow_seen = 1; - delta = tkr->clock->max_cycles; - } - - return delta; -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) -{ - u64 cycle_now, delta; - - /* read clocksource */ - cycle_now = tk_clock_read(tkr); - - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - - return delta; -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -370,32 +370,38 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. 
*/ - -static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) +static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { - u64 nsec; - - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; - - return nsec; + return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { - u64 delta; + /* Calculate the delta since the last update_wall_time() */ + u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; + + /* + * This detects both negative motion and the case where the delta + * overflows the multiplication with tkr->mult. + */ + if (unlikely(delta > tkr->clock->max_cycles)) { + /* + * Handle clocksource inconsistency between CPUs to prevent + * time from going backwards by checking for the MSB of the + * mask being set in the delta. + */ + if (delta & ~(mask >> 1)) + return tkr->xtime_nsec >> tkr->shift; + + return delta_to_ns_safe(tkr, delta); + } - delta = timekeeping_get_delta(tkr); - return timekeeping_delta_to_ns(tkr, delta); + return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { - u64 delta; - - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** @@ -406,7 +412,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * Employ the latch technique; see @raw_write_seqcount_latch. 
+ * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a @@ -419,24 +425,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); -} - -static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) -{ - u64 delta, cycles = tk_clock_read(tkr); - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) @@ -446,10 +446,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount_latch(&tkf->seq); + seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += fast_tk_get_delta_ns(tkr); + now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -520,13 +520,13 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); + * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * - * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * The caveats vs. 
timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) @@ -554,91 +554,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + delta = timekeeping_get_ns(tkr); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. 
- * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
* @@ -679,13 +618,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; int ret; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -698,14 +635,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - - return ret; + guard(raw_spinlock_irqsave)(&tk_core.lock); + return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); @@ -714,13 +645,25 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* + * Leap state update for both shadow and the real timekeeper + * Separate to spare a full memcpy() of the timekeeper. 
+ */ +static void tk_update_leap_state_all(struct tk_data *tkd) +{ + write_seqcount_begin(&tkd->seq); + tk_update_leap_state(&tkd->shadow_timekeeper); + tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; + write_seqcount_end(&tkd->seq); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -753,34 +696,62 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } -/* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, unsigned int action) +/* + * Restore the shadow timekeeper from the real timekeeper. + */ +static void timekeeping_restore_shadow(struct tk_data *tkd) +{ + lockdep_assert_held(&tkd->lock); + memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); +} + +static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { + struct timekeeper *tk = &tkd->shadow_timekeeper; + + lockdep_assert_held(&tkd->lock); + + /* + * Block out readers before running the updates below because that + * updates VDSO and other time related infrastructure. Not blocking + * the readers might let a reader see time going backwards when + * reading from the VDSO after the VDSO update and then reading in + * the kernel from the timekeeper before that got updated. 
+ */ + write_seqcount_begin(&tkd->seq); + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; + /* - * The mirroring of the data to the shadow-timekeeper needs - * to happen last here to ensure we don't over-write the - * timekeeper structure on the next update with stale data + * Update the real timekeeper. + * + * We could avoid this memcpy() by switching pointers, but that has + * the downside that the reader side does not longer benefit from + * the cacheline optimized data layout of the timekeeper and requires + * another indirection. 
*/ - if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); + memcpy(&tkd->timekeeper, tk, sizeof(*tk)); + write_seqcount_end(&tkd->seq); } /** @@ -796,14 +767,21 @@ static void timekeeping_forward_now(struct timekeeper *tk) u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; - tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; - tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + while (delta > 0) { + u64 max = tk->tkr_mono.clock->max_cycles; + u64 incr = delta < max ? delta : max; - tk_normalize_xtime(tk); + tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; + tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; + tk_normalize_xtime(tk); + delta -= incr; + } + tk_update_coarse_nsecs(tk); } /** @@ -900,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -909,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -928,6 +906,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) unsigned int seq; ktime_t tconv; + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and + * tk_update_sleep_time(). 
+ */ + return ktime_add(tmono, READ_ONCE(*offset)); + } + do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); @@ -1037,9 +1023,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1058,6 +1049,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1072,6 +1064,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); @@ -1079,6 +1073,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); @@ -1180,17 +1175,121 @@ static int 
adjust_historical_crosststamp(struct system_time_snapshot *history, } /* - * cycle_between - true if test occurs chronologically between before and after + * timestamp_in_interval - true if ts is chronologically in [start, end] + * + * True if ts occurs chronologically at or after start, and before or at end. */ -static bool cycle_between(u64 before, u64 test, u64 after) +static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { - if (test > before && test < after) + if (ts >= start && ts <= end) return true; - if (test < before && before > after) + if (start > end && (ts >= start || ts <= end)) return true; return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. 
Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. 
+ */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1210,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; @@ -1232,11 +1331,12 @@ int get_device_system_crosststamp(int (*get_time_fn) return ret; /* - * Verify that the clocksource associated with the captured - * system counter value is the same as the currently installed - * timekeeper clocksource + * Verify that the clocksource ID associated with the captured + * system counter value is the same as for the currently + * installed timekeeper clocksource */ - if (tk->tkr_mono.clock != system_counterval.cs) + if (system_counterval.cs_id == CSID_GENERIC || + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1246,7 +1346,7 @@ int get_device_system_crosststamp(int (*get_time_fn) */ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; - if (!cycle_between(interval_start, cycles, now)) { + if (!timestamp_in_interval(interval_start, now, cycles)) 
{ clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; @@ -1259,10 +1359,8 @@ int get_device_system_crosststamp(int (*get_time_fn) tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, - system_counterval.cycles); - nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, - system_counterval.cycles); + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); @@ -1277,13 +1375,13 @@ int get_device_system_crosststamp(int (*get_time_fn) bool discontinuity; /* - * Check that the counter value occurs after the provided + * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || - !cycle_between(history_begin->cycles, - system_counterval.cycles, cycles) || + !timestamp_in_interval(history_begin->cycles, + cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; @@ -1304,6 +1402,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. 
+ */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -1311,89 +1433,102 @@ EXPORT_SYMBOL_GPL(get_device_system_crosststamp); */ int do_settimeofday64(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; - unsigned long flags; - int ret = 0; if (!timespec64_valid_settod(ts)) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - - xt = tk_xtime(tk); - ts_delta = timespec64_sub(*ts, xt); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { - ret = -EINVAL; - goto out; - } + timekeeping_forward_now(tks); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + xt = tk_xtime(tks); + ts_delta = timespec64_sub(*ts, xt); - tk_set_xtime(tk, ts); -out: - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); + tk_set_xtime(tks, ts); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) { - audit_tk_injoffset(ts_delta); - add_device_randomness(ts, sizeof(*ts)); - } - - return ret; + audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + return 0; } EXPORT_SYMBOL(do_settimeofday64); +static inline bool 
timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; + struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; - int ret = 0; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + timekeeping_forward_now(tks); - timekeeping_forward_now(tk); + if (timekeeper_is_core_tk(tks)) { + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), *ts); - if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - ret = -EINVAL; - goto error; + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + 
return -EINVAL; + } + tk_update_aux_offs(tks, offs); } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + return 0; +} -error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); - + if (!ret) + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1447,43 +1582,36 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *new, *old = NULL; - unsigned long flags; - bool change = false; - - new = (struct clocksource *) data; + struct clocksource *new = data, *old = NULL; /* - * If the cs is in module, get a module reference. Succeeds - * for built-in code (owner == NULL) as well. + * If the clocksource is in a module, get a module reference. + * Succeeds for built-in code (owner == NULL) as well. Abort if the + * reference can't be acquired. 
*/ - if (try_module_get(new->owner)) { - if (!new->enable || new->enable(new) == 0) - change = true; - else - module_put(new->owner); - } + if (!try_module_get(new->owner)) + return 0; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + /* Abort if the device can't be enabled */ + if (new->enable && new->enable(new) != 0) { + module_put(new->owner); + return 0; + } - timekeeping_forward_now(tk); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - if (change) { - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); + timekeeping_forward_now(tks); + old = tks->tkr_mono.clock; + tk_setup_internals(tks, new); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_aux_update_clocksource(); if (old) { if (old->disable) old->disable(old); - module_put(old->owner); } @@ -1532,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... 
CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1608,6 +1769,14 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) +{ + raw_spin_lock_init(&tkd->lock); + seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; +} + /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * @@ -1632,9 +1801,11 @@ static bool persistent_clock_exists; void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; - struct timekeeper *tk = &tk_core.timekeeper; + struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - unsigned long flags; + + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1654,24 +1825,21 @@ void __init timekeeping_init(void) */ wall_to_mono = timespec64_sub(boot_offset, wall_time); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + guard(raw_spinlock_irqsave)(&tk_core.lock); + ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(tk, clock); + tk_setup_internals(tks, clock); - tk_set_xtime(tk, &wall_time); - tk->raw_sec = 0; + tk_set_xtime(tks, &wall_time); + tks->raw_sec = 0; - tk_set_wall_to_mono(tk, wall_to_mono); + tk_set_wall_to_mono(tks, wall_to_mono); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - 
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ @@ -1749,22 +1917,14 @@ bool timekeeping_rtc_skipsuspend(void) */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - suspend_timing_needed = false; + scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - timekeeping_forward_now(tk); - - __timekeeping_inject_sleeptime(tk, delta); - - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + suspend_timing_needed = false; + timekeeping_forward_now(tks); + __timekeeping_inject_sleeptime(tks, delta); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); @@ -1776,20 +1936,19 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) */ void timekeeping_resume(void) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr_mono.clock; - unsigned long flags; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; - u64 cycle_now, nsec; bool inject_sleeptime = false; + u64 cycle_now, nsec; + unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -1803,7 +1962,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if 
there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = tk_clock_read(&tk->tkr_mono); + cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); @@ -1815,18 +1974,17 @@ void timekeeping_resume(void) if (inject_sleeptime) { suspend_timing_needed = false; - __timekeeping_inject_sleeptime(tk, &ts_delta); + __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ - tk->tkr_mono.cycle_last = cycle_now; - tk->tkr_raw.cycle_last = cycle_now; + tks->tkr_mono.cycle_last = cycle_now; + tks->tkr_raw.cycle_last = cycle_now; - tk->ntp_error = 0; + tks->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); @@ -1836,13 +1994,18 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 delta, delta_delta; - static struct timespec64 old_delta; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; struct clocksource *curr_clock; + unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); @@ -1857,9 +2020,8 @@ int timekeeping_suspend(void) suspend_timing_needed = true; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - timekeeping_forward_now(tk); + raw_spin_lock_irqsave(&tk_core.lock, flags); + timekeeping_forward_now(tks); timekeeping_suspended = 1; /* @@ -1867,8 +2029,8 @@ int 
timekeeping_suspend(void) * just read from the current clocksource. Save this to potentially * use in suspend timing. */ - curr_clock = tk->tkr_mono.clock; - cycle_now = tk->tkr_mono.cycle_last; + curr_clock = tks->tkr_mono.clock; + cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { @@ -1878,7 +2040,7 @@ int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -1893,10 +2055,9 @@ int timekeeping_suspend(void) } } - timekeeping_update(tk, TK_MIRROR); - halt_fast_timekeeper(tk); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, 0); + halt_fast_timekeeper(tks); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); @@ -1905,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); @@ -2001,16 +2171,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void 
timekeeping_adjust(struct timekeeper *tk, s64 offset) { + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ - if (likely(tk->ntp_tick == ntp_tick_length())) { + if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { - tk->ntp_tick = ntp_tick_length(); + tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } @@ -2081,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2147,30 +2318,25 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *real_tk = &tk_core.timekeeper; - struct timekeeper *tk = &shadow_timekeeper; - u64 offset; - int shift = 0, maxshift; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); + int shift = 0, maxshift; + u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - goto out; + return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), - tk->tkr_mono.cycle_last, tk->tkr_mono.mask); - + tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) 
- goto out; - - /* Do some additional sanity checking */ - timekeeping_check_update(tk, offset); + return false; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -2183,11 +2349,10 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, - &clock_set); + offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } @@ -2201,35 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&tk_core.seq); /* - * Update the real timekeeper. - * - * We could avoid this memcpy by switching pointers, but that - * requires changes to all other timekeeper usage sites as - * well, i.e. move the timekeeper pointer getter into the - * spinlocked/seqcount protected sections. And we trade this - * memcpy under the tk_core.seq against one before we start - * updating. + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time */ - timekeeping_update(tk, clock_set); - memcpy(real_tk, tk, sizeof(*tk)); - /* The memcpy must come last. Do not put anything here! 
*/ - write_seqcount_end(&tk_core.seq); -out: - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2260,11 +2425,99 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. 
+ */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime_coarse(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. 
+ */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. 
+ */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; @@ -2274,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); @@ -2339,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2398,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2416,88 +2683,431 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct timekeeper *tk = &tk_core.timekeeper; - struct 
audit_ntp_data ad; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; - unsigned long flags; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + + add_device_randomness(&ts, sizeof(ts)); + + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; + if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; +} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tk, tai); - 
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - clock_set = true; - } - tk_update_leap_state(tk); +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - audit_ntp_log(&ad); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + audit_ntp_log(&result.ad); - if (clock_set) - clock_was_set(CLOCK_REALTIME); + if (result.clock_set) + clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. 
+ */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - + guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. 
+ */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) 
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+	struct timekeeper *aux_tk;
+	unsigned int seq;
+	ktime_t base;
+	u64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+	/* aux_get_tk_data() provided no timekeeper data for @id */
+	if (!aux_tkd)
+		return false;
+	/*
+	 * Read the live timekeeper inside a sequence count read section.
+	 * The loop is repeated as long as read_seqcount_retry() asks for
+	 * it. If the clock is not marked valid the function bails out and
+	 * @kt is left untouched.
+	 */
+	aux_tk = &aux_tkd->timekeeper;
+	do {
+		seq = read_seqcount_begin(&aux_tkd->seq);
+		if (!aux_tk->clock_valid)
+			return false;
+		/* Base time of tkr_mono shifted by the auxiliary clock offset */
+		base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+		nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+	} while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+	*kt = ktime_add_ns(base, nsecs);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Wrapper around ktime_get_aux() which converts the result with
+ * ktime_to_timespec64(). @ts is only written on success.
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+	ktime_t now;
+
+	if (!ktime_get_aux(id, &now))
+		return false;
+	*ts = ktime_to_timespec64(now);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+/*
+ * clock_getres callback of clock_aux: Store aux_clock_resolution_ns() in
+ * @tp, split into seconds and nanoseconds.
+ *
+ * Returns: 0 on success, -ENODEV if @id is rejected by clockid_aux_valid().
+ */
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+	if (!clockid_aux_valid(id))
+		return -ENODEV;
+
+	tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+	tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+	return 0;
+}
+/*
+ * clock_get_timespec callback of clock_aux: Map the boolean result of
+ * ktime_get_aux_ts64() to 0 (success) or -ENODEV (failure).
+ */
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+	return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+/*
+ * clock_set callback of clock_aux: Make auxiliary clock @id read @tnew
+ *
+ * The new time is applied by recalculating the offset of the auxiliary
+ * clock as the delta between @tnew and the current "monotonic" time of the
+ * shadow timekeeper. The result is handed to
+ * timekeeping_update_from_shadow().
+ *
+ * Returns: 0 on success, -EINVAL if @tnew is rejected by
+ * timespec64_valid_settod(), -ENODEV if there is no timekeeper data for
+ * @id or the clock is not marked valid.
+ */
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+	struct timekeeper *aux_tks;
+	ktime_t tnow, nsecs;
+
+	if (!timespec64_valid_settod(tnew))
+		return -EINVAL;
+	if (!aux_tkd)
+		return -ENODEV;
+
+	aux_tks = &aux_tkd->shadow_timekeeper;
+	/*
+	 * Everything below operates on the shadow timekeeper with the per
+	 * clock lock held. The validity check is done under the lock as
+	 * well.
+	 */
+	guard(raw_spinlock_irq)(&aux_tkd->lock);
+	if (!aux_tks->clock_valid)
+		return -ENODEV;
+
+	/* Forward the timekeeper base time */
+	timekeeping_forward_now(aux_tks);
+	/*
+	 * Get the updated base time. tkr_mono.base has not been
+	 * updated yet, so do that first. That makes the update
+	 * in timekeeping_update_from_shadow() redundant, but
+	 * that's harmless. After that @tnow can be calculated
+	 * by using tkr_mono::cycle_last, which has been set
+	 * by timekeeping_forward_now().
+	 */
+	tk_update_ktime_data(aux_tks);
+	nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+	tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+	/*
+	 * Calculate the new AUX offset as delta to @tnow ("monotonic").
+	 * That avoids all the tk::xtime back and forth conversions as
+	 * xtime ("realtime") is not applicable for auxiliary clocks and
+	 * kept in sync with "monotonic".
+	 */
+	tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
+
+	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+	return 0;
+}
+/*
+ * clock_adj callback of clock_aux: Hand @txc to __do_adjtimex() for the
+ * timekeeper data of auxiliary clock @id.
+ *
+ * Returns: -ENODEV if there is no timekeeper data for @id, otherwise the
+ * return value of __do_adjtimex().
+ */
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+	struct adjtimex_result result = { };
+
+	if (!aux_tkd)
+		return -ENODEV;
+
+	/*
+	 * @result is ignored as there are currently neither hrtimers nor
+	 * an RTC related to auxiliary clocks.
+	 */
+	return __do_adjtimex(aux_tkd, txc, &result);
+}
+/* Callback table which hooks up the functions above for the auxiliary clocks */
+const struct k_clock clock_aux = {
+	.clock_getres = aux_get_res,
+	.clock_get_timespec = aux_get_timespec,
+	.clock_set = aux_clock_set,
+	.clock_adj = aux_clock_adj,
+};
+/*
+ * Set up the shadow timekeeper of auxiliary clock @id from scratch, based
+ * on the clocksource referenced by tk_core.timekeeper.tkr_raw, mark it
+ * valid and hand it to timekeeping_update_from_shadow().
+ *
+ * NOTE(review): The aux_get_tk_data() result is used without a NULL
+ * check - presumably callers only pass valid ids, confirm.
+ */
+static void aux_clock_enable(clockid_t id)
+{
+	struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+	struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+	/* Prevent the core timekeeper from changing. */
+	guard(raw_spinlock_irq)(&tk_core.lock);
+
+	/*
+	 * Setup the auxiliary clock assuming that the raw core timekeeper
+	 * clock frequency conversion is close enough. Userspace has to
+	 * adjust for the deviation via clock_adjtime(2).
+	 *
+	 * The per clock lock is taken while the core timekeeper lock is
+	 * held, i.e. it nests inside of it.
+	 */
+	guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+	/* Remove leftovers of a previous registration */
+	memset(aux_tks, 0, sizeof(*aux_tks));
+	/*
+	 * Restore the timekeeper id, which was wiped by the memset() above,
+	 * from the live timekeeper
+	 */
+	aux_tks->id = aux_tkd->timekeeper.id;
+	/* Setup the timekeeper based on the current system clocksource */
+	tk_setup_internals(aux_tks, tkr_raw->clock);
+
+	/* Mark it valid and set it live */
+	aux_tks->clock_valid = true;
+	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+/*
+ * Mark the shadow timekeeper of auxiliary clock @id invalid under the per
+ * clock lock and hand it to timekeeping_update_from_shadow().
+ *
+ * NOTE(review): As in aux_clock_enable() the aux_get_tk_data() result is
+ * used without a NULL check - confirm callers only pass valid ids.
+ */
+static void aux_clock_disable(clockid_t id)
+{
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+	guard(raw_spinlock_irq)(&aux_tkd->lock);
+	aux_tkd->shadow_timekeeper.clock_valid = false;
+	timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+/* Serializes the enable/disable handling in aux_clock_enable_store() */
+static DEFINE_MUTEX(aux_clock_mutex);
+/*
+ * sysfs store handler: Enable or disable the auxiliary clock whose index
+ * is derived from the name of @kobj.
+ *
+ * The aux_timekeepers bit of the clock is set after enabling it and
+ * cleared after disabling it.
+ *
+ * Returns: @count on success (also when the clock already is in the
+ * requested state), -EPERM without CAP_SYS_TIME, -EINVAL if @buf is not
+ * accepted by kstrtobool().
+ */
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	/*
+	 * Lazy atoi() as name is "0..7": The low three bits of the ASCII
+	 * digits '0' to '7' are the numerical value.
+	 */
+	int id = kobj->name[0] & 0x7;
+	bool enable;
+
+	if (!capable(CAP_SYS_TIME))
+		return -EPERM;
+
+	if (kstrtobool(buf, &enable) < 0)
+		return -EINVAL;
+
+	guard(mutex)(&aux_clock_mutex);
+	if (enable == test_bit(id, &aux_timekeepers))
+		return count;
+
+	if (enable) {
+		aux_clock_enable(CLOCK_AUX + id);
+		set_bit(id, &aux_timekeepers);
+	} else {
+		aux_clock_disable(CLOCK_AUX + id);
+		clear_bit(id, &aux_timekeepers);
+	}
+	return count;
+}
+/*
+ * sysfs show handler: Emit "1" if the bit of this clock is set in a
+ * READ_ONCE() snapshot of aux_timekeepers, "0" otherwise. aux_clock_mutex
+ * is not taken here.
+ */
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	unsigned long active = READ_ONCE(aux_timekeepers);
+	/* Lazy atoi() as name is "0..7", see aux_clock_enable_store() */
+	int id = kobj->name[0] & 0x7;
+
+	return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+/* Read/write attribute backed by aux_clock_enable_show() and aux_clock_enable_store() */
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+	&aux_clock_enable_attr.attr,
+	NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+	.attrs = aux_clock_enable_attrs,
+};
+/*
+ * Create the sysfs hierarchy for the auxiliary clocks: a "time" kobject
+ * below kernel_kobj, an "aux_clocks" kobject below that and one kobject
+ * per auxiliary clock below "aux_clocks". Each per clock kobject carries
+ * the enable attribute group.
+ *
+ * Returns: 0 on success, -ENOMEM if a kobject cannot be created, otherwise
+ * the error code of sysfs_create_group().
+ *
+ * NOTE(review): The error path puts only the "aux_clocks" and "time"
+ * kobjects. Per clock kobjects which were created before the failure are
+ * not put here - confirm that they do not leak. If creating "aux_clocks"
+ * fails, kobject_put() is called with a NULL pointer - presumably that is
+ * tolerated, confirm.
+ */
+static int __init tk_aux_sysfs_init(void)
+{
+	struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+	int ret = -ENOMEM;
+
+	if (!tko)
+		return ret;
+
+	auxo = kobject_create_and_add("aux_clocks", tko);
+	if (!auxo)
+		goto err_clean;
+	/*
+	 * One kobject per auxiliary clock. The name is the single character
+	 * '0' + i followed by a zero byte, which matches the lazy atoi() in
+	 * the attribute handlers as long as the index fits into three bits.
+	 */
+	for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+		char id[2] = { [0] = '0' + i, };
+		struct kobject *clk = kobject_create_and_add(id, auxo);
+
+		if (!clk) {
+			ret = -ENOMEM;
+			goto err_clean;
+		}
+
+		ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+		if (ret)
+			goto err_clean;
+	}
+	return 0;
+
+err_clean:
+	kobject_put(auxo);
+	kobject_put(tko);
+	return ret;
+}
+late_initcall(tk_aux_sysfs_init);
+/*
+ * Run tkd_basic_setup() on every timekeeper_data[] slot from
+ * TIMEKEEPER_AUX_FIRST to TIMEKEEPER_AUX_LAST (inclusive).
+ */
+static __init void tk_aux_setup(void)
+{
+	for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+		tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */ |
