Diffstat (limited to 'kernel/time')
38 files changed, 7429 insertions, 3275 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a41753be1a2b..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool -# Clocksources require validation of the clocksource against the last -# cycle update - x86/TSC misfeature -config CLOCKSOURCE_VALIDATE_LAST_CYCLE - bool - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -39,6 +34,11 @@ config GENERIC_CLOCKEVENTS_BROADCAST bool depends on GENERIC_CLOCKEVENTS +# Handle broadcast in default_idle_call() +config GENERIC_CLOCKEVENTS_BROADCAST_IDLE + bool + depends on GENERIC_CLOCKEVENTS_BROADCAST + # Automatically adjust the min. reprogramming time for # clock event device config GENERIC_CLOCKEVENTS_MIN_ADJUST @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -197,13 +197,28 @@ config HIGH_RES_TIMERS the size of the kernel image. config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in μs)" + int "Clocksource watchdog maximum allowable skew (in microseconds)" depends on CLOCKSOURCE_WATCHDOG range 50 1000 - default 100 + default 125 help Specify the maximum amount of allowable watchdog skew in microseconds before reporting the clocksource to be unstable. + The default is based on a half-second clocksource watchdog + interval and NTP's maximum frequency drift of 500 parts + per million. If the clocksource is good enough for NTP, + it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. 
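A quick sanity check of the numbers in the CLOCKSOURCE_WATCHDOG_MAX_SKEW_US help text above, as plain user-space C (the 0.5 s figure is the watchdog interval defined later in clocksource.c; the snippet is illustrative and not part of the patch): 500 ppm of drift over one half-second interval accumulates to 250 us of skew.

#include <stdio.h>

int main(void)
{
        const long long interval_ns = 500000000LL;  /* 0.5 s watchdog interval */
        const long long drift_ppm   = 500;          /* NTP max frequency drift  */

        /* skew accumulated by a clock that is off by 500 ppm over one interval */
        long long skew_ns = interval_ns * drift_ppm / 1000000;

        printf("%lld ns (%lld us) per watchdog interval\n",
               skew_ns, skew_ns / 1000);            /* 250000 ns == 250 us */
        return 0;
}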
endmenu -endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 7e875e63ff3b..f7d52d9543cc 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,5 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += time.o timer.o hrtimer.o + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + +obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o @@ -17,7 +23,10 @@ endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +ifeq ($(CONFIG_SMP),y) + obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o +endif +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5897828b9d7e..069d93bfb0c7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base @@ -70,21 +70,17 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); -static int alarmtimer_rtc_add_device(struct device *dev, - struct class_interface *class_intf) +static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -102,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -135,7 +124,7 @@ static struct class_interface alarmtimer_rtc_interface = { static int alarmtimer_rtc_interface_setup(void) { - alarmtimer_rtc_interface.class = rtc_class; + alarmtimer_rtc_interface.class = &rtc_class; return class_interface_register(&alarmtimer_rtc_interface); } static void alarmtimer_rtc_interface_remove(void) @@ -198,28 +187,15 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - int ret = HRTIMER_NORESTART; - int restart = 
ALARMTIMER_NORESTART; - spin_lock_irqsave(&base->lock, flags); - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + alarmtimer_dequeue(base, alarm); if (alarm->function) - restart = alarm->function(alarm, base->get_ktime()); - - spin_lock_irqsave(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - alarmtimer_enqueue(base, alarm); - ret = HRTIMER_RESTART; - } - spin_unlock_irqrestore(&base->lock, flags); + alarm->function(alarm, base->get_ktime()); trace_alarmtimer_fired(alarm, base->get_ktime()); - return ret; - + return HRTIMER_NORESTART; } ktime_t alarm_expires_remaining(const struct alarm *alarm) @@ -242,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ @@ -265,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev) struct timerqueue_node *next; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + next = timerqueue_getnext(&base->timerqueue); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); @@ -291,6 +265,17 @@ static int alarmtimer_suspend(struct device *dev) rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); + + /* + * If the RTC alarm timer only supports a limited time offset, set the + * alarm time to the maximum supported value. + * The system may wake up earlier (possibly much earlier) than expected + * when the alarmtimer runs. This is the best the kernel can do if + * the alarmtimer exceeds the time that the rtc device can be programmed + * for. 
+ */ + min = rtc_bound_alarmtime(rtc, min); + now = ktime_add(now, min); /* Set alarm, if in the past reject suspend briefly to handle */ @@ -324,10 +309,9 @@ static int alarmtimer_resume(struct device *dev) static void __alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); - alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -340,10 +324,10 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { - hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); @@ -356,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -385,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -405,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -483,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -502,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -519,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if 
(clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** @@ -533,34 +511,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Return: whether the timer is to be restarted */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, - ktime_t now) +static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { - struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); - enum alarmtimer_restart result = ALARMTIMER_NORESTART; - unsigned long flags; - int si_private = 0; - - spin_lock_irqsave(&ptr->it_lock, flags); - - ptr->it_active = 0; - if (ptr->it_interval) - si_private = ++ptr->it_requeue_pending; + struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, si_private) && ptr->it_interval) { - /* - * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. - */ - ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); - ++ptr->it_requeue_pending; - ptr->it_active = 1; - result = ALARMTIMER_RESTART; - } - spin_unlock_irqrestore(&ptr->it_lock, flags); - - return result; + guard(spinlock_irqsave)(&ptr->it_lock); + posix_timer_queue_signal(ptr); } /** @@ -721,18 +677,14 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer - * - * Return: ALARMTIMER_NORESTART */ -static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, - ktime_t now) +static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { - struct task_struct *task = (struct task_struct *)alarm->data; + struct task_struct *task = alarm->data; alarm->data = NULL; if (task) wake_up_process(task); - return ALARMTIMER_NORESTART; } /** @@ -784,10 +736,10 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { - hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup_on_stack(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } @@ -823,7 +775,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, struct restart_block *restart = ¤t->restart_block; struct alarm alarm; ktime_t exp; - int ret = 0; + int ret; if (!alarmtimer_get_rtcdev()) return -EOPNOTSUPP; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 960143b183cd..a59bc75ab7c5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev) #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST -/* Limit min_delta to a jiffie */ +/* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** @@ -337,13 +337,21 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, } /* - * Called after a notify add to make devices available which were - * released from the notifier call. + * Called after a clockevent has been added which might + * have replaced a current regular or broadcast device. 
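The alarmtimer.c hunks above replace open-coded spin_lock_irqsave()/spin_unlock_irqrestore() pairs with the cleanup.h based scope guards. A minimal sketch of the two forms used in this patch, with a hypothetical lock and counter that are not taken from the kernel:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);      /* hypothetical */
static int demo_count;

static int demo_read(void)
{
        /* unlocked automatically when the enclosing scope ends */
        guard(spinlock_irqsave)(&demo_lock);
        return demo_count;
}

static void demo_update(int v)
{
        /* critical section limited to the braced block */
        scoped_guard(spinlock_irqsave, &demo_lock) {
                demo_count = v;
        }
        /* lock already dropped here */
}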
A + * released normal device might be a suitable replacement + * for the current broadcast device. Similarly a released + * broadcast device might be a suitable replacement for a + * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; + /* + * Keep iterating as long as tick_check_new_device() + * replaces a device. + */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); @@ -610,39 +618,30 @@ void clockevents_resume(void) #ifdef CONFIG_HOTPLUG_CPU -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST /** - * tick_offline_cpu - Take CPU out of the broadcast mechanism + * tick_offline_cpu - Shutdown all clock events related + * to this CPU and take it out of the + * broadcast mechanism. * @cpu: The outgoing CPU * - * Called on the outgoing CPU after it took itself offline. + * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { - raw_spin_lock(&clockevents_lock); - tick_broadcast_offline(cpu); - raw_spin_unlock(&clockevents_lock); -} -# endif - -/** - * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu - * @cpu: The dead CPU - */ -void tick_cleanup_dead_cpu(int cpu) -{ struct clock_event_device *dev, *tmp; - unsigned long flags; - raw_spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock(&clockevents_lock); + + tick_broadcast_offline(cpu); + tick_shutdown(); - tick_shutdown(cpu); /* * Unregister the clock event devices which were - * released from the users in the notify chain. + * released above. */ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); + /* * Now check whether the CPU has left unused per cpu devices */ @@ -654,12 +653,13 @@ void tick_cleanup_dead_cpu(int cpu) list_del(&dev->list); } } - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + + raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS -static struct bus_type clockevents_subsys = { +static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; @@ -677,7 +677,7 @@ static ssize_t current_device_show(struct device *dev, raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) - count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index df922f49d171..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -22,6 +22,7 @@ #include "tick-internal.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; @@ -104,8 +105,8 @@ static void wdtest_ktime_clocksource_reset(void) static int wdtest_func(void *arg) { unsigned long j1, j2; + int i, max_retries; char *s; - int i; schedule_timeout_uninterruptible(holdoff * HZ); @@ -136,21 +137,23 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. 
*/ - for (i = 0; i <= max_cswd_read_retries + 1; i++) { - if (i <= 1 && i < max_cswd_read_retries) + max_retries = clocksource_get_max_watchdog_retry(); + for (i = 0; i <= max_retries + 1; i++) { + if (i <= 1 && i < max_retries) s = ""; - else if (i <= max_cswd_read_retries) + else if (i <= max_retries) s = ", expect message"; else s = ", expect clock skew"; - pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s); + pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); WRITE_ONCE(wdtest_ktime_read_ndelays, i); schedule_timeout_uninterruptible(2 * HZ); WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); - WARN_ON_ONCE((i <= max_cswd_read_retries) != + WARN_ON_ONCE((i <= max_retries) != !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); wdtest_ktime_clocksource_reset(); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9cf32ccda715..a1890a073196 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -20,6 +20,18 @@ #include "tick-internal.h" #include "timekeeping_internal.h" +static void clocksource_enqueue(struct clocksource *cs); + +static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) +{ + u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta); + + if (likely(delta < cs->max_cycles)) + return clocksource_cyc2ns(delta, cs->mult, cs->shift); + + return mul_u64_u32_shr(delta, cs->mult, cs->shift); +} + /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable @@ -96,8 +108,13 @@ static int finished_booting; static u64 suspend_start; /* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) + +/* * Threshold: 0.0312s, when doubled: 0.0625s. - * Also a default for cs->uncertainty_margin when registering clocks. */ #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) @@ -106,13 +123,30 @@ static u64 suspend_start; * clocksource surrounding a read of the clocksource being validated. * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us! + * + * In other words, by default, even if a clocksource is extremely + * precise (for example, with a sub-nanosecond period), the maximum + * permissible skew between the clocksource watchdog and the clocksource + * under test is not permitted to go below the 500ppm minimum defined + * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the + * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #else -#define MAX_SKEW_USEC 100 +#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) #endif +/* + * Default for maximum permissible skew when cs->uncertainty_margin is + * not specified, and the lower bound even when cs->uncertainty_margin + * is specified. This is also the default that is used when registering + * clocks with unspecified cs->uncertainty_margin, so this macro is used + * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. 
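cycles_to_nsec_safe() above routes every cycle-delta conversion through the usual mult/shift scaled math, falling back to mul_u64_u32_shr() once the delta exceeds max_cycles. A user-space model of the fast path, assuming an illustrative 1 MHz counter:

#include <stdio.h>
#include <stdint.h>

/* fast-path model of clocksource_cyc2ns(): ns = (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* assumed example: 1 MHz counter, so 1 tick = 1000 ns;
         * mult/shift chosen so that (1 * mult) >> shift == 1000 */
        uint32_t shift = 10;
        uint32_t mult  = 1000u << shift;        /* 1024000 */

        printf("%llu ns\n",
               (unsigned long long)cyc2ns(500, mult, shift));   /* 500000 */
        return 0;
}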
+ */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -126,6 +160,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; +static int64_t watchdog_max_interval; static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -138,12 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) } static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) static void clocksource_watchdog_work(struct work_struct *work) { @@ -163,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } +static void clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -205,9 +241,6 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -ulong max_cswd_read_retries = 2; -module_param(max_cswd_read_retries, ulong, 0644); -EXPORT_SYMBOL_GPL(max_cswd_read_retries); static int verify_n_cpus = 8; module_param(verify_n_cpus, int, 0644); @@ -219,11 +252,13 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { - unsigned int nretries; - u64 wd_end, wd_end2, wd_delta; + int64_t md = 2 * watchdog->uncertainty_margin; + unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; + u64 wd_end, wd_end2; - for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + max_retries = clocksource_get_max_watchdog_retry(); + for (nretries = 0; nretries <= max_retries; nretries++) { local_irq_disable(); *wdnow = watchdog->read(watchdog); *csnow = cs->read(cs); @@ -231,11 +266,9 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_end2 = watchdog->read(watchdog); local_irq_enable(); - wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); - wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, - watchdog->shift); - if (wd_delay <= WATCHDOG_MAX_SKEW) { - if (nretries > 1 || nretries >= max_cswd_read_retries) { + wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); + if (wd_delay <= md + cs->uncertainty_margin) { + if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } @@ -247,18 +280,17 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, * there is too much external interferences that cause * significant delay in reading both clocksource and watchdog. * - * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, - * report system busy, reinit the watchdog and skip the current + * If consecutive WD read-back delay > md, report + * system busy, reinit the watchdog and skip the current * watchdog test. 
*/ - wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); - wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); - if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); + if (wd_seq_delay > md) goto skip_test; } - pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", - smp_processor_id(), watchdog->name, wd_delay, nretries); + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", + smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); return WD_READ_UNSTABLE; skip_test: @@ -278,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. */ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); @@ -291,9 +323,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_first(cpu_online_mask); - if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpumask_set_cpu(cpu, &cpus_chosen); @@ -310,10 +340,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); + cpu = cpumask_random(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } @@ -341,16 +368,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -363,14 +392,14 @@ void clocksource_verify_percpu(struct clocksource *cs) delta = (csnow_end - csnow_mid) & cs->mask; if (delta < 0) cpumask_set_cpu(cpu, &cpus_ahead); - delta = clocksource_delta(csnow_end, csnow_begin, cs->mask); - cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); if (cs_nsec > cs_nsec_max) cs_nsec_max = cs_nsec; if (cs_nsec < cs_nsec_min) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", @@ -378,19 +407,28 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - 
pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + + static void clocksource_watchdog(struct timer_list *unused) { - u64 csnow, wdnow, cslast, wdlast, delta; + int64_t wd_nsec, cs_nsec, interval; + u64 csnow, wdnow, cslast, wdlast; int next_cpu, reset_pending; - int64_t wd_nsec, cs_nsec; struct clocksource *cs; enum wd_read_status read_ret; + unsigned long extra_wait = 0; u32 md; spin_lock(&watchdog_lock); @@ -410,13 +448,30 @@ static void clocksource_watchdog(struct timer_list *unused) read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - if (read_ret != WD_READ_SUCCESS) { - if (read_ret == WD_READ_UNSTABLE) - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + if (read_ret == WD_READ_UNSTABLE) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } + /* + * When WD_READ_SKIP is returned, it means the system is likely + * under very heavy load, where the latency of reading + * watchdog/clocksource is very big, and affect the accuracy of + * watchdog check. So give system some space and suspend the + * watchdog check for 5 minutes. + */ + if (read_ret == WD_READ_SKIP) { + /* + * As the watchdog timer will be suspended, and + * cs->last could keep unchanged for 5 minutes, reset + * the counters. + */ + clocksource_reset_watchdog(); + extra_wait = HZ * 300; + break; + } + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || atomic_read(&watchdog_reset_pending)) { @@ -426,12 +481,8 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); - wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, - watchdog->shift); - - delta = clocksource_delta(csnow, cs->cs_last, cs->mask); - cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); + cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); wdlast = cs->wd_last; /* save these in case we print them */ cslast = cs->cs_last; cs->cs_last = csnow; @@ -440,15 +491,44 @@ static void clocksource_watchdog(struct timer_list *unused) if (atomic_read(&watchdog_reset_pending)) continue; + /* + * The processing of timer softirqs can get delayed (usually + * on account of ksoftirqd not getting to run in a timely + * manner), which causes the watchdog interval to stretch. + * Skew detection may fail for longer watchdog intervals + * on account of fixed margins being used. + * Some clocksources, e.g. acpi_pm, cannot tolerate + * watchdog intervals longer than a few seconds. + */ + interval = max(cs_nsec, wd_nsec); + if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { + if (system_state > SYSTEM_SCHEDULING && + interval > 2 * watchdog_max_interval) { + watchdog_max_interval = interval; + pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", + cs_nsec, wd_nsec); + } + watchdog_timer.expires = jiffies; + continue; + } + /* Check the deviation from the watchdog clocksource. 
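The "Long readout interval" skip above compares the measured interval against WATCHDOG_INTERVAL_MAX_NS from the earlier hunk; plugging in the definitions (and assuming an even HZ) gives a cap of about one second regardless of HZ. A quick user-space check:

#include <stdio.h>

int main(void)
{
        const long long NSEC_PER_SEC = 1000000000LL;
        long hz = 250;                          /* assumed example HZ */
        long watchdog_interval = hz >> 1;       /* 0.5 s worth of jiffies */
        long long max_ns = (2LL * watchdog_interval) * (NSEC_PER_SEC / hz);

        printf("%lld ns\n", max_ns);            /* 1000000000, i.e. ~1 s */
        return 0;
}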
*/ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { + s64 cs_wd_msec; + s64 wd_msec; + u32 wd_rem; + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", smp_processor_id(), cs->name); pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, cs_nsec, csnow, cslast, cs->mask); + cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); + wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); + pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", + cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); if (curr_clocksource == cs) pr_warn(" '%s' is current clocksource.\n", cs->name); else if (curr_clocksource) @@ -503,16 +583,14 @@ static void clocksource_watchdog(struct timer_list *unused) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); /* * Arm timer if not already pending: could race with concurrent * pair clocksource_stop_watchdog() clocksource_start_watchdog(). */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL; + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; add_timer_on(&watchdog_timer, next_cpu); } out: @@ -533,18 +611,10 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } -static inline void clocksource_reset_watchdog(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &watchdog_list, wd_list) - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} - static void clocksource_resume_watchdog(void) { atomic_inc(&watchdog_reset_pending); @@ -630,7 +700,7 @@ static int __clocksource_watchdog_kthread(void) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); + clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -781,7 +851,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) */ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) { - u64 now, delta, nsec = 0; + u64 now, nsec = 0; if (!suspend_clocksource) return 0; @@ -796,12 +866,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) else now = suspend_clocksource->read(suspend_clocksource); - if (now > suspend_start) { - delta = clocksource_delta(now, suspend_start, - suspend_clocksource->mask); - nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult, - suspend_clocksource->shift); - } + if (now > suspend_start) + nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now); /* * Disable the suspend timer to save power if current clocksource is @@ -922,6 +988,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask, &cs->max_cycles); + + /* + * Threshold for detecting negative motion in clocksource_delta(). 
+ * + * Allow for 0.875 of the counter width so that overly long idle + * sleeps, which go slightly over mask/2, do not trigger the + * negative motion detection. + */ + cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3); } static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) @@ -1097,14 +1172,19 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } /* - * If the uncertainty margin is not specified, calculate it. - * If both scale and freq are non-zero, calculate the clock - * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, - * if either of scale or freq is zero, be very conservative and - * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the - * uncertainty margin. Allow stupidly small uncertainty margins - * to be specified by the caller for testing purposes, but warn - * to discourage production use of this capability. + * If the uncertainty margin is not specified, calculate it. If + * both scale and freq are non-zero, calculate the clock period, but + * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. + * However, if either of scale or freq is zero, be very conservative + * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value + * for the uncertainty margin. Allow stupidly small uncertainty + * margins to be specified by the caller for testing purposes, + * but warn to discourage production use of this capability. + * + * Bottom line: The sum of the uncertainty margins of the + * watchdog clocksource and the clocksource under test will be at + * least 500ppm by default. For more information, please see the + * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. */ if (scale && freq && !cs->uncertainty_margin) { cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); @@ -1187,34 +1267,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); -static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); -} - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - unsigned long flags; - - mutex_lock(&clocksource_mutex); - clocksource_watchdog_lock(&flags); - __clocksource_change_rating(cs, rating); - clocksource_watchdog_unlock(&flags); - - clocksource_select(); - clocksource_select_watchdog(false); - clocksource_suspend_select(false); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_change_rating); - /* * Unbind clocksource @cs. 
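The max_raw_delta assignment above approximates 0.875 of the counter width with three shifts (1/2 + 1/4 + 1/8). A user-space check for an assumed 32-bit clocksource mask:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t mask = 0xffffffffULL;          /* example: 32-bit clocksource */
        uint64_t max_raw_delta = (mask >> 1) + (mask >> 2) + (mask >> 3);

        /* 1/2 + 1/4 + 1/8 = 7/8 = 0.875 of the mask */
        printf("%llu of %llu (%.3f)\n",
               (unsigned long long)max_raw_delta,
               (unsigned long long)mask,
               (double)max_raw_delta / (double)mask);   /* ~0.875 */
        return 0;
}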
Called with clocksource_mutex held */ @@ -1285,7 +1337,7 @@ static ssize_t current_clocksource_show(struct device *dev, ssize_t count = 0; mutex_lock(&clocksource_mutex); - count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + count = sysfs_emit(buf, "%s\n", curr_clocksource->name); mutex_unlock(&clocksource_mutex); return count; @@ -1415,7 +1467,7 @@ static struct attribute *clocksource_attrs[] = { }; ATTRIBUTE_GROUPS(clocksource); -static struct bus_type clocksource_subsys = { +static const struct bus_type clocksource_subsys = { .name = "clocksource", .dev_name = "clocksource", }; @@ -1450,7 +1502,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strlcpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab6260..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -38,6 +38,7 @@ #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> +#include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> @@ -57,6 +58,9 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); + /* * The timer bases: * @@ -73,55 +77,46 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} /* * Functions and macros which are different for UP/SMP systems are kept in a @@ -144,11 +139,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -164,6 +154,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base) static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; @@ -181,27 +172,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will be issued to reprogram + * it as a last resort. 
+ */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -252,8 +270,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -262,8 +280,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -273,13 +290,9 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; @@ -346,7 +359,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -414,6 +427,11 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); +} + static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { @@ -425,28 +443,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); - -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode); - -void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) -{ - debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); - __hrtimer_init_sleeper(sl, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); - void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -456,17 +452,23 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void 
debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); +} + +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init_on_stack(timer); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, @@ -641,17 +643,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * Is the high resolution mode active ? */ -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } -static inline int hrtimer_hres_active(void) -{ - return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); -} - static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) @@ -675,7 +672,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, * set. So we'd effectively block all timers until the T2 event * fires. */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); @@ -727,8 +724,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -744,7 +739,7 @@ static void hrtimer_switch_to_hres(void) base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; - tick_setup_sched_timer(); + tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } @@ -785,13 +780,13 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. 
*/ - if (!__hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); - if (__hrtimer_hres_active(base)) + if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); @@ -948,7 +943,7 @@ void clock_was_set(unsigned int bases) cpumask_var_t mask; int cpu; - if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1013,26 +1008,29 @@ void hrtimers_resume_local(void) */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** - * hrtimer_forward - forward the timer expiry + * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. - * Returns the number of overruns. * - * Can be safely called from the callback function of @timer. If - * called from other contexts @timer must neither be enqueued nor - * running the callback and the caller needs to take care of - * serialization. + * .. note:: + * This only updates the timer expiry value and does not requeue the timer. + * + * There is also a variant of the function hrtimer_forward_now(). + * + * Context: Can be safely called from the callback function of @timer. If called + * from other contexts @timer must neither be enqueued nor running the + * callback and the caller needs to take care of serialization. * - * Note: This only updates the timer expiry value and does not requeue - * the timer. + * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { @@ -1075,13 +1073,13 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -1175,7 +1173,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) @@ -1213,6 +1211,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1224,10 +1223,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. 
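The rewritten hrtimer_forward() kerneldoc above stresses that forwarding only moves the expiry and does not requeue the timer. A minimal periodic-callback sketch (hypothetical driver code, not part of this series) showing the usual pairing with hrtimer_forward_now() and HRTIMER_RESTART:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static enum hrtimer_restart demo_tick(struct hrtimer *t)
{
        /* ... periodic work ... */

        /* push the expiry one period into the future; the return value
         * (ignored here) is the number of missed periods */
        hrtimer_forward_now(t, ms_to_ktime(10));

        /* returning HRTIMER_RESTART is what actually requeues the timer */
        return HRTIMER_RESTART;
}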
*/ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1241,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1256,8 +1261,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1347,11 +1371,13 @@ static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) + __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) + __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } @@ -1374,13 +1400,25 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. 
* * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1484,7 +1522,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!__hrtimer_hres_active(cpu_base)) + if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1507,7 +1545,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (__hrtimer_hres_active(cpu_base)) { + if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { @@ -1528,18 +1566,47 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; + switch (clock_id) { + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; + } +} - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); } - WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1572,11 +1639,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); + + if (WARN_ON_ONCE(!function)) + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; + else + ACCESS_PRIVATE(timer, function) = function; } /** - * hrtimer_init - initialize a timer to the given clock + * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized + * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, @@ -1586,13 +1659,32 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_setup(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup); + +/** + * hrtimer_setup_on_stack - initialize a timer on stack memory + * @timer: The timer to be initialized + * @function: the callback function + * @clock_id: The clock to be used + * @mode: The timer mode + * + * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack + * memory. + */ +void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); } -EXPORT_SYMBOL_GPL(hrtimer_init); +EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the @@ -1663,7 +1755,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. 
If the @@ -1753,7 +1845,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; @@ -1805,7 +1897,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1868,25 +1960,7 @@ retry: tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } - -/* called with interrupts disabled */ -static inline void __hrtimer_peek_ahead_timers(void) -{ - struct tick_device *td; - - if (!hrtimer_hres_active()) - return; - - td = this_cpu_ptr(&tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); -} - -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy @@ -1897,7 +1971,7 @@ void hrtimer_run_queues(void) unsigned long flags; ktime_t now; - if (__hrtimer_hres_active(cpu_base)) + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1918,7 +1992,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1956,7 +2030,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -1966,8 +2040,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -1989,29 +2063,27 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, * expiry. 
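The hrtimer_setup() family added above folds the callback assignment into initialization instead of letting callers poke timer->function directly. A hedged driver-style sketch of the resulting usage pattern (my_dev, my_timer_fn and the 10 ms period are made-up illustration, not taken from this patch):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct my_dev {
        struct hrtimer poll_timer;
};

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
        /* container_of(t, struct my_dev, poll_timer) to reach the device */
        return HRTIMER_NORESTART;
}

static void my_dev_init(struct my_dev *d)
{
        /*
         * Replaces the old two-step pattern:
         *   hrtimer_init(&d->poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         *   d->poll_timer.function = my_timer_fn;
         */
        hrtimer_setup(&d->poll_timer, my_timer_fn, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL);
        hrtimer_start(&d->poll_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}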
*/ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } /** - * hrtimer_init_sleeper - initialize sleeper to the given clock + * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, - enum hrtimer_mode mode) +void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); - + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { @@ -2072,9 +2144,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2086,14 +2157,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - u64 slack; - - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) - slack = 0; - hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + hrtimer_setup_sleeper_on_stack(&t, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -2106,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); @@ -2126,6 +2192,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2147,6 +2214,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? 
TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2171,6 +2239,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2178,7 +2255,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; - hrtimer_cpu_base_init_expiry_lock(cpu_base); + cpu_base->online = 1; return 0; } @@ -2214,48 +2291,33 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { + int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; - int i; - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); - /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - hrtimer_update_softirq_timer(new_base, false); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + old_base->online = 0; + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } @@ -2264,124 +2326,6 @@ int hrtimers_dead_cpu(unsigned int scpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } - -/** - * schedule_hrtimeout_range_clock - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * @clock_id: timer clock to be used - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, clockid_t clock_id) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. 
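The hrtimers_prepare_cpu()/hrtimers_cpu_starting()/hrtimers_cpu_dying() split above follows the usual prepare/online/teardown shape of CPU hotplug callbacks. As a generic, hedged sketch of how such callbacks are normally wired (dynamic states and made-up names; the timer core itself uses its own fixed hotplug states, not the ones below):

#include <linux/cpuhotplug.h>

static int my_prepare(unsigned int cpu)  { return 0; }
static int my_starting(unsigned int cpu) { return 0; }
static int my_dying(unsigned int cpu)    { return 0; }

static int __init my_hotplug_init(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "example:prepare",
                                my_prepare, NULL);
        if (ret < 0)
                return ret;

        /* startup runs on the CPU coming up, teardown on the CPU going down */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
                                my_starting, my_dying);
        return ret < 0 ? ret : 0;
}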
- */ - if (expires && *expires == 0) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - return -EINTR; - } - - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_sleeper_start_expires(&t, mode); - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. 
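The schedule_hrtimeout*() helpers removed from this file above keep the semantics documented in the deleted kernel-doc; they are relocated rather than dropped. A hedged usage sketch of the interface (the task state must be set before calling, exactly as the documentation above requires; headers are indicative):

#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep for up to 5 ms; returns 0 on timeout, -EINTR if woken early. */
static int wait_a_bit(void)
{
        ktime_t timeout = ms_to_ktime(5);

        set_current_state(TASK_INTERRUPTIBLE);
        return schedule_hrtimeout(&timeout, HRTIMER_MODE_REL);
}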
- */ -int __sched schedule_hrtimeout(ktime_t *expires, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range(expires, 0, mode); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 00629e658ca1..7c6110e964e7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -151,7 +151,26 @@ COMPAT_SYSCALL_DEFINE2(getitimer, int, which, #endif /* - * The timer is automagically restarted, when interval != 0 + * Invoked from dequeue_signal() when SIG_ALRM is delivered. + * + * Restart the ITIMER_REAL timer if it is armed as periodic timer. Doing + * this in the signal delivery path instead of self rearming prevents a DoS + * with small increments in the high reolution timer case and reduces timer + * noise in general. + */ +void posixtimer_rearm_itimer(struct task_struct *tsk) +{ + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } +} + +/* + * Interval timers are restarted in the signal delivery path. See + * posixtimer_rearm_itimer(). */ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..d31a6d40d38d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +98,129 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } + +#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ) +#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ) + +static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ) +static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ) +static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t) +static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs) + +static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz, + sysctl_kern_to_user_int_conv_hz, false) +static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies, + sysctl_user_to_kern_int_conv_userhz, + sysctl_kern_to_user_int_conv_userhz, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax, + sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, true) + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. 
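Returning briefly to the itimer.c hunk above: posixtimer_rearm_itimer() moves periodic ITIMER_REAL re-arming into SIGALRM delivery, but the userspace contract is unchanged. A hedged sketch of the interface it serves, with each signal delivery implicitly re-arming the next period:

#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alrm(int sig) { (void)sig; }

int main(void)
{
        struct itimerval it = {
                .it_value    = { .tv_sec = 1 },  /* first expiry after 1 s */
                .it_interval = { .tv_sec = 1 },  /* then every second */
        };

        signal(SIGALRM, on_alrm);
        setitimer(ITIMER_REAL, &it, NULL);

        for (int i = 0; i < 3; i++)
                pause();        /* each SIGALRM delivery re-arms the timer */
        return 0;
}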
+ */ +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_jiffies); + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) + return -EINVAL; + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_userhz_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies_minmax); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
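A hedged sketch of how a subsystem consumes the jiffies-converting handlers above: the kernel-side value stays in jiffies while userspace reads and writes whole seconds. All names below are made up for illustration:

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>

static int example_timeout = 5 * HZ;    /* stored in jiffies, shown as "5" */

static struct ctl_table example_table[] = {
        {
                .procname       = "example_timeout",
                .data           = &example_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
};

static int __init example_sysctl_init(void)
{
        return register_sysctl("kernel", example_table) ? 0 : -ENOMEM;
}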
+ */ +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, + HZ, 1000l); +} +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -165,26 +164,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. 
*/ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +218,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +235,16 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } out: mutex_unlock(&offset_lock); @@ -246,16 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. 
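The vdso_clock offsets installed above are what make CLOCK_MONOTONIC and CLOCK_BOOTTIME appear shifted inside a time namespace. A hedged userspace sketch (error handling trimmed; CLONE_NEWTIME may need a recent libc, hence the fallback define; the offset must be written before any task starts using the namespace):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080
#endif

int main(void)
{
        /* The new time namespace applies to children, not to this task. */
        unshare(CLONE_NEWTIME);

        FILE *f = fopen("/proc/self/timens_offsets", "w");
        fprintf(f, "monotonic 3600 0\n");       /* shift CLOCK_MONOTONIC by +1h */
        fclose(f);

        if (fork() == 0) {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);    /* ~3600 s ahead of parent */
                printf("child monotonic: %lld s\n", (long long)ts.tv_sec);
                _exit(0);
        }
        wait(NULL);
        return 0;
}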
*/ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -459,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -469,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -477,9 +478,12 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, - .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,26 +18,86 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/audit.h> +#include <linux/timekeeper_internal.h> #include "ntp_internal.h" #include "timekeeping_internal.h" - -/* - * NTP timekeeping variables: +/** + * struct ntp_data - Structure holding all NTP related state + * @tick_usec: USER_HZ period in microseconds + * @tick_length: Adjusted tick length + * @tick_length_base: Base value for @tick_length + * @time_state: State of the clock synchronization + * @time_status: Clock status bits + * @time_offset: Time adjustment in nanoseconds + * @time_constant: PLL time constant + * @time_maxerror: Maximum error in microseconds holding the NTP sync distance + * (NTP dispersion + delay / 2) + * @time_esterror: Estimated error in microseconds holding NTP dispersion + * @time_freq: Frequency offset scaled nsecs/secs + * @time_reftime: Time at last adjustment in seconds + * @time_adjust: Adjustment value + * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled) + * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap * - * Note: All of the NTP state is protected by the timekeeping locks. + * @pps_valid: PPS signal watchdog counter + * @pps_tf: PPS phase median filter + * @pps_jitter: PPS current jitter in nanoseconds + * @pps_fbase: PPS beginning of the last freq interval + * @pps_shift: PPS current interval duration in seconds (shift value) + * @pps_intcnt: PPS interval counter + * @pps_freq: PPS frequency offset in scaled ns/s + * @pps_stabil: PPS current stability in scaled ns/s + * @pps_calcnt: PPS monitor: calibration intervals + * @pps_jitcnt: PPS monitor: jitter limit exceeded + * @pps_stbcnt: PPS monitor: stability limit exceeded + * @pps_errcnt: PPS monitor: calibration errors + * + * Protected by the timekeeping locks. 
*/ +struct ntp_data { + unsigned long tick_usec; + u64 tick_length; + u64 tick_length_base; + int time_state; + int time_status; + s64 time_offset; + long time_constant; + long time_maxerror; + long time_esterror; + s64 time_freq; + time64_t time_reftime; + long time_adjust; + s64 ntp_tick_adj; + time64_t ntp_next_leap_sec; +#ifdef CONFIG_NTP_PPS + int pps_valid; + long pps_tf[3]; + long pps_jitter; + struct timespec64 pps_fbase; + int pps_shift; + int pps_intcnt; + s64 pps_freq; + long pps_stabil; + long pps_calcnt; + long pps_jitcnt; + long pps_stbcnt; + long pps_errcnt; +#endif +}; - -/* USER_HZ period (usecs): */ -unsigned long tick_usec = USER_TICK_USEC; - -/* SHIFTED_HZ period (nsecs): */ -unsigned long tick_nsec; - -static u64 tick_length; -static u64 tick_length_base; +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, +}; #define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ @@ -45,46 +105,6 @@ static u64 tick_length_base; (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -/* - * phase-lock loop variables - */ - -/* - * clock synchronization status - * - * (TIME_ERROR prevents overwriting the CMOS clock) - */ -static int time_state = TIME_OK; - -/* clock status bits: */ -static int time_status = STA_UNSYNC; - -/* time adjustment (nsecs): */ -static s64 time_offset; - -/* pll time constant: */ -static long time_constant = 2; - -/* maximum error (usecs): */ -static long time_maxerror = NTP_PHASE_LIMIT; - -/* estimated error (usecs): */ -static long time_esterror = NTP_PHASE_LIMIT; - -/* frequency offset (scaled nsecs/secs): */ -static s64 time_freq; - -/* time at last adjustment (secs): */ -static time64_t time_reftime; - -static long time_adjust; - -/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ -static s64 ntp_tick_adj; - -/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ -static time64_t ntp_next_leap_sec = TIME64_MAX; - #ifdef CONFIG_NTP_PPS /* @@ -101,128 +121,115 @@ static time64_t ntp_next_leap_sec = TIME64_MAX; intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static int pps_valid; /* signal watchdog counter */ -static long pps_tf[3]; /* phase median filter */ -static long pps_jitter; /* current jitter (ns) */ -static struct timespec64 pps_fbase; /* beginning of the last freq interval */ -static int pps_shift; /* current interval duration (s) (shift) */ -static int pps_intcnt; /* interval counter */ -static s64 pps_freq; /* frequency offset (scaled ns/s) */ -static long pps_stabil; /* current stability (scaled ns/s) */ - /* - * PPS signal quality monitors - */ -static long pps_calcnt; /* calibration intervals */ -static long pps_jitcnt; /* jitter limit exceeded */ -static long pps_stbcnt; /* stability limit exceeded */ -static long pps_errcnt; /* calibration errors */ - - -/* PPS kernel consumer compensates the whole phase error immediately. + * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. 
*/ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL) return offset; else - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) { - /* the PPS calibration interval may end - surprisingly early */ - pps_shift = PPS_INTMIN; - pps_intcnt = 0; + /* The PPS calibration interval may end surprisingly early */ + ntpdata->pps_shift = PPS_INTMIN; + ntpdata->pps_intcnt = 0; } /** * pps_clear - Clears the PPS state variables + * @ntpdata: Pointer to ntp data */ -static inline void pps_clear(void) +static inline void pps_clear(struct ntp_data *ntpdata) { - pps_reset_freq_interval(); - pps_tf[0] = 0; - pps_tf[1] = 0; - pps_tf[2] = 0; - pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; - pps_freq = 0; + pps_reset_freq_interval(ntpdata); + ntpdata->pps_tf[0] = 0; + ntpdata->pps_tf[1] = 0; + ntpdata->pps_tf[2] = 0; + ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; + ntpdata->pps_freq = 0; } -/* Decrease pps_valid to indicate that another second has passed since - * the last PPS signal. When it reaches 0, indicate that PPS signal is - * missing. +/* + * Decrease pps_valid to indicate that another second has passed since the + * last PPS signal. When it reaches 0, indicate that PPS signal is missing. */ -static inline void pps_dec_valid(void) +static inline void pps_dec_valid(struct ntp_data *ntpdata) { - if (pps_valid > 0) - pps_valid--; - else { - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - pps_clear(); + if (ntpdata->pps_valid > 0) { + ntpdata->pps_valid--; + } else { + ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(ntpdata); } } -static inline void pps_set_freq(s64 freq) +static inline void pps_set_freq(struct ntp_data *ntpdata) { - pps_freq = freq; + ntpdata->pps_freq = ntpdata->time_freq; } -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) - /* PPS signal lost when either PPS time or - * PPS frequency synchronization requested + /* + * PPS signal lost when either PPS time or PPS frequency + * synchronization requested */ || ((status & (STA_PPSFREQ|STA_PPSTIME)) && !(status & STA_PPSSIGNAL)) - /* PPS jitter exceeded when - * PPS time synchronization requested */ + /* + * PPS jitter exceeded when PPS time synchronization + * requested + */ || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) - /* PPS wander exceeded or calibration error when - * PPS frequency synchronization requested + /* + * PPS wander exceeded or calibration error when PPS + * frequency synchronization requested */ || ((status & STA_PPSFREQ) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { - txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + txc->ppsfreq = shift_right((ntpdata->pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->jitter = pps_jitter; - if (!(time_status & STA_NANO)) - txc->jitter = pps_jitter / NSEC_PER_USEC; - txc->shift 
= pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + txc->jitter = ntpdata->pps_jitter; + if (!(ntpdata->time_status & STA_NANO)) + txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; + txc->shift = ntpdata->pps_shift; + txc->stabil = ntpdata->pps_stabil; + txc->jitcnt = ntpdata->pps_jitcnt; + txc->calcnt = ntpdata->pps_calcnt; + txc->errcnt = ntpdata->pps_errcnt; + txc->stbcnt = ntpdata->pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) {} -static inline void pps_clear(void) {} -static inline void pps_dec_valid(void) {} -static inline void pps_set_freq(s64 freq) {} +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} +static inline void pps_clear(struct ntp_data *ntpdata) {} +static inline void pps_dec_valid(struct ntp_data *ntpdata) {} +static inline void pps_set_freq(struct ntp_data *ntpdata) {} -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -237,158 +244,149 @@ static inline void pps_fill_timex(struct __kernel_timex *txc) #endif /* CONFIG_NTP_PPS */ - -/** - * ntp_synced - Returns 1 if the NTP status is not UNSYNC - * - */ -static inline int ntp_synced(void) -{ - return !(time_status & STA_UNSYNC); -} - - -/* - * NTP methods: - */ - /* - * Update (tick_length, tick_length_base, tick_nsec), based - * on (tick_usec, ntp_tick_adj, time_freq): + * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and + * time_freq: */ -static void ntp_update_frequency(void) +static void ntp_update_frequency(struct ntp_data *ntpdata) { - u64 second_length; - u64 new_base; + u64 second_length, new_base, tick_usec = (u64)ntpdata->tick_usec; - second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; - second_length += ntp_tick_adj; - second_length += time_freq; + second_length += ntpdata->ntp_tick_adj; + second_length += ntpdata->time_freq; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* - * Don't wait for the next second_overflow, apply - * the change to the tick length immediately: + * Don't wait for the next second_overflow, apply the change to the + * tick length immediately: */ - tick_length += new_base - tick_length_base; - tick_length_base = new_base; + ntpdata->tick_length += new_base - ntpdata->tick_length_base; + ntpdata->tick_length_base = new_base; } -static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +static inline s64 ntp_update_offset_fll(struct ntp_data *ntpdata, s64 offset64, long secs) { - time_status &= ~STA_MODE; + ntpdata->time_status &= ~STA_MODE; if (secs < MINSEC) return 0; - if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + if (!(ntpdata->time_status & STA_FLL) && (secs <= MAXSEC)) return 0; - time_status |= STA_MODE; + ntpdata->time_status |= STA_MODE; 
return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } -static void ntp_update_offset(long offset) +static void ntp_update_offset(struct ntp_data *ntpdata, long offset) { - s64 freq_adj; - s64 offset64; - long secs; + s64 freq_adj, offset64; + long secs, real_secs; - if (!(time_status & STA_PLL)) + if (!(ntpdata->time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) { + if (!(ntpdata->time_status & STA_NANO)) { /* Make sure the multiplication below won't overflow */ offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; } - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ + /* Scale the phase adjustment and clamp to the operating range. */ offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = (long)(__ktime_get_real_seconds() - time_reftime); - if (unlikely(time_status & STA_FREQHOLD)) + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); + secs = (long)(real_secs - ntpdata->time_reftime); + if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; - time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = real_secs; offset64 = offset; - freq_adj = ntp_update_offset_fll(offset64, secs); + freq_adj = ntp_update_offset_fll(ntpdata, offset64, secs); /* * Clamp update interval to reduce PLL gain with low * sampling rate (e.g. intermittent network connection) * to avoid instability. */ - if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) - secs = 1 << (SHIFT_PLL + 1 + time_constant); + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + ntpdata->time_constant))) + secs = 1 << (SHIFT_PLL + 1 + ntpdata->time_constant); freq_adj += (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + ntpdata->time_constant)); - freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + freq_adj = min(freq_adj + ntpdata->time_freq, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); + ntpdata->time_freq = max(freq_adj, -MAXFREQ_SCALED); - time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + ntpdata->time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } -/** - * ntp_clear - Clears the NTP state variables - */ -void ntp_clear(void) +static void __ntp_clear(struct ntp_data *ntpdata) { - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + /* Stop active adjtime() */ + ntpdata->time_adjust = 0; + ntpdata->time_status |= STA_UNSYNC; + ntpdata->time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_esterror = NTP_PHASE_LIMIT; - ntp_update_frequency(); + ntp_update_frequency(ntpdata); - tick_length = tick_length_base; - time_offset = 0; + ntpdata->tick_length = ntpdata->tick_length_base; + ntpdata->time_offset = 0; - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ - pps_clear(); + pps_clear(ntpdata); +} + +/** + * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member + */ +void ntp_clear(unsigned int tkid) +{ + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next 
leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. */ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { - ktime_t ret; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; - if ((time_state == TIME_INS) && (time_status & STA_INS)) - return ktime_set(ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) + return ktime_set(ntpdata->ntp_next_leap_sec, 0); + + return KTIME_MAX; } /* - * this routine handles the overflow of the microsecond field + * This routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. @@ -397,8 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -408,87 +407,84 @@ int second_overflow(time64_t secs) * day, the system clock is set back one second; if in leap-delete * state, the system clock is set ahead one second. */ - switch (time_state) { + switch (ntpdata->time_state) { case TIME_OK: - if (time_status & STA_INS) { - time_state = TIME_INS; + if (ntpdata->time_status & STA_INS) { + ntpdata->time_state = TIME_INS; div_s64_rem(secs, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } else if (ntpdata->time_status & STA_DEL) { + ntpdata->time_state = TIME_DEL; div_s64_rem(secs + 1, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: - if (!(time_status & STA_INS)) { - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + if (!(ntpdata->time_status & STA_INS)) { + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_OK; + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = -1; - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); + ntpdata->time_state = TIME_OOP; + pr_notice("Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: - if (!(time_status & STA_DEL)) { - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + if (!(ntpdata->time_status & STA_DEL)) { + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_OK; + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = 1; - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_WAIT; + pr_notice("Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_WAIT; break; case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; + if 
(!(ntpdata->time_status & (STA_INS | STA_DEL))) + ntpdata->time_state = TIME_OK; break; } - /* Bump the maxerror field */ - time_maxerror += MAXFREQ / NSEC_PER_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; + ntpdata->time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (ntpdata->time_maxerror > NTP_PHASE_LIMIT) { + ntpdata->time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_status |= STA_UNSYNC; } /* Compute the phase adjustment for the next second */ - tick_length = tick_length_base; + ntpdata->tick_length = ntpdata->tick_length_base; - delta = ntp_offset_chunk(time_offset); - time_offset -= delta; - tick_length += delta; + delta = ntp_offset_chunk(ntpdata, ntpdata->time_offset); + ntpdata->time_offset -= delta; + ntpdata->tick_length += delta; /* Check PPS signal */ - pps_dec_valid(); + pps_dec_valid(ntpdata); - if (!time_adjust) + if (!ntpdata->time_adjust) goto out; - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; + if (ntpdata->time_adjust > MAX_TICKADJ) { + ntpdata->time_adjust -= MAX_TICKADJ; + ntpdata->tick_length += MAX_TICKADJ_SCALED; goto out; } - if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; + if (ntpdata->time_adjust < -MAX_TICKADJ) { + ntpdata->time_adjust += MAX_TICKADJ; + ntpdata->tick_length -= MAX_TICKADJ_SCALED; goto out; } - tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) - << NTP_SCALE_SHIFT; - time_adjust = 0; + ntpdata->tick_length += (s64)(ntpdata->time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + ntpdata->time_adjust = 0; out: return leap; @@ -611,6 +607,15 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns } #endif +/** + * ntp_synced - Tells whether the NTP status is not UNSYNC + * Returns: true if not UNSYNC, false otherwise + */ +static inline bool ntp_synced(void) +{ + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); +} + /* * If we have an externally synchronized Linux clock, then update RTC clock * accordingly every ~11 minutes. Generally RTCs can only store second @@ -660,9 +665,17 @@ rearm: sched_sync_hw_clock(offset_nsec, res != 0); } -void ntp_notify_cmos_timer(void) +void ntp_notify_cmos_timer(bool offset_set) { /* + * If the time jumped (using ADJ_SETOFFSET) cancels sync timer, + * which may have been running if the time was synchronized + * prior to the ADJ_SETOFFSET call. + */ + if (offset_set) + hrtimer_cancel(&sync_hrtimer); + + /* * When the work is currently executed but has not yet the timer * rearmed this queues the work immediately again. No big issue, * just a pointless work scheduled. 
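The TIME_OK -> TIME_INS -> TIME_OOP transitions handled by second_overflow() above are driven from userspace through the status bits. A hedged sketch of arming a leap-second insertion for the next UTC midnight (requires CAP_SYS_TIME; a real NTP daemon would read-modify-write the status bits rather than overwrite them as done here):

#include <sys/timex.h>

static int arm_leap_insert(void)
{
        struct timex tx = {
                .modes  = ADJ_STATUS,
                .status = STA_INS,      /* cleared again after the leap second */
        };

        /* Later adjtimex() calls report TIME_INS once the state machine advances. */
        return adjtimex(&tx);
}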
@@ -673,8 +686,7 @@ void ntp_notify_cmos_timer(void) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } @@ -683,163 +695,156 @@ static inline void __init ntp_init_cmos_sync(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct __kernel_timex *txc) +static inline void process_adj_status(struct ntp_data *ntpdata, const struct __kernel_timex *txc) { - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - ntp_next_leap_sec = TIME64_MAX; - /* restart PPS frequency calibration */ - pps_reset_freq_interval(); + if ((ntpdata->time_status & STA_PLL) && !(txc->status & STA_PLL)) { + ntpdata->time_state = TIME_OK; + ntpdata->time_status = STA_UNSYNC; + ntpdata->ntp_next_leap_sec = TIME64_MAX; + /* Restart PPS frequency calibration */ + pps_reset_freq_interval(ntpdata); } /* * If we turn on PLL adjustments then reset the * reference time to current time. */ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = __ktime_get_real_seconds(); + if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ - time_status &= STA_RONLY; - time_status |= txc->status & ~STA_RONLY; + ntpdata->time_status &= STA_RONLY; + ntpdata->time_status |= txc->status & ~STA_RONLY; } - -static inline void process_adjtimex_modes(const struct __kernel_timex *txc, +static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct __kernel_timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) - process_adj_status(txc); + process_adj_status(ntpdata, txc); if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; + ntpdata->time_status |= STA_NANO; if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; + ntpdata->time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - /* update pps_freq */ - pps_set_freq(time_freq); + ntpdata->time_freq = txc->freq * PPM_SCALE; + ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED); + ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED); + /* Update pps_freq */ + pps_set_freq(ntpdata); } if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; + ntpdata->time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; + ntpdata->time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); + ntpdata->time_constant = clamp(txc->constant, 0, MAXTC); + if (!(ntpdata->time_status & STA_NANO)) + ntpdata->time_constant += 4; + ntpdata->time_constant = clamp(ntpdata->time_constant, 0, MAXTC); } - if (txc->modes & ADJ_TAI && - txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) + if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes 
& ADJ_OFFSET) - ntp_update_offset(txc->offset); + ntp_update_offset(ntpdata, txc->offset); if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; + ntpdata->tick_usec = txc->tick; if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); + ntp_update_frequency(ntpdata); } - /* - * adjtimex mainly allows reading (and writing, if superuser) of + * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { - long save_adjust = time_adjust; + long save_adjust = ntpdata->time_adjust; if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - ntp_update_frequency(); + ntpdata->time_adjust = txc->offset; + ntp_update_frequency(ntpdata); audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); - audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, ntpdata->time_adjust); } txc->offset = save_adjust; } else { /* If there are input parameters, then process them: */ if (txc->modes) { - audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); - audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); - process_adjtimex_modes(txc, time_tai); + process_adjtimex_modes(ntpdata, txc, time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); - audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); } - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset = (u32)txc->offset / NSEC_PER_USEC; + txc->offset = shift_right(ntpdata->time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); + if (!(ntpdata->time_status & STA_NANO)) + txc->offset = div_s64(txc->offset, NSEC_PER_USEC); } - result = time_state; /* mostly `TIME_OK' */ - /* check for errors */ - if (is_error_status(time_status)) + result = ntpdata->time_state; + if (is_error_status(ntpdata->time_status)) result = TIME_ERROR; - txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + txc->freq = shift_right((ntpdata->time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; + txc->maxerror = ntpdata->time_maxerror; + txc->esterror = ntpdata->time_esterror; + txc->status = ntpdata->time_status; + txc->constant = 
ntpdata->time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; - txc->tick = tick_usec; + txc->tick = ntpdata->tick_usec; txc->tai = *time_tai; - /* fill PPS status fields */ - pps_fill_timex(txc); + /* Fill PPS status fields */ + pps_fill_timex(ntpdata, txc); txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; - if (!(time_status & STA_NANO)) + if (!(ntpdata->time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ - if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { - if ((time_state == TIME_INS) && (time_status & STA_INS)) { + if (unlikely(ts->tv_sec >= ntpdata->ntp_next_leap_sec)) { + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) { result = TIME_OOP; txc->tai++; txc->time.tv_sec--; } - if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + if ((ntpdata->time_state == TIME_DEL) && (ntpdata->time_status & STA_DEL)) { result = TIME_WAIT; txc->tai--; txc->time.tv_sec++; } - if ((time_state == TIME_OOP) && - (ts->tv_sec == ntp_next_leap_sec)) { + if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntpdata->ntp_next_leap_sec)) result = TIME_WAIT; - } } return result; @@ -847,17 +852,21 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, #ifdef CONFIG_NTP_PPS -/* actually struct pps_normtime is good old struct timespec, but it is +/* + * struct pps_normtime is basically a struct timespec, but it is * semantically different (and it is the reason why it was invented): * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] - * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) + */ struct pps_normtime { s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; -/* normalize the timestamp so that nsec is in the - ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +/* + * Normalize the timestamp so that nsec is in the + * [ -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval + */ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { @@ -873,54 +882,57 @@ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) return norm; } -/* get current phase correction and jitter */ -static inline long pps_phase_filter_get(long *jitter) +/* Get current phase correction and jitter */ +static inline long pps_phase_filter_get(struct ntp_data *ntpdata, long *jitter) { - *jitter = pps_tf[0] - pps_tf[1]; + *jitter = ntpdata->pps_tf[0] - ntpdata->pps_tf[1]; if (*jitter < 0) *jitter = -*jitter; /* TODO: test various filters */ - return pps_tf[0]; + return ntpdata->pps_tf[0]; } -/* add the sample to the phase filter */ -static inline void pps_phase_filter_add(long err) +/* Add the sample to the phase filter */ +static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err) { - pps_tf[2] = pps_tf[1]; - pps_tf[1] = pps_tf[0]; - pps_tf[0] = err; + ntpdata->pps_tf[2] = ntpdata->pps_tf[1]; + ntpdata->pps_tf[1] = ntpdata->pps_tf[0]; + ntpdata->pps_tf[0] = err; } -/* decrease frequency calibration interval length. - * It is halved after four consecutive unstable intervals. +/* + * Decrease frequency calibration interval length. It is halved after four + * consecutive unstable intervals. 
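Reading back the state that the ntp_adjtimex() path above maintains is a plain adjtimex(2) call with modes == 0. A hedged sketch; freq is reported in scaled ppm with a 16-bit fraction, hence the shift:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* read-only query */
        int state = adjtimex(&tx);

        printf("state=%d offset=%lld freq=%lld ppm maxerror=%lld us\n",
               state, (long long)tx.offset, (long long)(tx.freq >> 16),
               (long long)tx.maxerror);

        return state == TIME_ERROR;
}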
*/ -static inline void pps_dec_freq_interval(void) +static inline void pps_dec_freq_interval(struct ntp_data *ntpdata) { - if (--pps_intcnt <= -PPS_INTCOUNT) { - pps_intcnt = -PPS_INTCOUNT; - if (pps_shift > PPS_INTMIN) { - pps_shift--; - pps_intcnt = 0; + if (--ntpdata->pps_intcnt <= -PPS_INTCOUNT) { + ntpdata->pps_intcnt = -PPS_INTCOUNT; + if (ntpdata->pps_shift > PPS_INTMIN) { + ntpdata->pps_shift--; + ntpdata->pps_intcnt = 0; } } } -/* increase frequency calibration interval length. - * It is doubled after four consecutive stable intervals. +/* + * Increase frequency calibration interval length. It is doubled after + * four consecutive stable intervals. */ -static inline void pps_inc_freq_interval(void) +static inline void pps_inc_freq_interval(struct ntp_data *ntpdata) { - if (++pps_intcnt >= PPS_INTCOUNT) { - pps_intcnt = PPS_INTCOUNT; - if (pps_shift < PPS_INTMAX) { - pps_shift++; - pps_intcnt = 0; + if (++ntpdata->pps_intcnt >= PPS_INTCOUNT) { + ntpdata->pps_intcnt = PPS_INTCOUNT; + if (ntpdata->pps_shift < PPS_INTMAX) { + ntpdata->pps_shift++; + ntpdata->pps_intcnt = 0; } } } -/* update clock frequency based on MONOTONIC_RAW clock PPS signal +/* + * Update clock frequency based on MONOTONIC_RAW clock PPS signal * timestamps * * At the end of the calibration interval the difference between the @@ -929,90 +941,88 @@ static inline void pps_inc_freq_interval(void) * too long, the data are discarded. * Returns the difference between old and new frequency values. */ -static long hardpps_update_freq(struct pps_normtime freq_norm) +static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime freq_norm) { long delta, delta_mod; s64 ftemp; - /* check if the frequency interval was too long */ - if (freq_norm.sec > (2 << pps_shift)) { - time_status |= STA_PPSERROR; - pps_errcnt++; - pps_dec_freq_interval(); - printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %lld s\n", - freq_norm.sec); + /* Check if the frequency interval was too long */ + if (freq_norm.sec > (2 << ntpdata->pps_shift)) { + ntpdata->time_status |= STA_PPSERROR; + ntpdata->pps_errcnt++; + pps_dec_freq_interval(ntpdata); + printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", + freq_norm.sec); return 0; } - /* here the raw frequency offset and wander (stability) is - * calculated. If the wander is less than the wander threshold - * the interval is increased; otherwise it is decreased. + /* + * Here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold the + * interval is increased; otherwise it is decreased. 
*/ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; + delta = shift_right(ftemp - ntpdata->pps_freq, NTP_SCALE_SHIFT); + ntpdata->pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - printk_deferred(KERN_WARNING - "hardpps: PPSWANDER: change=%ld\n", delta); - time_status |= STA_PPSWANDER; - pps_stbcnt++; - pps_dec_freq_interval(); - } else { /* good sample */ - pps_inc_freq_interval(); + printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); + ntpdata->time_status |= STA_PPSWANDER; + ntpdata->pps_stbcnt++; + pps_dec_freq_interval(ntpdata); + } else { + /* Good sample */ + pps_inc_freq_interval(ntpdata); } - /* the stability metric is calculated as the average of recent - * frequency changes, but is used only for performance - * monitoring + /* + * The stability metric is calculated as the average of recent + * frequency changes, but is used only for performance monitoring */ delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << - (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; - - /* if enabled, the system clock frequency is updated */ - if ((time_status & STA_PPSFREQ) != 0 && - (time_status & STA_FREQHOLD) == 0) { - time_freq = pps_freq; - ntp_update_frequency(); + ntpdata->pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - ntpdata->pps_stabil) >> PPS_INTMIN; + + /* If enabled, the system clock frequency is updated */ + if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { + ntpdata->time_freq = ntpdata->pps_freq; + ntp_update_frequency(ntpdata); } return delta; } -/* correct REALTIME clock phase error against PPS signal */ -static void hardpps_update_phase(long error) +/* Correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(struct ntp_data *ntpdata, long error) { long correction = -error; long jitter; - /* add the sample to the median filter */ - pps_phase_filter_add(correction); - correction = pps_phase_filter_get(&jitter); + /* Add the sample to the median filter */ + pps_phase_filter_add(ntpdata, correction); + correction = pps_phase_filter_get(ntpdata, &jitter); - /* Nominal jitter is due to PPS signal noise. If it exceeds the + /* + * Nominal jitter is due to PPS signal noise. If it exceeds the * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. 
*/ - if (jitter > (pps_jitter << PPS_POPCORN)) { - printk_deferred(KERN_WARNING - "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); - time_status |= STA_PPSJITTER; - pps_jitcnt++; - } else if (time_status & STA_PPSTIME) { - /* correct the time using the phase offset */ - time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, - NTP_INTERVAL_FREQ); - /* cancel running adjtime() */ - time_adjust = 0; + if (jitter > (ntpdata->pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (ntpdata->pps_jitter << PPS_POPCORN)); + ntpdata->time_status |= STA_PPSJITTER; + ntpdata->pps_jitcnt++; + } else if (ntpdata->time_status & STA_PPSTIME) { + /* Correct the time using the phase offset */ + ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* Cancel running adjtime() */ + ntpdata->time_adjust = 0; } - /* update jitter */ - pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; + /* Update jitter */ + ntpdata->pps_jitter += (jitter - ntpdata->pps_jitter) >> PPS_INTMIN; } /* @@ -1029,68 +1039,70 @@ static void hardpps_update_phase(long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; pts_norm = pps_normalize_ts(*phase_ts); - /* clear the error bits, they will be set again if needed */ - time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + /* Clear the error bits, they will be set again if needed */ + ntpdata->time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); /* indicate signal presence */ - time_status |= STA_PPSSIGNAL; - pps_valid = PPS_VALID; + ntpdata->time_status |= STA_PPSSIGNAL; + ntpdata->pps_valid = PPS_VALID; - /* when called for the first time, - * just start the frequency interval */ - if (unlikely(pps_fbase.tv_sec == 0)) { - pps_fbase = *raw_ts; + /* + * When called for the first time, just start the frequency + * interval + */ + if (unlikely(ntpdata->pps_fbase.tv_sec == 0)) { + ntpdata->pps_fbase = *raw_ts; return; } - /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); - - /* check that the signal is in the range - * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ - if ((freq_norm.sec == 0) || - (freq_norm.nsec > MAXFREQ * freq_norm.sec) || - (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { - time_status |= STA_PPSJITTER; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; + /* Ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, ntpdata->pps_fbase)); + + /* + * Check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it + */ + if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + ntpdata->time_status |= STA_PPSJITTER; + /* Restart the frequency calibration interval */ + ntpdata->pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } - /* signal is ok */ - - /* check if the current frequency interval is finished */ - if (freq_norm.sec >= (1 << pps_shift)) { - pps_calcnt++; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; - hardpps_update_freq(freq_norm); + /* Signal is ok. 
Check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { + ntpdata->pps_calcnt++; + /* Restart the frequency calibration interval */ + ntpdata->pps_fbase = *raw_ts; + hardpps_update_freq(ntpdata, freq_norm); } - hardpps_update_phase(pts_norm.nsec); + hardpps_update_phase(ntpdata, pts_norm.nsec); } #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 23d1b74c3065..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,20 +3,19 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -extern void ntp_notify_cmos_timer(void); +extern void ntp_notify_cmos_timer(bool offset_set); #else -static inline void ntp_notify_cmos_timer(void) { } +static inline void ntp_notify_cmos_timer(bool offset_set) { } #endif #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 77c0c2370b6d..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,8 @@ */ static struct posix_clock *get_posix_clock(struct file *fp) { - struct posix_clock *clk = fp->private_data; + struct posix_clock_context *pccontext = fp->private_data; + struct posix_clock *clk = pccontext->clk; down_read(&clk->rwsem); @@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk) static ssize_t posix_clock_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -EINVAL; @@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, return -ENODEV; if (clk->ops.read) - err = clk->ops.read(clk, fp->f_flags, buf, count); + err = clk->ops.read(pccontext, fp->f_flags, buf, count); put_posix_clock(clk); @@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); __poll_t result = 0; @@ -62,7 +65,7 @@ static __poll_t 
posix_clock_poll(struct file *fp, poll_table *wait) return EPOLLERR; if (clk->ops.poll) - result = clk->ops.poll(clk, fp, wait); + result = clk->ops.poll(pccontext, fp, wait); put_posix_clock(clk); @@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; @@ -79,37 +83,19 @@ static long posix_clock_ioctl(struct file *fp, return -ENODEV; if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); + err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); + struct posix_clock_context *pccontext; down_read(&clk->rwsem); @@ -117,15 +103,24 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = -ENODEV; goto out; } - if (clk->ops.open) - err = clk->ops.open(clk, fp->f_mode); - else - err = 0; - - if (!err) { - get_device(clk->dev); - fp->private_data = clk; + pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL); + if (!pccontext) { + err = -ENOMEM; + goto out; + } + pccontext->clk = clk; + pccontext->fp = fp; + if (clk->ops.open) { + err = clk->ops.open(pccontext, fp->f_mode); + if (err) { + kfree(pccontext); + goto out; + } } + + fp->private_data = pccontext; + get_device(clk->dev); + err = 0; out: up_read(&clk->rwsem); return err; @@ -133,14 +128,20 @@ out: static int posix_clock_release(struct inode *inode, struct file *fp) { - struct posix_clock *clk = fp->private_data; + struct posix_clock_context *pccontext = fp->private_data; + struct posix_clock *clk; int err = 0; + if (!pccontext) + return -ENODEV; + clk = pccontext->clk; + if (clk->ops.release) - err = clk->ops.release(clk); + err = clk->ops.release(pccontext); put_device(clk->dev); + kfree(pccontext); fp->private_data = NULL; return err; @@ -148,15 +149,12 @@ static int posix_clock_release(struct inode *inode, struct file *fp) static const struct file_operations posix_clock_file_operations = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -232,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } @@ -290,6 +288,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) struct posix_clock_desc cd; int err; + if (!timespec64_valid_strict(ts)) + return -EINVAL; + err = get_clock_desc(id, &cd); if (err) return err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb925e8ef9a8..0de2bb7cbec0 100644 --- 
a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, */ static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - u64 curr_cputime; -retry: - curr_cputime = atomic64_read(cputime); - if (sum_cputime > curr_cputime) { - if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) - goto retry; - } + u64 curr_cputime = atomic64_read(cputime); + + do { + if (sum_cputime <= curr_cputime) + return; + } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime)); } static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, @@ -494,19 +493,28 @@ static int posix_cpu_timer_del(struct k_itimer *timer) */ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { - if (timer->it.cpu.firing) + if (timer->it.cpu.firing) { + /* + * Prevent signal delivery. The timer cannot be dequeued + * because it is on the firing list which is not protected + * by sighand->lock. The delivery path is waiting for + * the timer lock. So go back, unlock and retry. + */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; - else + } else { disarm_timer(timer, p); - + } unlock_task_sighand(p, &flags); } out: rcu_read_unlock(); - if (!ret) - put_pid(ctmr->pid); + if (!ret) { + put_pid(ctmr->pid); + timer->it_status = POSIX_TIMER_DISARMED; + } return ret; } @@ -560,6 +568,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); + timer->it_status = POSIX_TIMER_ARMED; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -585,36 +594,25 @@ static void cpu_timer_fire(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - /* - * User don't want any signal. - */ - cpu_timer_setexpires(ctmr, 0); - } else if (unlikely(timer->sigq == NULL)) { + timer->it_status = POSIX_TIMER_DISARMED; + + if (unlikely(ctmr->nanosleep)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); cpu_timer_setexpires(ctmr, 0); - } else if (!timer->it_interval) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ - posix_timer_event(timer, 0); - cpu_timer_setexpires(ctmr, 0); - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. - */ - posix_cpu_timer_rearm(timer); - ++timer->it_requeue_pending; + } else { + posix_timer_queue_signal(timer); + /* Disable oneshot timers */ + if (!timer->it_interval) + cpu_timer_setexpires(ctmr, 0); } } +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now); + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. 
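For reference, the __update_gt_cputime() hunk above replaces an open-coded cmpxchg retry loop with atomic64_try_cmpxchg(), which refreshes the expected value when the exchange fails. Below is a minimal userspace sketch of the same "only ever move the value forward" pattern, written with C11 atomics instead of the kernel's atomic64_t helpers; the names are illustrative, not kernel API.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Monotonic-maximum update: never let *val move backwards. */
static void update_gt(_Atomic uint64_t *val, uint64_t sum)
{
	uint64_t cur = atomic_load(val);

	do {
		if (sum <= cur)
			return;	/* already up to date */
		/* On failure, compare_exchange reloads 'cur' for the retry */
	} while (!atomic_compare_exchange_weak(val, &cur, sum));
}

int main(void)
{
	_Atomic uint64_t t = 100;

	update_gt(&t, 50);	/* ignored, would move the value backwards */
	update_gt(&t, 200);	/* taken */
	printf("%llu\n", (unsigned long long)atomic_load(&t));	/* prints 200 */
	return 0;
}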
@@ -624,9 +622,10 @@ static void cpu_timer_fire(struct k_itimer *timer) static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; + u64 old_expires, new_expires, now; struct sighand_struct *sighand; struct task_struct *p; unsigned long flags; @@ -663,168 +662,136 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, return -ESRCH; } - /* - * Disarm any old timer after extracting its expiry time. - */ - old_incr = timer->it_interval; + /* Retrieve the current expiry time before disarming the timer */ old_expires = cpu_timer_getexpires(ctmr); if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; + /* + * Prevent signal delivery. The timer cannot be dequeued + * because it is on the firing list which is not protected + * by sighand->lock. The delivery path is waiting for + * the timer lock. So go back, unlock and retry. + */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); + timer->it_status = POSIX_TIMER_DISARMED; } /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. + * Sample the current clock for saving the previous setting + * and for rearming the timer. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) - val = cpu_clock_sample(clkid, p); + now = cpu_clock_sample(clkid, p); else - val = cpu_clock_sample_group(clkid, p, true); + now = cpu_clock_sample_group(clkid, p, !sigev_none); + /* Retrieve the previous expiry value if requested. */ if (old) { - if (old_expires == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has overrun already. - * If it has, we'll report it as having overrun and - * with the next reloaded timer already ticking, - * though we are swallowing that pending - * notification here to install the new setting. - */ - u64 exp = bump_cpu_timer(timer, val); - - if (val < exp) { - old_expires = exp - val; - old->it_value = ns_to_timespec64(old_expires); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } + old->it_value = (struct timespec64){ }; + if (old_expires) + __posix_cpu_timer_get(timer, old, now); } + /* Retry if the timer expiry is running concurrently */ if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). - */ unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { - new_expires += val; - } + /* Convert relative expiry time to absolute */ + if (new_expires && !(timer_flags & TIMER_ABSTIME)) + new_expires += now; + + /* Set the new expiry time (might be 0) */ + cpu_timer_setexpires(ctmr, new_expires); /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). 
+ * Arm the timer if it is not disabled, the new expiry value has + * not yet expired and the timer requires signal delivery. + * SIGEV_NONE timers are never armed. In case the timer is not + * armed, enforce the reevaluation of the timer base so that the + * process wide cputime counter can be disabled eventually. */ - cpu_timer_setexpires(ctmr, new_expires); - if (new_expires != 0 && val < new_expires) { - arm_timer(timer, p); + if (likely(!sigev_none)) { + if (new_expires && now < new_expires) + arm_timer(timer, p); + else + trigger_base_recalc_expires(timer, p); } unlock_task_sighand(p, &flags); + + posix_timer_set_common(timer, new); + /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. + * If the new expiry time was already in the past the timer was not + * queued. Fire it immediately even if the thread never runs to + * accumulate more time on this clock. */ - timer->it_interval = timespec64_to_ktime(new->it_interval); + if (!sigev_none && new_expires && now >= new_expires) + cpu_timer_fire(timer); +out: + rcu_read_unlock(); + return ret; +} + +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now) +{ + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; + u64 expires, iv = timer->it_interval; /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. + * Make sure that interval timers are moved forward for the + * following cases: + * - SIGEV_NONE timers which are never armed + * - Timers which expired, but the signal has not yet been + * delivered */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (val >= new_expires) { - if (new_expires != 0) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } + if (iv && timer->it_status != POSIX_TIMER_ARMED) + expires = bump_cpu_timer(timer, now); + else + expires = cpu_timer_getexpires(&timer->it.cpu); + /* + * Expired interval timers cannot have a remaining time <= 0. + * The kernel has to move them forward so that the next + * timer expiry is > @now. + */ + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); + } else { /* - * Make sure we don't keep around the process wide cputime - * counter or the tick dependency if they are not necessary. + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - goto out; - - if (!cpu_timer_queued(ctmr)) - trigger_base_recalc_expires(timer, p); - - unlock_task_sighand(p, &flags); + if (!sigev_none) + itp->it_value.tv_nsec = 1; } - out: - rcu_read_unlock(); - if (old) - old->it_interval = ns_to_timespec64(old_incr); - - return ret; } static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - u64 now, expires = cpu_timer_getexpires(ctmr); struct task_struct *p; + u64 now; rcu_read_lock(); p = cpu_timer_task_rcu(timer); - if (!p) - goto out; - - /* - * Easy part: convert the reload time. 
- */ - itp->it_interval = ktime_to_timespec64(timer->it_interval); - - if (!expires) - goto out; + if (p && cpu_timer_getexpires(&timer->it.cpu)) { + itp->it_interval = ktime_to_timespec64(timer->it_interval); - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - now = cpu_clock_sample(clkid, p); - else - now = cpu_clock_sample_group(clkid, p, false); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + now = cpu_clock_sample(clkid, p); + else + now = cpu_clock_sample_group(clkid, p, false); - if (now < expires) { - itp->it_value = ns_to_timespec64(expires - now); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; + __posix_cpu_timer_get(timer, itp, now); } -out: rcu_read_unlock(); } @@ -846,7 +813,9 @@ static u64 collect_timerqueue(struct timerqueue_head *head, if (++i == MAX_COLLECTED || now < expires) return expires; - ctmr->firing = 1; + ctmr->firing = true; + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(ctmr->handling, current); cpu_timer_dequeue(ctmr); list_add_tail(&ctmr->elist, firing); } @@ -1162,7 +1131,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk); #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK static void posix_cpu_timers_work(struct callback_head *work) { + struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work); + + mutex_lock(&cw->mutex); handle_posix_cpu_timers(current); + mutex_unlock(&cw->mutex); +} + +/* + * Invoked from the posix-timer core when a cancel operation failed because + * the timer is marked firing. The caller holds rcu_read_lock(), which + * protects the timer and the task which is expiring it from being freed. + */ +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling); + + /* Has the handling task completed expiry already? */ + if (!tsk) + return; + + /* Ensure that the task cannot go away */ + get_task_struct(tsk); + /* Now drop the RCU protection so the mutex can be locked */ + rcu_read_unlock(); + /* Wait on the expiry mutex */ + mutex_lock(&tsk->posix_cputimers_work.mutex); + /* Release it immediately again. */ + mutex_unlock(&tsk->posix_cputimers_work.mutex); + /* Drop the task reference. 
*/ + put_task_struct(tsk); + /* Relock RCU so the callsite is balanced */ + rcu_read_lock(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + /* Ensure that timr->it.cpu.handling task cannot go away */ + rcu_read_lock(); + spin_unlock_irq(&timr->it_lock); + posix_cpu_timer_wait_running(timr); + rcu_read_unlock(); + /* @timr is on stack and is valid */ + spin_lock_irq(&timr->it_lock); } /* @@ -1178,6 +1189,7 @@ void clear_posix_cputimers_work(struct task_struct *p) sizeof(p->posix_cputimers_work.work)); init_task_work(&p->posix_cputimers_work.work, posix_cpu_timers_work); + mutex_init(&p->posix_cputimers_work.mutex); p->posix_cputimers_work.scheduled = false; } @@ -1256,6 +1268,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk) lockdep_posixtimer_exit(); } +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + cpu_relax(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + spin_unlock_irq(&timr->it_lock); + cpu_relax(); + spin_lock_irq(&timr->it_lock); +} + static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) { return false; @@ -1344,7 +1368,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { - int cpu_firing; + bool cpu_firing; /* * spin_lock() is sufficient here even independent of the @@ -1356,14 +1380,16 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; + timer->it.cpu.firing = false; /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. + * If the firing flag is cleared then this raced with a + * timer rearm/delete operation. So don't generate an + * event. */ - if (likely(cpu_firing >= 0)) + if (likely(cpu_firing)) cpu_timer_fire(timer); + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(timer->it.cpu.handling, NULL); spin_unlock(&timer->it_lock); } } @@ -1380,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ @@ -1457,6 +1492,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_overrun = -1; error = posix_cpu_timer_create(&timer); timer.it_process = current; + timer.it.cpu.nanosleep = true; if (!error) { static struct itimerspec64 zero_it; @@ -1498,23 +1534,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, expires = cpu_timer_getexpires(&timer.it.cpu); error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { - /* - * Timer is now unarmed, deletion can not fail. - */ + /* Timer is now unarmed, deletion can not fail. 
*/ posix_cpu_timer_del(&timer); + } else { + while (error == TIMER_RETRY) { + posix_cpu_timer_wait_running_nsleep(&timer); + error = posix_cpu_timer_del(&timer); + } } - spin_unlock_irq(&timer.it_lock); - while (error == TIMER_RETRY) { - /* - * We need to handle case when timer was or is in the - * middle of firing. In other cases we already freed - * resources. - */ - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_del(&timer); - spin_unlock_irq(&timer.it_lock); - } + spin_unlock_irq(&timer.it_lock); if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* @@ -1528,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = &current->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1570,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } @@ -1624,6 +1653,7 @@ const struct k_clock clock_posix_cpu = { .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, .timer_rearm = posix_cpu_timer_rearm, + .timer_wait_running = posix_cpu_timer_wait_running, }; const struct k_clock clock_process = { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 90ea5f373e50..9b6fcb8d85e7 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,40 +17,6 @@ #include <linux/time_namespace.h> #include <linux/compat.h> -#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER -/* Architectures may override SYS_NI and COMPAT_SYS_NI */ -#include <asm/syscall_wrapper.h> -#endif - -asmlinkage long sys_ni_posix_timers(void) -{ - pr_err_once("process %d (%s) attempted a POSIX timer syscall " - "while CONFIG_POSIX_TIMERS is not set\n", - current->pid, current->comm); - return -ENOSYS; -} - -#ifndef SYS_NI -#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) -#endif - -#ifndef COMPAT_SYS_NI -#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) -#endif - -SYS_NI(timer_create); -SYS_NI(timer_gettime); -SYS_NI(timer_getoverrun); -SYS_NI(timer_settime); -SYS_NI(timer_delete); -SYS_NI(clock_adjtime); -SYS_NI(getitimer); -SYS_NI(setitimer); -SYS_NI(clock_adjtime32); -#ifdef __ARCH_WANT_SYS_ALARM -SYS_NI(alarm); -#endif - /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC * as it is easy to remain compatible with little code. CLOCK_BOOTTIME @@ -147,6 +113,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ?
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; texp = timespec64_to_ktime(t); @@ -157,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, which_clock); } -#ifdef CONFIG_COMPAT -COMPAT_SYS_NI(timer_create); -#endif - -#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); -#endif - #ifdef CONFIG_COMPAT_32BIT_TIME -SYS_NI(timer_settime32); -SYS_NI(timer_gettime32); SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, struct old_timespec32 __user *, tp) @@ -240,6 +196,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; texp = timespec64_to_ktime(t); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 5dead89308b7..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,164 +9,188 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" /* - * Management arrays for POSIX timers. Timers are now kept in static hash table - * with 512 entries. - * Timer ids are allocated by local routine, which selects proper hash head by - * key, constructed from current->signal address and per signal struct counter. - * This keeps timer ids unique per process, but now they can intersect between - * processes. + * Timers are managed in a hash table for lockless lookup. The hash key is + * constructed from current::signal and the timer ID and the timer is + * matched against current::signal and the timer ID when walking the hash + * bucket list. + * + * This allows checkpoint/restore to reconstruct the exact timer IDs for + * a process. 
*/ +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; -/* - * Lets keep our timers in a slab cache :-) - */ -static struct kmem_cache *posix_timers_cache; +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ +#define TIMER_ANY_ID INT_MIN + +/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. - * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). 
- */ -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, - lockdep_is_held(&hash_lock)) { - if ((timer->it_signal == sig) && (timer->it_id == id)) + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { + /* timer->it_signal can be set concurrently */ + if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; + + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); +} + +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) +{ + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - return __posix_timers_find(head, sig, id); + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) { - struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; - struct hlist_head *head; - int ret = -ENOENT; - - do { - spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { - hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); + + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. 
+ */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; - spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; + } + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } -/* Get clock_realtime */ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); @@ -178,7 +202,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock) return ktime_get_real(); } -/* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { @@ -191,9 +214,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, return do_adjtimex(t); } -/* - * Get monotonic time for posix timers - */ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); @@ -206,9 +226,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) return ktime_get(); } -/* - * Get monotonic-raw time for posix timers - */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); @@ -216,7 +233,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) return 0; } - static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); @@ -268,164 +284,115 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) } /* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - -/* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. 
Clamp the overrun value to INT_MAX */ -static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +static inline int timer_overrun_to_int(struct k_itimer *timr) { - s64 sum = timr->it_overrun_last + (s64)baseval; + if (timr->it_overrun_last > (s64)INT_MAX) + return INT_MAX; - return sum > (s64)INT_MAX ? INT_MAX : (int)sum; + return (int)timr->it_overrun_last; } static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } +static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) +{ + guard(spinlock)(&timr->it_lock); + + /* + * Check if the timer is still alive or whether it got modified + * since the signal was queued. In either case, don't rearm and + * drop the signal. + */ + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) + return false; + + if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) + return true; + + timr->kclock->timer_rearm(timr); + timr->it_status = POSIX_TIMER_ARMED; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_signal_seq; + info->si_overrun = timer_overrun_to_int(timr); + return true; +} + /* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. + * This function is called from the signal delivery code. It decides + * whether the signal should be dropped and rearms interval timers. The + * timer can be unconditionally accessed as there is a reference held on + * it. */ -void posixtimer_rearm(struct kernel_siginfo *info) +bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { - struct k_itimer *timr; - unsigned long flags; + struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq); + bool ret; - timr = lock_timer(info->si_tid, &flags); - if (!timr) - return; - - if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { - timr->kclock->timer_rearm(timr); + /* + * Release siglock to ensure proper locking order versus + * timr::it_lock. Keep interrupts disabled. + */ + spin_unlock(&current->sighand->siglock); - timr->it_active = 1; - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1LL; - ++timr->it_requeue_pending; + ret = __posixtimer_deliver_signal(info, timr); - info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); - } + /* Drop the reference which was acquired when the signal was queued */ + posixtimer_putref(timr); - unlock_timer(timr, flags); + spin_lock(&current->sighand->siglock); + return ret; } -int posix_timer_event(struct k_itimer *timr, int si_private) +void posix_timer_queue_signal(struct k_itimer *timr) { - enum pid_type type; - int ret; - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->posixtimer_rearm(). - * - * If dequeue_signal() sees the "right" value of * si_sys_private it calls posixtimer_rearm().
- * We re-queue ->sigq and drop ->it_lock(). - * posixtimer_rearm() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; + lockdep_assert_held(&timr->it_lock); + + if (!posixtimer_valid(timr)) + return; - type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type); - /* If we failed to send the signal the timer stops. */ - return ret > 0; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; + posixtimer_send_sigqueue(timr); } /* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + * This function gets called when a POSIX.1b interval timer expires from + * the HRTIMER interrupt (soft interrupt on RT kernels). + * + * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI + * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { - struct k_itimer *timr; - unsigned long flags; - int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; - - timr = container_of(timer, struct k_itimer, it.real.timer); - spin_lock_irqsave(&timr->it_lock, flags); + struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); - timr->it_active = 0; - if (timr->it_interval != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - if (timr->it_interval != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. - * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. 
- */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = NSEC_PER_SEC / HZ; + guard(spinlock_irqsave)(&timr->it_lock); + posix_timer_queue_signal(timr); + return HRTIMER_NORESTART; +} - if (timr->it_interval < kj) - now = ktime_add(now, kj); - } -#endif - timr->it_overrun += hrtimer_forward(timer, now, - timr->it_interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - timr->it_active = 1; - } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; } - - unlock_timer(timr, flags); - return ret; + return -EINVAL; } static struct pid *good_sigevent(sigevent_t * event) @@ -452,45 +419,45 @@ static struct pid *good_sigevent(sigevent_t * event) } } -static struct k_itimer * alloc_posix_timer(void) +static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr; + + if (unlikely(!posix_timers_cache)) + return NULL; + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; - if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + + if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } - clear_siginfo(&tmr->sigq->info); + rcuref_init(&tmr->rcuref, 1); return tmr; } -static void k_itimer_rcu_free(struct rcu_head *head) +void posixtimer_free_timer(struct k_itimer *tmr) { - struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); - - kmem_cache_free(posix_timers_cache, tmr); + put_pid(tmr->it_pid); + if (tmr->sigq.ucounts) + dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING); + kfree_rcu(tmr, rcu); } -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&hash_lock, flags); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) hlist_del_rcu(&tmr->t_hash); - spin_unlock_irqrestore(&hash_lock, flags); - } - put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); - call_rcu(&tmr->rcu, k_itimer_rcu_free); + posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -499,78 +466,107 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; - int it_id_set = IT_ID_NOT_SET; if (!kc) return -EINVAL; if (!kc->timer_create) return -EOPNOTSUPP; + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; spin_lock_init(&new_timer->it_lock); - new_timer_id = posix_timer_add(new_timer); + + /* + * Add the timer to the hash table. 
The timer is not yet valid + * after insertion, but has a unique ID allocated. + */ + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { - error = new_timer_id; - goto out; + posixtimer_free_timer(new_timer); + return new_timer_id; } - it_id_set = IT_ID_SET; - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; } new_timer->it_sigev_notify = event->sigev_notify; - new_timer->sigq->info.si_signo = event->sigev_signo; - new_timer->sigq->info.si_value = event->sigev_value; + new_timer->sigq.info.si_signo = event->sigev_signo; + new_timer->sigq.info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->sigq->info.si_signo = SIGALRM; - memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); - new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->sigq.info.si_signo = SIGALRM; + new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; + if (new_timer->it_sigev_notify & SIGEV_THREAD_ID) + new_timer->it_pid_type = PIDTYPE_PID; + else + new_timer->it_pid_type = PIDTYPE_TGID; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + new_timer->sigq.info.si_tid = new_timer->it_id; + new_timer->sigq.info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } - + /* + * After successful copy out, the timer ID is visible to user space + * now but not yet valid because the low order bit of new_timer::it_signal is 1. + * + * Complete the initialization with the clock specific create + * callback. + */ error = kc->timer_create(new_timer); if (error) goto out; - spin_lock_irq(&current->sighand->siglock); - new_timer->it_signal = current->signal; - list_add(&new_timer->list, &current->signal->posix_timers); - spin_unlock_irq(&current->sighand->siglock); - - return 0; /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(&current->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid.
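// Illustrative user-space sketch (not part of the patch) of the CRIU restore
// path added above: with restore mode enabled, timer_create() reads the
// requested ID from *created_timer_id instead of allocating the next free
// one. The raw syscall is used because glibc passes its own internal id
// variable to the kernel. PR_TIMER_CREATE_RESTORE_IDS (the prctl option that
// takes the _ON/_OFF/_GET arguments shown above) is assumed to be exported
// by <linux/prctl.h> once this series is in place.
#include <linux/prctl.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int restore_timer_with_id(int wanted_id)
{
	int id = wanted_id;	// kernel reads the requested ID from this location

	if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0))
		return -1;
	// NULL sigevent: SIGEV_SIGNAL/SIGALRM defaults, as in do_timer_create()
	if (syscall(SYS_timer_create, CLOCK_MONOTONIC, NULL, &id))
		return -1;
	prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0);
	return id;	// equals wanted_id on success
}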
+ */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ + return 0; out: - release_posix_timer(new_timer, it_id_set); + posix_timer_unhash_and_free(new_timer); return error; } @@ -604,14 +600,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -622,18 +611,46 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) if ((unsigned long long)timer_id > INT_MAX) return NULL; - rcu_read_lock(); + /* + * The hash lookup and the timers are RCU protected. + * + * Timers are added to the hash in an invalid state where + * timr::it_signal is marked invalid. timr::it_signal is only set + * after the rest of the initialization succeeded. + * + * Timer destruction happens in steps: + * 1) Set timr::it_signal marked invalid with timr::it_lock held + * 2) Release timr::it_lock + * 3) Remove from the hash under hash_lock + * 4) Put the reference count. + * + * The reference count might not drop to zero if timr::sigq is + * queued. In that case the signal delivery or flush will put the + * last reference count. + * + * When the reference count reaches zero, the timer is scheduled + * for RCU removal after the grace period. + * + * Holding rcu_read_lock() across the lookup ensures that + * the timer cannot be freed. + * + * The lookup validates locklessly that timr::it_signal == + * current::signal and timr::it_id == @timer_id. timr::it_id + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. + */ + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + spin_lock_irq(&timr->it_lock); + /* + * Validate under timr::it_lock that timr::it_signal is + * still valid. Pairs with #1 above. + */ + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,20 +669,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) } /* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. + * Get the time remaining on a POSIX.1b interval timer. * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. + * Two issues to handle here: * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here.
Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. + * 1) The timer has a requeue pending. The return value must appear as + * if the timer has been requeued right now. + * + * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued + * into the hrtimer queue and therefore never expired. Emulate expiry + * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { @@ -679,10 +692,14 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - } else if (!timr->it_active) { + } else if (timr->it_status == POSIX_TIMER_DISARMED) { /* - * SIGEV_NONE oneshot timers are never queued. Check them - * below. + * SIGEV_NONE oneshot timers are never queued and therefore + * timr->it_status is always DISARMED. The check below + * vs. remaining time will handle this case. + * + * For all other timers there is nothing to update here, so + * return. */ if (!sig_none) return; @@ -691,18 +708,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) now = kc->clock_get_ktime(timr->it_clock); /* - * When a requeue is pending or this is a SIGEV_NONE timer move the - * expiry time forward by intervals, so expiry is > now. + * If this is an interval timer and either has requeue pending or + * is a SIGEV_NONE timer move the expiry time forward by intervals, + * so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); - /* Return 0 only, when the timer is expired and not pending */ + /* + * As @now is retrieved before a possible timer_forward() and + * cannot be reevaluated by the compiler @remaining is based on the + * same @now value. Therefore @remaining is consistent vs. @now. + * + * Consequently all interval timers, i.e. @iv > 0, cannot have a + * remaining time <= 0 because timer_forward() guarantees to move + * them forward so that the next timer expiry is > @now. + */ if (remaining <= 0) { /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; @@ -711,27 +739,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } } -/* Get the time remaining on a POSIX.1b interval timer. */ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct k_itimer *timr; - const struct k_clock *kc; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -765,29 +778,28 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, #endif -/* - * Get the number of overruns of a POSIX.1b interval timer. 
This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to posixtimer_rearm(). So all we need to do is - * to pick up the frozen overrun. +/** + * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer + * @timer_id: The timer ID which identifies the timer + * + * The "overrun count" of a timer is one plus the number of expiration + * intervals which have elapsed between the first expiry, which queues the + * signal and the actual signal delivery. On signal delivery the "overrun + * count" is calculated and cached, so it can be returned directly here. + * + * As this is relative to the last queued signal the returned overrun count + * is meaningless outside of the signal delivery path and even there it + * does not accurately reflect the current state when user space evaluates + * it. + * + * Returns: + * -EINVAL @timer_id is invalid + * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - int overrun; - unsigned long flags; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr, 0); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -800,7 +812,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -809,11 +821,10 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) @@ -831,27 +842,41 @@ static void common_timer_wait_running(struct k_itimer *timer) } /* - * On PREEMPT_RT this prevent priority inversion against softirq kthread in - * case it gets preempted while executing a timer callback. See comments in - * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a - * cpu_relax(). + * On PREEMPT_RT this prevents priority inversion and a potential livelock + * against the ksoftirqd thread in case that ksoftirqd gets preempted while + * executing a hrtimer callback. + * + * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this + * just results in a cpu_relax(). + * + * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is + * just a cpu_relax(). 
With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this + * prevents spinning on an eventually scheduled out task and a livelock + * when the task which tries to delete or disarm the timer has preempted + * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); + /* + * kc->timer_wait_running() might drop RCU lock. So @timer + * cannot be touched anymore after the function returns! + */ + timer->kclock->timer_wait_running(timer); +} - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); +/* + * Set up the new interval and reset the signal delivery data + */ +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting) +{ + if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec) + timer->it_interval = timespec64_to_ktime(new_setting->it_interval); + else + timer->it_interval = 0; - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + /* Reset overrun accounting */ + timer->it_overrun_last = 0; + timer->it_overrun = -1LL; } /* Set a POSIX.1b interval timer. */ @@ -866,8 +891,6 @@ int common_timer_set(struct k_itimer *timr, int flags, if (old_setting) common_timer_get(timr, old_setting); - /* Prevent rearming by clearing the interval */ - timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. @@ -875,35 +898,27 @@ int common_timer_set(struct k_itimer *timr, int flags, if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; - timr->it_active = 0; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; + timr->it_status = POSIX_TIMER_DISARMED; + posix_timer_set_common(timr, new_setting); - /* Switch off the timer when it_value is zero */ + /* Keep timer disarmed when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - timr->it_active = !sigev_none; + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error = 0; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -911,27 +926,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, 
tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - return error; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; + + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -939,8 +955,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { - struct itimerspec64 new_spec, old_spec; - struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; + struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) @@ -949,6 +964,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; + rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) @@ -984,92 +1000,115 @@ int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; - timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; return 0; } -static inline int timer_delete_hook(struct k_itimer *timer) +/* + * If the deleted timer is on the ignored list, remove it and + * drop the associated reference. + */ +static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) { - const struct k_clock *kc = timer->kclock; - - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + if (!hlist_unhashed(&tmr->ignored_list)) { + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } } -/* Delete a POSIX.1b interval timer. */ -SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +static void posix_timer_delete(struct k_itimer *timer) { - struct k_itimer *timer; - unsigned long flags; - - timer = lock_timer(timer_id, &flags); + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. 
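// Illustrative user-space sketch (not part of the patch): exercises two of
// the paths described above. A SIGEV_NONE timer is never queued, so
// common_timer_get() emulates its expiry when it is read, and writing an
// all-zero it_value via timer_settime() takes the "keep timer disarmed"
// path while still handing back the previous setting.
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec arm = { .it_value = { .tv_sec = 2 } };
	struct itimerspec none = { 0 }, cur, old;
	timer_t id;

	timer_create(CLOCK_MONOTONIC, &sev, &id);
	timer_settime(id, 0, &arm, NULL);

	sleep(1);
	timer_gettime(id, &cur);		// ~1s left, computed from the expiry time
	timer_settime(id, 0, &none, &old);	// disarm; old holds the remaining time
	printf("remaining %lld.%09ld, old %lld.%09ld\n",
	       (long long)cur.it_value.tv_sec, cur.it_value.tv_nsec,
	       (long long)old.it_value.tv_sec, old.it_value.tv_nsec);
	timer_delete(id);
	return 0;
}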
+ */ + timer->it_signal_seq++; -retry_delete: - if (!timer) - return -EINVAL; + scoped_guard (spinlock, &current->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); } - spin_lock(&current->sighand->siglock); - list_del(&timer->list); - spin_unlock(&current->sighand->siglock); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); - return 0; + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } -/* - * return timer owned by the process, used by exit_itimers - */ -static void itimer_delete(struct k_itimer *timer) +/* Delete a POSIX.1b interval timer. */ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { -retry_delete: - spin_lock_irq(&timer->it_lock); + struct k_itimer *timer; - if (timer_delete_hook(timer) == TIMER_RETRY) { - spin_unlock_irq(&timer->it_lock); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - list_del(&timer->list); - - spin_unlock_irq(&timer->it_lock); - release_posix_timer(timer, IT_ID_SET); + /* Remove it from the hash, which frees up the timer ID */ + posix_timer_unhash_and_free(timer); + return 0; } /* - * This is called by do_exit or de_thread, only when nobody else can - * modify the signal->posix_timers list. Yet we need sighand->siglock - * to prevent the race with /proc/pid/timers. + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. */ void exit_itimers(struct task_struct *tsk) { - struct list_head timers; - struct k_itimer *tmr; + struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; - if (list_empty(&tsk->signal->posix_timers)) + if (hlist_empty(&tsk->signal->posix_timers)) return; - spin_lock_irq(&tsk->sighand->siglock); - list_replace_init(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + /* Protect against concurrent read via /proc/$PID/timers */ + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); + + /* The timers are no longer accessible via tsk::signal */ + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } + + /* + * There should be no timers on the ignored list. itimer_delete() has + * mopped them up.
+ */ + if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) + return; - while (!list_empty(&timers)) { - tmr = list_first_entry(&timers, struct k_itimer, list); - itimer_delete(tmr); + hlist_move_list(&tsk->signal->ignored_posix_timers, &timers); + while (!hlist_empty(&timers)) { + posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer, + ignored_list)); } } @@ -1085,6 +1124,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (get_timespec64(&new_tp, tp)) return -EFAULT; + /* + * Permission checks have to be done inside the clock specific + * setter callback. + */ return kc->clock_set(which_clock, &new_tp); } @@ -1135,6 +1178,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +/** + * sys_clock_getres - Get the resolution of a clock + * @which_clock: The clock to get the resolution for + * @tp: Pointer to a user space timespec64 for storage + * + * POSIX defines: + * + * "The clock_getres() function shall return the resolution of any + * clock. Clock resolutions are implementation-defined and cannot be set by + * a process. If the argument res is not NULL, the resolution of the + * specified clock shall be stored in the location pointed to by res. If + * res is NULL, the clock resolution is not returned. If the time argument + * of clock_settime() is not a multiple of res, then the value is truncated + * to a multiple of res." + * + * Due to the various hardware constraints the real resolution can vary + * wildly and even change during runtime when the underlying devices are + * replaced. The kernel can also use hardware devices with different + * resolutions for reading the time and for arming timers. + * + * The kernel therefore deviates from the POSIX spec in various aspects: + * + * 1) The resolution returned to user space + * + * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, + * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM and CLOCK_MONOTONIC_RAW + * the kernel differentiates only two cases: + * + * I) Low resolution mode: + * + * When high resolution timers are disabled at compile or runtime + * the resolution returned is nanoseconds per tick, which represents + * the precision at which timers expire. + * + * II) High resolution mode: + * + * When high resolution timers are enabled the resolution returned + * is always one nanosecond independent of the actual resolution of + * the underlying hardware devices. + * + * For CLOCK_*_ALARM the actual resolution depends on system + * state. When the system is running the resolution is the same as the + * resolution of the other clocks. During suspend the actual + * resolution is the resolution of the underlying RTC device which + * might be way less precise than the clockevent device used during + * running state. + * + * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution + * returned is always nanoseconds per tick. + * + * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution + * returned is always one nanosecond under the assumption that the + * underlying scheduler clock has a better resolution than nanoseconds + * per tick. + * + * For dynamic POSIX clocks (PTP devices) the resolution returned is + * always one nanosecond. + * + * 2) Effect on sys_clock_settime() + * + * The kernel does not truncate the time which is handed in to + * sys_clock_settime(). The kernel internal timekeeping is always using + * nanoseconds precision independent of the clocksource device which is + * used to read the time from.
The resolution of that device only + * affects the precision of the time returned by sys_clock_gettime(). + * + * Returns: + * 0 Success. @tp contains the resolution + * -EINVAL @which_clock is not a valid clock ID + * -EFAULT Copying the resolution to @tp faulted + * -ENODEV Dynamic POSIX clock is not backed by a device + * -EOPNOTSUPP Dynamic POSIX clock does not support getres() + */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { @@ -1226,7 +1342,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, #endif /* - * nanosleep for monotonic and realtime clocks + * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) @@ -1238,8 +1354,13 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +/* + * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME + * + * Absolute nanosleeps for these clocks are time-namespace adjusted. + */ static int common_nsleep_timens(const clockid_t which_clock, int flags, - const struct timespec64 *rqtp) + const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); @@ -1270,6 +1391,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; @@ -1297,6 +1419,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; @@ -1402,6 +1525,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... 
CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) @@ -1418,3 +1544,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index f32a2ebba9b8..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,6 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define TIMER_RETRY 1 +enum posix_timer_state { + POSIX_TIMER_DISARMED, + POSIX_TIMER_ARMED, + POSIX_TIMER_REQUEUE_PENDING, +}; + struct k_clock { int (*clock_getres)(const clockid_t which_clock, struct timespec64 *tp); @@ -35,11 +41,13 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; -int posix_timer_event(struct k_itimer *timr, int si_private); +void posix_timer_queue_signal(struct k_itimer *timr); void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting); int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8464c5acc913..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -64,14 +64,14 @@ static struct clock_data cd ____cacheline_aligned = { .actual_read_sched_clock = jiffy_sched_clock_read, }; -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount_latch(&cd.seq); + *seq = read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } @@ -80,23 +80,45 @@ notrace int sched_clock_read_retry(unsigned int seq) return read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long notrace sched_clock(void) +static __always_inline unsigned long long __sched_clock(void) { - u64 cyc, res; - unsigned int seq; struct clock_read_data *rd; + unsigned int seq; + u64 cyc, res; do { - rd = sched_clock_read_begin(&seq); + seq = raw_read_seqcount_latch(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (sched_clock_read_retry(seq)); + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); return res; } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return __sched_clock(); 
+} + +unsigned long long notrace sched_clock(void) +{ + unsigned long long ns; + preempt_disable_notrace(); + /* + * All of __sched_clock() is a seqcount_latch reader critical section, + * but relies on the raw helpers which are uninstrumented. For KCSAN, + * mark all accesses in __sched_clock() as atomic. + */ + kcsan_nestable_atomic_begin(); + ns = __sched_clock(); + kcsan_nestable_atomic_end(); + preempt_enable_notrace(); + return ns; +} + /* * Updating the data required to read the clock. * @@ -109,17 +131,19 @@ unsigned long long notrace sched_clock(void) */ static void update_clock_read_data(struct clock_read_data *rd) { - /* update the backup (odd) copy with the new data */ - cd.read_data[1] = *rd; - /* steer readers towards the odd copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch_begin(&cd.seq); /* now its safe for us to update the normal (even) copy */ cd.read_data[0] = *rd; /* switch readers back to the even copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch(&cd.seq); + + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + write_seqcount_latch_end(&cd.seq); } /* @@ -150,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -223,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { @@ -239,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. 
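// Illustrative kernel-style sketch (not part of the patch): the same latch
// pattern as update_clock_read_data()/__sched_clock() above, applied to a
// generic two-word payload. Readers always find one consistent copy and
// never block on the writer. latched_pair, pair_update and pair_scale are
// made-up names for the example.
#include <linux/seqlock.h>
#include <linux/types.h>

struct latched_pair {
	seqcount_latch_t	seq;
	struct { u32 mult, shift; } copy[2];
};

static void pair_update(struct latched_pair *p, u32 mult, u32 shift)
{
	write_seqcount_latch_begin(&p->seq);	// steer readers to copy[1]
	p->copy[0].mult  = mult;
	p->copy[0].shift = shift;
	write_seqcount_latch(&p->seq);		// steer readers back to copy[0]
	p->copy[1].mult  = mult;
	p->copy[1].shift = shift;
	write_seqcount_latch_end(&p->seq);
}

static u64 pair_scale(struct latched_pair *p, u64 cyc)
{
	unsigned int seq;
	u64 ns;

	do {
		seq = read_seqcount_latch(&p->seq);
		ns = ((u64)p->copy[seq & 1].mult * cyc) >> p->copy[seq & 1].shift;
	} while (read_seqcount_latch_retry(&p->seq, seq));

	return ns;
}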
*/ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } @@ -257,7 +280,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount_latch(&cd.seq); + unsigned int seq = read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } @@ -273,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -282,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c new file mode 100644 index 000000000000..3c90574bd904 --- /dev/null +++ b/kernel/time/sleep_timeout.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel internal schedule timeout and sleeping functions + */ + +#include <linux/delay.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> + +#include "tick-internal.h" + +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = timer_container_of(timeout, t, timer); + + wake_up_process(timeout->task); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): + * + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be %TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * Returns: 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. 
In all cases the return value is guaranteed + * to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful for the convenience + * of the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative values + * but I'd like to return a valid offset (>=0) to allow + * the caller to do everything it wants with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of paranoia. Note that the retval will be + * 0 since no piece of the kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happen anyway). You just have the printk() + * that will tell you if something has gone wrong and where. + */ + if (timeout < 0) { + pr_err("%s: wrong timeout value %lx\n", __func__, timeout); + dump_stack(); + __set_current_state(TASK_RUNNING); + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + timer.timer.expires = expire; + add_timer(&timer.timer); + schedule(); + timer_delete_sync(&timer.timer); + + /* Remove the timer from the object tracker */ + timer_destroy_on_stack(&timer.timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * __set_current_state() can be used in schedule_timeout_*() functions, because + * schedule_timeout() calls schedule() unconditionally. + */ + +/** + * schedule_timeout_interruptible - sleep until timeout (interruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +/** + * schedule_timeout_killable - sleep until timeout (killable) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_KILLABLE before starting the timeout. + */ +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +/** + * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. + */ +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/** + * schedule_timeout_idle - sleep until timeout (idle) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_IDLE before starting the timeout. It is similar to + * schedule_timeout_uninterruptible(), except this task will not contribute to + * load average.
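// Illustrative sketch (not part of the patch): the calling convention the
// kernel-doc above describes. The task state is set before the condition
// check so a concurrent waker (WRITE_ONCE(*flag, true); wake_up_process(task))
// cannot be missed; wait_for_flag() and flag are made-up names.
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

static int wait_for_flag(bool *flag, unsigned int timeout_ms)
{
	signed long remaining = msecs_to_jiffies(timeout_ms);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(*flag) || !remaining)
			break;
		if (signal_pending(current)) {
			__set_current_state(TASK_RUNNING);
			return -ERESTARTSYS;
		}
		// Returns the jiffies left when woken early, 0 on expiry
		remaining = schedule_timeout(remaining);
	}
	__set_current_state(TASK_RUNNING);
	return READ_ONCE(*flag) ? 0 : -ETIMEDOUT;
}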
+ */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * @clock_id: timer clock to be used + * + * Details are explained in schedule_hrtimeout_range() function description as + * this function is commonly used. + */ +int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + hrtimer_sleeper_start_expires(&t, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. + * The kernel gives the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least the given time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * See schedule_hrtimeout_range() for details. The @delta argument of + * schedule_hrtimeout_range() is set to 0 and has therefore no impact.
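// Illustrative sketch (not part of the patch): a relative hrtimer sleep with
// 1 ms of slack, following the kernel-doc above. hrsleep_5ms() is a made-up
// name; the task state is set first, exactly as for schedule_timeout().
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

static int hrsleep_5ms(void)
{
	ktime_t timeout = ms_to_ktime(5);

	set_current_state(TASK_INTERRUPTIBLE);
	// 0 on expiry, -EINTR when a signal or explicit wakeup ended the sleep
	return schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC, HRTIMER_MODE_REL);
}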
+ */ +int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Requested sleep duration in milliseconds + * + * msleep() uses jiffy based timeouts for the sleep duration. Because of the + * design of the timer wheel, the maximum additional percentage delay (slack) is + * 12.5%. This is only valid for timers which will end up in level 1 or a higher + * level of the timer wheel. For explanation of those 12.5% please check the + * detailed description about the basics of the timer wheel. + * + * The slack of timers which will end up in level 0 depends on sleep duration + * (msecs) and HZ configuration and can be calculated in the following way (with + * the timer wheel design restriction that the slack is not less than 12.5%): + * + * ``slack = MSECS_PER_TICK / msecs`` + * + * When the allowed slack of the callsite is known, the calculation could be + * turned around to find the minimal allowed sleep duration to meet the + * constraints. For example: + * + * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: + * all sleep durations greater or equal 4ms will meet the constraints. + * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: + * all sleep durations greater or equal 8ms will meet the constraints. + * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: + * all sleep durations greater or equal 16ms will meet the constraints. + * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: + * all sleep durations greater or equal 32ms will meet the constraints. + * + * See also the signal aware variant msleep_interruptible(). + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Requested sleep duration in milliseconds + * + * See msleep() for some basic information. + * + * The difference between msleep() and msleep_interruptible() is that the latter + * can be interrupted by a signal delivery and then returns early. + * + * Returns: The remaining time of the sleep duration transformed to msecs (see + * schedule_timeout() for details). + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State the current task will be in while sleeping + * + * usleep_range_state() sleeps at least for the minimum specified time but not + * longer than the maximum specified amount of time. The range might reduce + * power usage by allowing hrtimers to coalesce an already scheduled interrupt + * with this hrtimer. In the worst case, an interrupt is scheduled for the upper + * bound. + * + * The sleeping task is set to the specified state before starting the sleep. + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() or its variants instead of udelay().
The sleep improves + * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). + */ +void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + if (WARN_ON_ONCE(max < min)) + delta = 0; + + for (;;) { + __set_current_state(state); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 13b11eb62685..783f2297111b 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -149,11 +149,12 @@ module_init(udelay_test_init); static void __exit udelay_test_exit(void) { mutex_lock(&udelay_test_lock); - debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL)); + debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL); mutex_unlock(&udelay_test_lock); } module_exit(udelay_test_exit); +MODULE_DESCRIPTION("udelay test module"); MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 797eb93103ad..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer callback function is currently running, then * hrtimer_start() cannot move it and the timer stays on the CPU on * which it is assigned at the moment. + */ + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. * - * As this can be called from idle code, the hrtimer_start() - * invocation has to be wrapped with RCU_NONIDLE() as - * hrtimer_start() can call into tracing. + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. */ - RCU_NONIDLE( { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); - /* - * The core tick broadcast mode expects bc->bound_on to be set - * correctly to prevent a CPU which has the broadcast hrtimer - * armed from going deep idle. - * - * As tick_broadcast_lock is held, nothing can change the cpu - * base which was just established in hrtimer_start() above. So - * the below access is safe even without holding the hrtimer - * base lock. 
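// Illustrative sketch (not part of the patch): choosing a delay primitive
// according to the msleep()/usleep_range_state() documentation above.
// example_delays() is a made-up name and assumes process context.
#include <linux/delay.h>

static void example_delays(void)
{
	// Atomic context would have to busy-wait instead: udelay(10);
	usleep_range(100, 200);	// short sleep, 100us floor with 100us of slack
	msleep(50);		// long sleep, jiffy granularity and wheel slack
}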
- */ - bc->bound_on = bctimer.base->cpu_base->cpu; - } ); + bc->bound_on = bctimer.base->cpu_base->cpu; + return 0; } @@ -105,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f7fe6fe36173..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -35,14 +35,15 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); -static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); # ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); # endif #else -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline void +tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } # ifdef CONFIG_HOTPLUG_CPU @@ -264,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); ret = 1; } else { /* @@ -500,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); } } out: @@ -622,9 +623,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) * to avoid a deep idle transition as we are about to get the * broadcast IPI right away. */ -int tick_check_broadcast_expired(void) +noinstr int tick_check_broadcast_expired(void) { +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); +#else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +#endif } /* @@ -1015,49 +1020,104 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ -static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, + bool from_periodic) { int cpu = smp_processor_id(); + ktime_t nexttick = 0; if (!bc) return; - /* Set it up only once ! */ - if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = clockevent_state_periodic(bc); - - bc->event_handler = tick_handle_oneshot_broadcast; - + /* + * When the broadcast device was switched to oneshot by the first + * CPU handling the NOHZ change, the other CPUs will reach this + * code via hrtimer_run_queues() -> tick_check_oneshot_change() + * too. 
Set up the broadcast device only once! + */ + if (bc->event_handler == tick_handle_oneshot_broadcast) { /* - * We must be careful here. There might be other CPUs - * waiting for periodic broadcast. We need to set the - * oneshot_mask bits for those and program the - * broadcast device to fire. + * The CPU which switched from periodic to oneshot mode + * set the broadcast oneshot bit for all other CPUs which + * are in the general (periodic) broadcast mask to ensure + * that CPUs which wait for the periodic broadcast are + * woken up. + * + * Clear the bit for the local CPU as the set bit would + * prevent the first tick_broadcast_enter() after this CPU + * switched to oneshot state to program the broadcast + * device. + * + * This code can also be reached via tick_broadcast_control(), + * but this cannot avoid the tick_broadcast_clear_oneshot() + * as that would break the periodic to oneshot transition of + * secondary CPUs. But that's harmless as the below only + * clears already cleared bits. */ + tick_broadcast_clear_oneshot(cpu); + return; + } + + + bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event = KTIME_MAX; + + /* + * When the tick mode is switched from periodic to oneshot it must + * be ensured that CPUs which are waiting for periodic broadcast + * get their wake-up at the next tick. This is achieved by ORing + * tick_broadcast_mask into tick_broadcast_oneshot_mask. + * + * For other callers, e.g. broadcast device replacement, + * tick_broadcast_oneshot_mask must not be touched as this would + * set bits for CPUs which are already NOHZ, but not idle. Their + * next tick_broadcast_enter() would observe the bit set and fail + * to update the expiry time and the broadcast event device. + */ + if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); + /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); - cpumask_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(tmpmask)) { - ktime_t nextevt = tick_get_next_period(); + /* + * Ensure that the oneshot broadcast handler will wake the + * CPUs which are still waiting for periodic broadcast. + */ + nexttick = tick_get_next_period(); + tick_broadcast_init_next_event(tmpmask, nexttick); - clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, nextevt); - tick_broadcast_set_event(bc, cpu, nextevt); - } else - bc->next_event = KTIME_MAX; - } else { /* - * The first cpu which switches to oneshot mode sets - * the bit for all other cpus which are in the general - * (periodic) broadcast mask. So the bit is set and - * would prevent the first broadcast enter after this - * to program the bc device. + * If the underlying broadcast clock event device is + * already in oneshot state, then there is nothing to do. + * The device was already armed for the next tick + * in tick_handle_broadcast_periodic() */ - tick_broadcast_clear_oneshot(cpu); + if (clockevent_state_oneshot(bc)) + return; } + + /* + * When switching from periodic to oneshot mode arm the broadcast + * device for the next tick. + * + * If the broadcast device has been replaced in oneshot mode and + * the oneshot broadcast mask is not empty, then arm it to expire + * immediately in order to reevaluate the next expiring timer. + * @nexttick is 0 and therefore in the past which will cause the + * clockevent code to force an event. 
+ * + * For both cases the programming can be avoided when the oneshot + * broadcast mask is empty. + * + * tick_broadcast_set_event() implicitly switches the broadcast + * device to oneshot state. + */ + if (!cpumask_empty(tick_broadcast_oneshot_mask)) + tick_broadcast_set_event(bc, cpu, nexttick); } /* @@ -1066,14 +1126,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; + enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -1088,6 +1150,30 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* + * If the broadcast force bit of the current CPU is set, + * then the current CPU has not yet reprogrammed the local + * timer device to avoid a ping-pong race. See + * ___tick_broadcast_oneshot_control(). + * + * If the broadcast device is hrtimer based then + * programming the broadcast event below does not have any + * effect because the local clockevent device is not + * running and not programmed because the broadcast event + * is not earlier than the pending event of the local clock + * event device. As a consequence all CPUs waiting for a + * broadcast event are stuck forever. + * + * Detect this condition and reprogram the cpu local timer + * device to avoid the starvation. + */ + if (tick_check_broadcast_expired()) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); + tick_program_event(td->evtdev->next_event, 1); + } + /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 46789356f856..7e33d3f2e889 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -7,6 +7,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ +#include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> @@ -84,7 +85,7 @@ int tick_is_oneshot_available(void) */ static void tick_periodic(int cpu) { - if (tick_do_timer_cpu == cpu) { + if (READ_ONCE(tick_do_timer_cpu) == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); @@ -111,15 +112,13 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); -#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) /* * The cpu might have transitioned to HIGHRES or NOHZ mode via * update_process_times() -> run_local_timers() -> * hrtimer_run_queues(). 
*/ - if (dev->event_handler != tick_handle_periodic) + if (IS_ENABLED(CONFIG_TICK_ONESHOT) && dev->event_handler != tick_handle_periodic) return; -#endif if (!clockevent_state_oneshot(dev)) return; @@ -179,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } -#ifdef CONFIG_NO_HZ_FULL -static void giveup_do_timer(void *info) -{ - int cpu = *(unsigned int *)info; - - WARN_ON(tick_do_timer_cpu != smp_processor_id()); - - tick_do_timer_cpu = cpu; -} - -static void tick_take_do_timer_from_boot(void) -{ - int cpu = smp_processor_id(); - int from = tick_do_timer_boot_cpu; - - if (from >= 0 && from != cpu) - smp_call_function_single(from, giveup_do_timer, &cpu, 1); -} -#endif - /* * Setup the tick device */ @@ -217,25 +196,30 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; - + if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) { + WRITE_ONCE(tick_do_timer_cpu, cpu); tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* - * The boot CPU may be nohz_full, in which case set - * tick_do_timer_boot_cpu so the first housekeeping - * secondary that comes up will take do_timer from - * us. + * The boot CPU may be nohz_full, in which case the + * first housekeeping secondary will take do_timer() + * from it. */ if (tick_nohz_full_cpu(cpu)) tick_do_timer_boot_cpu = cpu; - } else if (tick_do_timer_boot_cpu != -1 && - !tick_nohz_full_cpu(cpu)) { - tick_take_do_timer_from_boot(); + } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) { tick_do_timer_boot_cpu = -1; - WARN_ON(tick_do_timer_cpu != cpu); + /* + * The boot CPU will stay in periodic (NOHZ disabled) + * mode until clocksource_done_booting() called after + * smp_init() selects a high resolution clocksource and + * timekeeping_notify() kicks the NOHZ stuff alive. + * + * So this WRITE_ONCE can only race with the READ_ONCE + * check in tick_periodic() but this race is harmless. + */ + WRITE_ONCE(tick_do_timer_cpu, cpu); #endif } @@ -399,37 +383,46 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); #ifdef CONFIG_HOTPLUG_CPU +void tick_assert_timekeeping_handover(void) +{ + WARN_ON_ONCE(tick_do_timer_cpu == smp_processor_id()); +} /* - * Transfer the do_timer job away from a dying cpu. - * - * Called with interrupts disabled. No locking required. If - * tick_do_timer_cpu is owned by this cpu, nothing can change it. + * Stop the tick and transfer the timekeeping job away from a dying cpu. */ -void tick_handover_do_timer(void) +int tick_cpu_dying(unsigned int dying_cpu) { - if (tick_do_timer_cpu == smp_processor_id()) + /* + * If the current CPU is the timekeeper, it's the only one that can + * safely hand over its duty. Also all online CPUs are in stop + * machine, guaranteed not to be idle, therefore there is no + * concurrency and it's safe to pick any online successor. + */ + if (tick_do_timer_cpu == dying_cpu) tick_do_timer_cpu = cpumask_first(cpu_online_mask); + + /* Make sure the CPU won't try to retake the timekeeping duty */ + tick_sched_timer_dying(dying_cpu); + + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(dying_cpu); + + return 0; } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. 
- * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; @@ -510,6 +503,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -529,9 +523,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -553,8 +560,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. 
+ */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 649f2b48e8f0..4e4f7bbe2a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -8,6 +8,11 @@ #include "timekeeping.h" #include "tick-sched.h" +struct timer_events { + u64 local; + u64 global; +}; + #ifdef CONFIG_GENERIC_CLOCKEVENTS # define TICK_DO_TIMER_NONE -1 @@ -20,7 +25,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); -extern void tick_shutdown(unsigned int cpu); +extern void tick_offline_cpu(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, @@ -56,7 +62,6 @@ extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); -extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* Broadcasting support */ # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST @@ -153,8 +158,16 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; extern void timers_update_nohz(void); +extern u64 get_jiffies_update(unsigned long *basej); # ifdef CONFIG_SMP extern struct static_key_false timers_migration_enabled; +extern void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, + struct timer_events *tevt, + unsigned int cpu); +extern void timer_lock_remote_bases(unsigned int cpu); +extern void timer_unlock_remote_bases(unsigned int cpu); +extern bool timer_base_is_idle(void); +extern void timer_expire_remote(unsigned int cpu); # endif #else /* CONFIG_NO_HZ_COMMON */ static inline void timers_update_nohz(void) { } @@ -164,6 +177,7 @@ static inline void timers_update_nohz(void) { } DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle); void timer_clear_idle(void); #define CLOCK_SET_WALL \ @@ -197,3 +211,5 @@ void hrtimers_resume_local(void); #else #define JIFFIES_SHIFT 8 #endif + +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprograming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device 
triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for onshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b0e3c9205946..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -4,10 +4,11 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * - * No idle tick implementation for low and high resolution timers + * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ +#include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> @@ -43,9 +44,8 @@ struct tick_sched *tick_get_tick_sched(int cpu) return &per_cpu(tick_cpu_sched, cpu); } -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Write access must hold + * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ @@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta, nextp; /* - * 64bit can do a quick check without holding jiffies lock and + * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * - * 32bit cannot do that because the store of tick_next_period - * consists of two 32bit stores and the first store could move it - * to a random point in the future. + * 32-bit cannot do that because the store of 'tick_next_period' + * consists of two 32-bit stores, and the first store could be + * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) @@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now) unsigned int seq; /* - * Avoid contention on jiffies_lock and protect the quick + * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. 
*/ do { @@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* - * Reevaluate with the lock held. Another CPU might have done the + * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { @@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now) TICK_NSEC); } - /* Advance jiffies to complete the jiffies_seq protected job */ + /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; - /* - * Keep the tick_next_period variable up to date. - */ + /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick - * check above and ensures that the update to jiffies_64 is - * not reordered vs. the store to tick_next_period, neither + * check above, and ensures that the update to 'jiffies_64' is + * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* - * A plain store is good enough on 32bit as the quick check + * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; @@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Release the sequence count. calc_global_load() below is not - * protected by it, but jiffies_lock needs to be held to prevent + * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); @@ -160,75 +158,132 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); - /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + + /* Have we started the jiffies update yet ? */ + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; + write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); + return period; } +static inline int tick_sched_flag_test(struct tick_sched *ts, + unsigned long flag) +{ + return !!(ts->flags & flag); +} + +static inline void tick_sched_flag_set(struct tick_sched *ts, + unsigned long flag) +{ + lockdep_assert_irqs_disabled(); + ts->flags |= flag; +} + +static inline void tick_sched_flag_clear(struct tick_sched *ts, + unsigned long flag) +{ + lockdep_assert_irqs_disabled(); + ts->flags &= ~flag; +} + +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); + int tick_cpu, cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. 
We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by - * jiffies_lock. + * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the - * tick_do_timer_cpu never relinquishes. + * 'tick_do_timer_cpu' CPU never relinquishes. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { + tick_cpu = READ_ONCE(tick_do_timer_cpu); + + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif - tick_do_timer_cpu = cpu; + WRITE_ONCE(tick_do_timer_cpu, cpu); + tick_cpu = cpu; } -#endif - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) + /* Check if jiffies need an update */ + if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* - * If jiffies update stalled for too long (timekeeper in stop_machine() + * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } - if (ts->inidle) + if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while + * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. + * when we go busy again does not account too many ticks. */ - if (ts->tick_stopped) { + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && + tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; @@ -239,11 +294,44 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) */ ts->next_tick = 0; } -#endif + update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } -#endif + +/* + * We rearm the timer until we get disabled by the idle code. + * Called with interrupts disabled. + */ +static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) +{ + struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + tick_sched_do_timer(ts, now); + + /* + * Do not call when we are not in IRQ context and have + * no valid 'regs' pointer + */ + if (regs) + tick_sched_handle(ts, regs); + else + ts->next_tick = 0; + + /* + * In dynticks mode, tick reprogram is deferred: + * - to the idle task if in dynticks-idle + * - to IRQ exit if in full-dynticks. 
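+	 *
+	 * Roughly (an illustrative sketch only; the exact call chains depend
+	 * on the configuration):
+	 *
+	 *   idle task:	tick_nohz_idle_stop_tick()
+	 *			  tick_nohz_stop_tick()
+	 *			    hrtimer_start() or tick_program_event()
+	 *
+	 *   IRQ exit:	tick_nohz_irq_exit()
+	 *			  tick_nohz_full_update_tick()
+	 *			    stop or restart the tick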
+ */ + if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, TICK_NSEC); + + return HRTIMER_RESTART; +} #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; @@ -281,6 +369,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU_EXP) { + trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); + return true; + } + return false; } @@ -346,7 +439,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk) /* * If the task is not running, run_posix_cpu_timers() - * has nothing to elapse, IPI can then be spared. + * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq @@ -355,6 +448,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? + * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; @@ -409,7 +508,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, /* * Set a global tick dependency. Used by perf events that rely on freq and - * by unstable clock. + * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { @@ -423,7 +522,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit) /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to - * manage events throttling. + * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { @@ -439,7 +538,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { - /* Remote irq work not NMI-safe */ + /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } @@ -457,7 +556,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) @@ -512,7 +611,7 @@ void __tick_nohz_task_switch(void) ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->tick_stopped) { + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); @@ -527,16 +626,21 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) tick_nohz_full_running = true; } -static int tick_nohz_cpu_down(unsigned int cpu) +bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* - * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return -EBUSY; - return 0; + if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) + return false; + return true; +} + +static int tick_nohz_cpu_down(unsigned int cpu) +{ + return tick_nohz_cpu_hotpluggable(cpu) ? 
0 : -EBUSY; } void __init tick_nohz_init(void) @@ -547,12 +651,12 @@ void __init tick_nohz_init(void) return; /* - * Full dynticks uses irq work to drive the tick rescheduling on safe - * locking contexts. But then we need irq work to raise its own - * interrupts to avoid circular dependency on the tick + * Full dynticks uses IRQ work to drive the tick rescheduling on safe + * locking contexts. But then we need IRQ work to raise its own + * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { - pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; @@ -579,7 +683,7 @@ void __init tick_nohz_init(void) pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } -#endif +#endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality @@ -604,25 +708,26 @@ bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - return ts->tick_stopped; + return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); - return ts->tick_stopped; + return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the - * CPU, which has the update task assigned is in a long sleep. + * CPU, which has the update task assigned, is in a long sleep. 
*/ static void tick_nohz_update_jiffies(ktime_t now) { @@ -637,43 +742,67 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog_sched(); } -/* - * Updates the per-CPU time idle statistics counters - */ -static void -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - if (nr_iowait_cpu(cpu) > 0) - ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); - else - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_entrytime = now; - } + if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) + return; - if (last_update_time) - *last_update_time = ktime_to_us(now); + delta = ktime_sub(now, ts->idle_entrytime); -} + write_seqcount_begin(&ts->idle_sleeptime_seq); + if (nr_iowait_cpu(smp_processor_id()) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); -static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) -{ - update_ts_time_stats(smp_processor_id(), ts, now, NULL); - ts->idle_active = 0; + ts->idle_entrytime = now; + tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); + write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { + write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); - ts->idle_active = 1; + tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); + write_seqcount_end(&ts->idle_sleeptime_seq); + sched_clock_idle_sleep_event(); } +static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, + bool compute_delta, u64 *last_update_time) +{ + ktime_t now, idle; + unsigned int seq; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) + *last_update_time = ktime_to_us(now); + + do { + seq = read_seqcount_begin(&ts->idle_sleeptime_seq); + + if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(*sleeptime, delta); + } else { + idle = *sleeptime; + } + } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); + + return ktime_to_us(idle); + +} + /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query @@ -681,37 +810,22 @@ static void tick_nohz_start_idle(struct tick_sched *ts) * counters if NULL. * * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note that this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. 
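+ *
+ * A minimal usage sketch (illustrative only, not taken from an in-tree
+ * caller):
+ *
+ *	u64 wall_us;
+ *	u64 idle_us = get_cpu_idle_time_us(cpu, &wall_us);
+ *
+ * where 'wall_us' receives the timestamp of the snapshot and 'idle_us'
+ * the accumulated idle time of the CPU, both in microseconds.
+ *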
+ * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, idle; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - idle = ts->idle_sleeptime; - } else { - if (ts->idle_active && !nr_iowait_cpu(cpu)) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - idle = ktime_add(ts->idle_sleeptime, delta); - } else { - idle = ts->idle_sleeptime; - } - } - - return ktime_to_us(idle); + return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, + !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); @@ -722,36 +836,22 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * counters if NULL. * * Return the cumulative iowait time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, iowait; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - iowait = ts->iowait_sleeptime; - } else { - if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - iowait = ktime_add(ts->iowait_sleeptime, delta); - } else { - iowait = ts->iowait_sleeptime; - } - } - return ktime_to_us(iowait); + return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, + nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); @@ -763,7 +863,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { @@ -771,7 +871,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /* - * Reset to make sure next tick stop doesn't get fooled by past + * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. 
*/ ts->next_tick = 0; @@ -779,32 +879,55 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); + return local_timers_pending() & BIT(TIMER_SOFTIRQ); } -static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) +/* + * Read jiffies and the time when jiffies were updated last + */ +u64 get_jiffies_update(unsigned long *basej) { - u64 basemono, next_tick, delta, expires; unsigned long basejiff; unsigned int seq; + u64 basemono; - /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); + *basej = basejiff; + return basemono; +} + +/** + * tick_nohz_next_event() - return the clock monotonic based next event + * @ts: pointer to tick_sched struct + * @cpu: CPU number + * + * Return: + * *%0 - When the next event is a maximum of TICK_NSEC in the future + * and the tick is not stopped yet + * *%next_event - Next event based on clock monotonic + */ +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) +{ + u64 basemono, next_tick, delta, expires; + unsigned long basejiff; + int tick_cpu; + + basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. - * Aside of that check whether the local timer softirq is - * pending. If so its a bad idea to call get_next_timer_interrupt() + * Aside of that, check whether the local timer softirq is + * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a - * minimal delta which brings us back to this place + * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || @@ -822,6 +945,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) ts->next_timer = next_tick; } + /* Make sure next_tick is never before basemono! */ + if (WARN_ON_ONCE(basemono > next_tick)) + next_tick = basemono; + /* * If the tick is due in the next period, keep it ticking or * force prod the timer. @@ -829,15 +956,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* - * Tell the timer code that the base is not idle, i.e. undo - * the effect of get_next_timer_interrupt(): - */ - timer_clear_idle(); - /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ - if (!ts->tick_stopped) { + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } @@ -845,12 +967,13 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* * If this CPU is the one which had the do_timer() duty last, we limit - * the sleep time to the timekeeping max_deferment value. + * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. 
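+	 *
+	 * In other words (this merely spells out the check below):
+	 *
+	 *   tick_cpu == cpu                       -> cap sleep at max_deferment
+	 *   tick_cpu == NONE && this CPU was last -> cap sleep at max_deferment
+	 *   otherwise                             -> sleep up to KTIME_MAX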
*/ delta = timekeeping_max_deferment(); - if (cpu != tick_do_timer_cpu && - (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu != cpu && + (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ @@ -868,76 +991,103 @@ out: static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; - u64 expires = ts->timer_expires; - ktime_t tick = expires; + bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); + int tick_cpu; + u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* + * Now the tick should be stopped definitely - so the timer base needs + * to be marked idle as well to not miss a newly queued timer. + */ + expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); + if (expires > ts->timer_expires) { + /* + * This path could only happen when the first timer was removed + * between calculating the possible sleep length and now (when + * high resolution mode is not active, timer could also be a + * hrtimer). + * + * We have to stick to the original calculated expiry value to + * not stop the tick for too long with a shallow C-state (which + * was programmed by cpuidle because of an early next expiration + * value). + */ + expires = ts->timer_expires; + } + + /* If the timer base is not idle, retain the not yet stopped tick. */ + if (!timer_idle) + return; + + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we - * don't drop this here the jiffies might be stale and - * do_timer() never invoked. Keep track of the fact that it + * don't drop this here, the jiffies might be stale and + * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. 
*/ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - ts->do_timer_last = 0; + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu == cpu) { + WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); + tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); + } else if (tick_cpu != TICK_DO_TIMER_NONE) { + tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Skip reprogram of event if it's not changed */ + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; - WARN_ON_ONCE(1); - printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", - basemono, ts->next_tick, dev->next_event, - hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " + "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, + dev->next_event, hrtimer_active(&ts->sched_timer), + hrtimer_get_expires(&ts->sched_timer)); } /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. + * tick_nohz_stop_tick() can be called several times before + * tick_nohz_restart_sched_tick() is called. This happens when + * interrupts arrive which do not cause a reschedule. In the first + * call we save the current tick time, so we can restart the + * scheduler tick in tick_nohz_restart_sched_tick(). */ - if (!ts->tick_stopped) { + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; + tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } - ts->next_tick = tick; + ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. 
*/ if (unlikely(expires == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - hrtimer_set_expires(&ts->sched_timer, tick); - tick_program_event(tick, 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } } @@ -947,7 +1097,7 @@ static void tick_nohz_retain_tick(struct tick_sched *ts) } #ifdef CONFIG_NO_HZ_FULL -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); @@ -969,10 +1119,9 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; + + /* Cancel the scheduled timer and restore the tick: */ + tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } @@ -983,8 +1132,8 @@ static void __tick_nohz_full_update_tick(struct tick_sched *ts, int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, cpu); - else if (ts->tick_stopped) + tick_nohz_full_stop_tick(ts, cpu); + else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } @@ -994,7 +1143,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!tick_nohz_full_cpu(smp_processor_id())) return; - if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); @@ -1003,11 +1152,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't - * reach here due to the need_resched() early check in can_stop_idle_tick(). + * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, - * triggering the below since wakep_softirqd() is ignored. + * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) @@ -1025,41 +1174,24 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit < 10) - return false; - - /* On RT, softirqs handling may be waiting on some lock */ - if (!local_bh_blocked()) + /* On RT, softirq handling may be waiting on some lock */ + if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { - /* - * If this CPU is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the CPU which runs the tick timer next. 
If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - /* - * Make sure the CPU doesn't get fooled by obsolete tick - * deadline if it comes back online later. - */ - ts->next_tick = 0; - return false; - } + WARN_ON_ONCE(cpu_is_offline(cpu)); - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) @@ -1069,25 +1201,33 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (tick_nohz_full_enabled()) { + int tick_cpu = READ_ONCE(tick_do_timer_cpu); + /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ - if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } -static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) +/** + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + */ +void tick_nohz_idle_stop_tick(void) { - ktime_t expires; + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); + ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the @@ -1103,14 +1243,14 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) ts->idle_calls++; if (expires > 0LL) { - int was_stopped = ts->tick_stopped; + int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; - if (!was_stopped && ts->tick_stopped) { + if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } @@ -1119,24 +1259,9 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) } } -/** - * tick_nohz_idle_stop_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - */ -void tick_nohz_idle_stop_tick(void) -{ - __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); -} - void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); - /* - * Undo the effect of get_next_timer_interrupt() called from - * tick_nohz_next_event(). - */ - timer_clear_idle(); } /** @@ -1156,25 +1281,36 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(ts->timer_expires_base); - ts->inidle = 1; + tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** - * tick_nohz_irq_exit - update next tick event from interrupt exit + * tick_nohz_irq_exit - Notify the tick about IRQ exit + * + * A timer may have been added/modified/deleted either by the current IRQ, + * or by another place using this IRQ as a notification. This IRQ may have + * also updated the RCU callback list. These events may require a + * re-evaluation of the next tick. Depending on the context: * - * When an interrupt fires while we are idle and it doesn't cause - * a reschedule, it may still add, modify or delete a timer, enqueue - * an RCU callback, etc... - * So we need to re-calculate and reprogram the next tick event. + * 1) If the CPU is idle and no resched is pending, just proceed with idle + * time accounting. 
The next tick will be re-evaluated on the next idle + * loop iteration. + * + * 2) If the CPU is nohz_full: + * + * 2.1) If there is any tick dependency, restart the tick if stopped. + * + * 2.2) If there is no tick dependency, (re-)evaluate the next tick and + * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle) + if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); @@ -1182,6 +1318,8 @@ void tick_nohz_irq_exit(void) /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + * + * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { @@ -1196,10 +1334,12 @@ bool tick_nohz_idle_got_tick(void) /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer - * or the tick, whatever that expires first. Note that, if the tick has been + * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled + * + * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { @@ -1215,6 +1355,8 @@ ktime_t tick_nohz_get_next_hrtimer(void) * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. + * + * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { @@ -1228,7 +1370,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) ktime_t now = ts->idle_entrytime; ktime_t next_event; - WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); @@ -1240,7 +1382,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) return *delta_next; /* - * If the next highres timer to expire is earlier than next_event, the + * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, @@ -1252,8 +1394,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. + * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { @@ -1262,18 +1407,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -/** - * tick_nohz_get_idle_calls - return the current idle calls counter value - * - * Called from the schedutil frequency scaling governor in scheduler context. - */ -unsigned long tick_nohz_get_idle_calls(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - return ts->idle_calls; -} - static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { @@ -1284,9 +1417,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts, if (vtime_accounting_enabled_this_cpu()) return; /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! + * We stopped the tick in idle. update_process_times() would miss the + * time we slept, as it does only a 1 tick accounting. 
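+	 * For example, after sleeping for ten ticks only a single tick worth
+	 * of time would be accounted on wakeup; the 'jiffies - ts->idle_jiffies'
+	 * delta computed below covers the ticks spent asleep.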
+ * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* @@ -1300,7 +1433,7 @@ void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->tick_stopped) { + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); @@ -1318,11 +1451,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_idle_exit - restart the idle tick from the idle task + * tick_nohz_idle_exit - Update the tick upon idle task exit + * + * When the idle task exits, update the tick depending on the + * following situations: + * + * 1) If the CPU is not in nohz_full mode (most cases), then + * restart the tick. + * + * 2) If the CPU is in nohz_full mode (corner case): + * 2.1) If the tick can be kept stopped (no tick dependencies) + * then re-evaluate the next tick and try to keep it stopped + * as long as possible. + * 2.2) If the tick has dependencies, restart the tick. * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { @@ -1332,12 +1474,12 @@ void tick_nohz_idle_exit(void) local_irq_disable(); - WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); - ts->inidle = 0; - idle_active = ts->idle_active; - tick_stopped = ts->tick_stopped; + tick_sched_flag_clear(ts, TS_FLAG_INIDLE); + idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); + tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); @@ -1352,69 +1494,47 @@ void tick_nohz_idle_exit(void) } /* - * The nohz low res interrupt handler + * In low-resolution mode, the tick handler must be implemented directly + * at the clockevent level. hrtimer can't be used instead, because its + * infrastructure actually relies on the tick itself as a backend in + * low-resolution mode (see hrtimer_run_queues()). */ -static void tick_nohz_handler(struct clock_event_device *dev) +static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - struct pt_regs *regs = get_irq_regs(); - ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; - tick_sched_do_timer(ts, now); - tick_sched_handle(ts, regs); - - if (unlikely(ts->tick_stopped)) { - /* - * The clockevent device is not reprogrammed, so change the - * clock event device to ONESHOT_STOPPED to avoid spurious - * interrupts on devices which might not be truly one shot. 
- */ - tick_program_event(KTIME_MAX, 1); - return; - } - - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } -static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; - ts->nohz_mode = mode; + tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** - * tick_nohz_switch_to_nohz - switch to nohz mode + * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - ktime_t next; - if (!tick_nohz_enabled) return; - if (tick_switch_to_oneshot(tick_nohz_handler)) + if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* - * Recycle the hrtimer in ts, so we can share the - * hrtimer_forward with the highres code. + * Recycle the hrtimer in 'ts', so we can share the + * highres code. */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - /* Get the next period */ - next = tick_init_jiffy_update(); - - hrtimer_set_expires(&ts->sched_timer, next); - hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); - tick_nohz_activate(ts, NOHZ_MODE_LOWRES); + tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) @@ -1422,19 +1542,19 @@ static inline void tick_nohz_irq_enter(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; - if (!ts->idle_active && !ts->tick_stopped) + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); - if (ts->idle_active) + if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* - * If all CPUs are idle. We may need to update a stale jiffies value. + * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ - if (ts->tick_stopped) + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } @@ -1442,12 +1562,12 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } -static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } +static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * Called from irq_enter to notify about the possible interruption of idle() + * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { @@ -1455,41 +1575,6 @@ void tick_irq_enter(void) tick_nohz_irq_enter(); } -/* - * High resolution timer specific code - */ -#ifdef CONFIG_HIGH_RES_TIMERS -/* - * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled. 
- */ -static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) -{ - struct tick_sched *ts = - container_of(timer, struct tick_sched, sched_timer); - struct pt_regs *regs = get_irq_regs(); - ktime_t now = ktime_get(); - - tick_sched_do_timer(ts, now); - - /* - * Do not call, when we are not in irq context and have - * no valid regs pointer - */ - if (regs) - tick_sched_handle(ts, regs); - else - ts->next_tick = 0; - - /* No need to reprogram if we are in idle or full dynticks mode */ - if (unlikely(ts->tick_stopped)) - return HRTIMER_NORESTART; - - hrtimer_forward(timer, now, TICK_NSEC); - - return HRTIMER_RESTART; -} - static int sched_skew_tick; static int __init skew_tick(char *str) @@ -1502,22 +1587,22 @@ early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer + * @hrtimer: whether to use the hrtimer or not */ -void tick_setup_sched_timer(void) +void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - ktime_t now = ktime_get(); - /* - * Emulate tick processing via per-CPU hrtimers: - */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ts->sched_timer.function = tick_sched_timer; + /* Emulate tick processing via per-CPU hrtimers: */ + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) + tick_sched_flag_set(ts, TS_FLAG_HIGHRES); /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - /* Offset the tick to avert jiffies_lock contention. */ + /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); @@ -1525,25 +1610,38 @@ void tick_setup_sched_timer(void) hrtimer_add_expires_ns(&ts->sched_timer, offset); } - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); - tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + tick_nohz_activate(ts); } -#endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS -void tick_cancel_sched_timer(int cpu) +/* + * Shut down the tick and make sure the CPU won't try to retake the timekeeping + * duty before disabling IRQs in idle for the last time. + */ +void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t idle_sleeptime, iowait_sleeptime; + unsigned long idle_calls, idle_sleeps; -# ifdef CONFIG_HIGH_RES_TIMERS - if (ts->sched_timer.base) + /* This must happen before hrtimers are migrated! */ + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); -# endif + idle_sleeptime = ts->idle_sleeptime; + iowait_sleeptime = ts->iowait_sleeptime; + idle_calls = ts->idle_calls; + idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); + ts->idle_sleeptime = idle_sleeptime; + ts->iowait_sleeptime = iowait_sleeptime; + ts->idle_calls = idle_calls; + ts->idle_sleeps = idle_sleeps; } -#endif /* * Async notification about clocksource changes @@ -1567,10 +1665,10 @@ void tick_oneshot_notify(void) } /* - * Check, if a change happened, which makes oneshot possible. 
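The tick_setup_sched_timer() hunk below spreads the per-CPU ticks across a fraction of half a tick when skew_tick is enabled, so the CPUs do not all contend on jiffies_lock at the same instant. A worked, standalone version of that arithmetic (HZ=1000 and four CPUs are assumptions made for the example only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t tick_nsec = 1000000;     /* HZ=1000 -> 1 ms tick (assumption) */
        const unsigned int ncpus = 4;
        uint64_t step = (tick_nsec >> 1) / ncpus;       /* half a tick / nr CPUs */

        for (unsigned int cpu = 0; cpu < ncpus; cpu++)
                printf("cpu%u tick offset: %llu ns\n", cpu,
                       (unsigned long long)(step * cpu));
        /* cpu0: 0, cpu1: 125000, cpu2: 250000, cpu3: 375000 */
        return 0;
}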
+ * Check if a change happened, which makes oneshot possible. * - * Called cyclic from the hrtimer softirq (driven by the timer - * softirq) allow_nohz signals, that we can switch into low-res nohz + * Called cyclically from the hrtimer softirq (driven by the timer + * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ @@ -1581,7 +1679,7 @@ int tick_check_oneshot_change(int allow_nohz) if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; - if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 504649513399..b4a7822f495d 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -14,82 +14,101 @@ struct tick_device { enum tick_device_mode mode; }; -enum tick_nohz_mode { - NOHZ_MODE_INACTIVE, - NOHZ_MODE_LOWRES, - NOHZ_MODE_HIGHRES, -}; +/* The CPU is in the tick idle mode */ +#define TS_FLAG_INIDLE BIT(0) +/* The idle tick has been stopped */ +#define TS_FLAG_STOPPED BIT(1) +/* + * Indicator that the CPU is actively in the tick idle mode; + * it is reset during irq handling phases. + */ +#define TS_FLAG_IDLE_ACTIVE BIT(2) +/* CPU was the last one doing do_timer before going idle */ +#define TS_FLAG_DO_TIMER_LAST BIT(3) +/* NO_HZ is enabled */ +#define TS_FLAG_NOHZ BIT(4) +/* High resolution tick mode */ +#define TS_FLAG_HIGHRES BIT(5) /** * struct tick_sched - sched tick emulation and no idle tick control/stats + * + * @flags: State flags gathering the TS_FLAG_* features + * @got_idle_tick: Tick timer function has run with @inidle set + * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @last_tick_jiffies: Value of jiffies seen on last tick * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode - * @check_clocks: Notification mechanism about clocksource changes - * @nohz_mode: Mode - one state of tick_nohz_mode - * @inidle: Indicator that the CPU is in the tick idle mode - * @tick_stopped: Indicator that the idle tick has been stopped - * @idle_active: Indicator that the CPU is actively in the tick idle mode; - * it is reset during irq handling phases. - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. 
* @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_waketime: Time when the idle was interrupted + * @idle_sleeptime_seq: sequence counter for data consistency + * @idle_entrytime: Time when the idle call was entered + * @last_jiffies: Base jiffies snapshot when next event was last computed + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @idle_expires: Next tick in idle, for debugging purpose only * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped - * @idle_entrytime: Time when the idle call was entered - * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) - * @timer_expires_base: Base time clock monotonic for @timer_expires - * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick - * @last_tick_jiffies: Value of jiffies seen on last tick - * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @check_clocks: Notification mechanism about clocksource changes */ struct tick_sched { - struct hrtimer sched_timer; - unsigned long check_clocks; - enum tick_nohz_mode nohz_mode; + /* Common flags */ + unsigned long flags; - unsigned int inidle : 1; - unsigned int tick_stopped : 1; - unsigned int idle_active : 1; - unsigned int do_timer_last : 1; - unsigned int got_idle_tick : 1; + /* Tick handling: jiffies stall check */ + unsigned int stalled_jiffies; + unsigned long last_tick_jiffies; + /* Tick handling */ + struct hrtimer sched_timer; ktime_t last_tick; ktime_t next_tick; unsigned long idle_jiffies; - unsigned long idle_calls; - unsigned long idle_sleeps; - ktime_t idle_entrytime; ktime_t idle_waketime; - ktime_t idle_exittime; - ktime_t idle_sleeptime; - ktime_t iowait_sleeptime; + unsigned int got_idle_tick; + + /* Idle entry */ + seqcount_t idle_sleeptime_seq; + ktime_t idle_entrytime; + + /* Tick stop */ unsigned long last_jiffies; - u64 timer_expires; u64 timer_expires_base; + u64 timer_expires; u64 next_timer; ktime_t idle_expires; + unsigned long idle_calls; + unsigned long idle_sleeps; + + /* Idle exit */ + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + + /* Full dynticks handling */ atomic_t tick_dep_mask; - unsigned long last_tick_jiffies; - unsigned int stalled_jiffies; + + /* Clocksource changes */ + unsigned long check_clocks; }; extern struct tick_sched *tick_get_tick_sched(int cpu); -extern void tick_setup_sched_timer(void); -#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS -extern void tick_cancel_sched_timer(int cpu); +extern void tick_setup_sched_timer(bool hrtimer); +#if defined CONFIG_TICK_ONESHOT +extern void tick_sched_timer_dying(int cpu); #else -static inline void tick_cancel_sched_timer(int cpu) { } +static inline void tick_sched_timer_dying(int cpu) { } #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST diff --git a/kernel/time/time.c b/kernel/time/time.c index f4198af60fee..0ba8e3c50d62 100644 --- a/kernel/time/time.c +++ 
b/kernel/time/time.c @@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) } #endif -/* - * Convert jiffies to milliseconds and back. +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value * * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: + * two most common HZ cases. + * + * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { @@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ unsigned int jiffies_to_usecs(const unsigned long j) { /* @@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/* +/** * mktime64 - Converts date to seconds. + * @year0: year to convert + * @mon0: month to convert + * @day: day to convert + * @hour: hour to convert + * @min: minute to convert + * @sec: second to convert + * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. @@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs); * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. + * + * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval); * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC + * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) @@ -501,7 +518,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * - * Returns the timespec64 representation of the nsec parameter. + * Return: the timespec64 representation of the nsec parameter. */ struct timespec64 ns_to_timespec64(s64 nsec) { @@ -539,15 +556,17 @@ EXPORT_SYMBOL(ns_to_timespec64); * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. - * for the details see __msecs_to_jiffies() + * for the details see _msecs_to_jiffies() * - * __msecs_to_jiffies() checks for the passed in value being a constant + * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. 
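The new mktime64() kerneldoc above documents a plain Gregorian-to-epoch conversion, including the 1980-12-31 23:59:59 example. The kernel's function body is not shown in this hunk; the standalone sketch below uses a conventional civil-date algorithm only to reproduce the documented example, so the helper names here are illustrative and not the kernel's.

#include <stdint.h>
#include <stdio.h>

static int64_t days_from_civil(int64_t y, int m, int d)
{
        y -= m <= 2;
        const int64_t era = (y >= 0 ? y : y - 399) / 400;
        const int yoe = (int)(y - era * 400);                           /* [0, 399] */
        const int doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1; /* [0, 365] */
        const int doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;          /* [0, 146096] */

        return era * 146097 + doe - 719468;     /* days since 1970-01-01 */
}

static int64_t civil_to_epoch(int y, int mon, int day, int h, int min, int s)
{
        return days_from_civil(y, mon, day) * 86400 + h * 3600 + min * 60 + s;
}

int main(void)
{
        /* The example from the kerneldoc: year=1980, mon=12, day=31, 23:59:59 */
        printf("%lld\n", (long long)civil_to_epoch(1980, 12, 31, 23, 59, 59));
        /* prints 347155199 seconds since the epoch */
        return 0;
}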
* The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h + * + * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { @@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m) } EXPORT_SYMBOL(__msecs_to_jiffies); +/** + * __usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in microseconds + * + * Return: jiffies value + */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) @@ -568,7 +593,10 @@ unsigned long __usecs_to_jiffies(const unsigned int u) } EXPORT_SYMBOL(__usecs_to_jiffies); -/* +/** + * timespec64_to_jiffies - convert a timespec64 value to jiffies + * @value: pointer to &struct timespec64 + * * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: @@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. + * + * Return: jiffies value */ - unsigned long timespec64_to_jiffies(const struct timespec64 *value) { @@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value) } EXPORT_SYMBOL(timespec64_to_jiffies); +/** + * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 + * @jiffies: jiffies value + * @value: pointer to &struct timespec64 + */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { @@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ + +/** + * jiffies_to_clock_t - Convert jiffies to clock_t + * @x: jiffies value + * + * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) + */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 @@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x) } EXPORT_SYMBOL(jiffies_to_clock_t); +/** + * clock_t_to_jiffies - Convert clock_t to jiffies + * @x: clock_t value + * + * Return: clock_t value converted to jiffies + */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 @@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x) } EXPORT_SYMBOL(clock_t_to_jiffies); +/** + * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t + * @x: jiffies_64 value + * + * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) + */ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 @@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x) } EXPORT_SYMBOL(jiffies_64_to_clock_t); +/** + * nsec_to_clock_t - Convert nsec value to clock_t + * @x: nsec value + * + * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) + */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 @@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds + * @j: jiffies64 value + * + * Return: nanoseconds value + */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) @@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +/** + * jiffies64_to_msecs - Convert jiffies64 to milliseconds + * @j: jiffies64 value + * + * Return: milliseconds value + */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) @@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs); * note: *
NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { @@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64); * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { @@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec64 values and do a safety check for overflow. +/** + * timespec64_add_safe - Add two timespec64 values and do a safety check + * for overflow. + * @lhs: first (left) timespec64 to add + * @rhs: second (right) timespec64 to add + * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. + * + * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) @@ -777,7 +858,17 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } +EXPORT_SYMBOL_GPL(timespec64_add_safe); +/** + * get_timespec64 - get user's time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's time value as &struct __kernel_timespec + * + * Handles compat or 32-bit modes. + * + * Return: 0 on success or negative errno on error + */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { @@ -801,6 +892,14 @@ int get_timespec64(struct timespec64 *ts, } EXPORT_SYMBOL_GPL(get_timespec64); +/** + * put_timespec64 - convert timespec64 value to __kernel_timespec format and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct __kernel_timespec + * + * Return: 0 on success or negative errno on error + */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { @@ -839,6 +938,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64, return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } +/** + * get_old_timespec32 - get user's old-format time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's old-format time value (&struct old_timespec32) + * + * Handles X86_X32_ABI compatibility conversion. + * + * Return: 0 on success or negative errno on error + */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) @@ -848,6 +956,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) } EXPORT_SYMBOL_GPL(get_old_timespec32); +/** + * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct old_timespec32 + * + * Handles X86_X32_ABI compatibility conversion. 
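timespec64_add_safe() above is documented to add two valid (non-negative, normalized) timespec64 values and guard against overflow. The userspace sketch below illustrates the add, normalize and saturate idea with simplified types; the exact saturation value chosen by the kernel may differ from the one used here.

#include <stdint.h>

#define NSEC_PER_SEC    1000000000L
#define TIME64_MAX      INT64_MAX

struct ts64 { int64_t tv_sec; long tv_nsec; };

static struct ts64 ts64_add_safe(struct ts64 lhs, struct ts64 rhs)
{
        struct ts64 res;
        long nsec = lhs.tv_nsec + rhs.tv_nsec;
        int carry = 0;

        if (nsec >= NSEC_PER_SEC) {             /* normalize the nanosecond carry */
                nsec -= NSEC_PER_SEC;
                carry = 1;
        }

        if (lhs.tv_sec > TIME64_MAX - rhs.tv_sec - carry) {
                /* Saturate instead of wrapping on seconds overflow */
                res.tv_sec = TIME64_MAX;
                res.tv_nsec = NSEC_PER_SEC - 1;
        } else {
                res.tv_sec = lhs.tv_sec + rhs.tv_sec + carry;
                res.tv_nsec = nsec;
        }
        return res;
}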
+ * + * Return: 0 on success or negative errno on error + */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) @@ -857,6 +975,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) } EXPORT_SYMBOL_GPL(put_old_timespec32); +/** + * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space + * @it: destination &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: 0 on success or negative errno on error + */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { @@ -872,6 +997,14 @@ int get_itimerspec64(struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(get_itimerspec64); +/** + * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format + * and copy the latter to userspace + * @it: input &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: 0 on success or negative errno on error + */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { @@ -887,6 +1020,13 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); +/** + * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space + * @its: destination &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: 0 on success or negative errno on error + */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { @@ -898,6 +1038,14 @@ int get_old_itimerspec32(struct itimerspec64 *its, } EXPORT_SYMBOL_GPL(get_old_itimerspec32); +/** + * put_old_itimerspec32 - convert &struct itimerspec64 to &struct + * old_itimerspec32 and copy the latter to userspace + * @its: input &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: 0 on success or negative errno on error + */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 831e8e779ace..2889763165e5 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test) days = div_s64(secs, 86400); - #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \ + #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \ year, month, mdday, yday, days KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG); @@ -86,7 +86,7 @@ static void time64_to_tm_test_date_range(struct kunit *test) } static struct kunit_case time_test_cases[] = { - KUNIT_CASE(time64_to_tm_test_date_range), + KUNIT_CASE_SLOW(time64_to_tm_test_date_range), {} }; @@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = { }; kunit_test_suite(time_test_suite); +MODULE_DESCRIPTION("time unit test suite"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index e6285288d765..3d2a354cfe1c 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -6,7 +6,7 @@ #include <linux/timecounter.h> void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, + struct cyclecounter *cc, u64 start_tstamp) { tc->cc = cc; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5579ead449f2..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/kobject.h> #include 
<linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> @@ -25,13 +26,16 @@ #include <linux/audit.h> #include <linux/random.h> +#include <vdso/auxclock.h> + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) -#define TK_MIRROR (1 << 1) -#define TK_CLOCK_WAS_SET (1 << 2) +#define TK_CLOCK_WAS_SET (1 << 1) + +#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ @@ -41,20 +45,49 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -DEFINE_RAW_SPINLOCK(timekeeper_lock); - /* * The most important data for readout fits into a single 64 byte * cache line. */ -static struct { +struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), -}; + struct timekeeper shadow_timekeeper; + raw_spinlock_t lock; +} ____cacheline_aligned; + +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif -static struct timekeeper shadow_timekeeper; +static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -114,6 +147,46 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + +unsigned long timekeeper_lock_irqsave(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&tk_core.lock, flags); + return flags; +} + +void timekeeper_unlock_irqrestore(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&tk_core.lock, flags); +} + +/* + * Multigrain timestamps require tracking the latest fine-grained timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. If + * such an event occurs, a timestamp can appear to be earlier than a previous one. 
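The mg_floor comment above describes the multigrain rule: a coarse-grained timestamp must never be earlier than the last fine-grained timestamp that was handed out. The kernel accessors enforcing this are not part of this hunk; the userspace sketch below only illustrates the floor idea with POSIX clocks and C11 atomics, tracking realtime nanoseconds directly, which is an assumption and not the kernel's representation.

#define _DEFAULT_SOURCE         /* for the Linux-specific coarse clock id on glibc */
#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

static _Atomic int64_t mg_floor_ns;     /* latest fine-grained stamp handed out */

static int64_t ts_to_ns(struct timespec ts)
{
        return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* Fine-grained read: also raises the floor */
static int64_t get_fine_ns(void)
{
        struct timespec ts;
        int64_t now, old;

        clock_gettime(CLOCK_REALTIME, &ts);
        now = ts_to_ns(ts);
        old = atomic_load(&mg_floor_ns);
        while (old < now &&
               !atomic_compare_exchange_weak(&mg_floor_ns, &old, now))
                ;       /* another reader raced us; retry with their value */
        return now;
}

/* Coarse read: clamped so it never reports a time before the floor */
static int64_t get_coarse_ns(void)
{
        struct timespec ts;
        int64_t now, floor;

        clock_gettime(CLOCK_REALTIME_COARSE, &ts);
        now = ts_to_ns(ts);
        floor = atomic_load(&mg_floor_ns);
        return now < floor ? floor : now;
}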
+ */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -135,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. + */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -146,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -161,13 +259,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec64_to_ktime(tmp); - tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); + WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. @@ -184,7 +284,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. - * This isn't necessary to use when holding the timekeeper_lock or doing + * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). 
*/ @@ -195,106 +295,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) -{ - struct timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. - */ - if (unlikely((~delta & mask) < (mask >> 3))) { - tk->underflow_seen = 1; - delta = 0; - } - - /* Cap delta value to the max_cycles values to avoid mult overflows */ - if (unlikely(delta > max)) { - tk->overflow_seen = 1; - delta = tkr->clock->max_cycles; - } - - return delta; -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) -{ - u64 cycle_now, delta; - - /* read clocksource */ - cycle_now = tk_clock_read(tkr); - - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - - return delta; -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. 
* @@ -370,32 +370,38 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ - -static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) +static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { - u64 nsec; - - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; - - return nsec; + return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { - u64 delta; + /* Calculate the delta since the last update_wall_time() */ + u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; + + /* + * This detects both negative motion and the case where the delta + * overflows the multiplication with tkr->mult. + */ + if (unlikely(delta > tkr->clock->max_cycles)) { + /* + * Handle clocksource inconsistency between CPUs to prevent + * time from going backwards by checking for the MSB of the + * mask being set in the delta. + */ + if (delta & ~(mask >> 1)) + return tkr->xtime_nsec >> tkr->shift; + + return delta_to_ns_safe(tkr, delta); + } - delta = timekeeping_get_delta(tkr); - return timekeeping_delta_to_ns(tkr, delta); + return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { - u64 delta; - - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** @@ -406,7 +412,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * Employ the latch technique; see @raw_write_seqcount_latch. + * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. 
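timekeeping_cycles_to_ns() above converts a cycle delta with ns = (delta * mult + xtime_nsec) >> shift and falls back to mul_u64_u32_add_u64_shr() via delta_to_ns_safe() when the 64-bit product could overflow. The standalone sketch below shows both paths, using the compiler's unsigned __int128 as a stand-in for the kernel helper; the mult/shift values are made up for the example.

#include <stdint.h>
#include <stdio.h>

static uint64_t cycles_to_ns_fast(uint64_t delta, uint32_t mult,
                                  uint64_t xtime_nsec, uint32_t shift)
{
        return (delta * mult + xtime_nsec) >> shift;    /* can overflow for huge deltas */
}

static uint64_t cycles_to_ns_safe(uint64_t delta, uint32_t mult,
                                  uint64_t xtime_nsec, uint32_t shift)
{
        /* 128-bit intermediate, like mul_u64_u32_add_u64_shr() in the kernel */
        unsigned __int128 ns = (unsigned __int128)delta * mult + xtime_nsec;

        return (uint64_t)(ns >> shift);
}

int main(void)
{
        /* e.g. a 1 GHz counter with mult = 2^24, shift = 24: 1 cycle == 1 ns */
        uint32_t mult = 1u << 24, shift = 24;

        printf("%llu\n", (unsigned long long)cycles_to_ns_fast(1000, mult, 0, shift));
        printf("%llu\n", (unsigned long long)cycles_to_ns_safe(1ULL << 45, mult, 0, shift));
        return 0;
}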
In the worst case this can result is a @@ -419,24 +425,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); -} - -static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) -{ - u64 delta, cycles = tk_clock_read(tkr); - delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); - return timekeeping_delta_to_ns(tkr, delta); + write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) @@ -446,10 +446,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount_latch(&tkf->seq); + seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += fast_tk_get_delta_ns(tkr); + now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -520,13 +520,13 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); + * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * - * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) @@ -554,91 +554,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + delta = timekeeping_get_ns(tkr); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). 
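update_fast_timekeeper() and __ktime_get_fast_ns() above rely on the seqcount latch: two copies of the readout data plus a sequence count, so an NMI that interrupts an update simply reads the copy that is not being written. The sketch below shows the mechanism with C11 atomics and a two-field payload; it is a simplified illustration, not the kernel's seqcount_latch implementation, which uses its own barriers and copies a whole struct tk_read_base.

#include <stdatomic.h>
#include <stdint.h>

static _Atomic unsigned int latch_seq;
static _Atomic uint64_t latch_base[2];
static _Atomic uint64_t latch_mult[2];

/* Writer: force readers off the copy that is about to change. */
static void latch_update(uint64_t base, uint64_t mult)
{
        atomic_fetch_add(&latch_seq, 1);        /* readers now pick copy 1 */
        atomic_store(&latch_base[0], base);
        atomic_store(&latch_mult[0], mult);
        atomic_fetch_add(&latch_seq, 1);        /* readers now pick copy 0 */
        atomic_store(&latch_base[1], base);
        atomic_store(&latch_mult[1], mult);
}

/* Reader: never blocks, retries only if the sequence moved, and always
 * sees a consistent (base, mult) pair. */
static void latch_read(uint64_t *base, uint64_t *mult)
{
        unsigned int seq, idx;

        do {
                seq = atomic_load(&latch_seq);
                idx = seq & 1;
                *base = atomic_load(&latch_base[idx]);
                *mult = atomic_load(&latch_mult[idx]);
        } while (seq != atomic_load(&latch_seq));
}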
That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
* @@ -679,13 +618,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; int ret; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -698,14 +635,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - - return ret; + guard(raw_spinlock_irqsave)(&tk_core.lock); + return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); @@ -714,13 +645,25 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* + * Leap state update for both shadow and the real timekeeper + * Separate to spare a full memcpy() of the timekeeper. + */ +static void tk_update_leap_state_all(struct tk_data *tkd) +{ + write_seqcount_begin(&tkd->seq); + tk_update_leap_state(&tkd->shadow_timekeeper); + tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; + write_seqcount_end(&tkd->seq); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -753,34 +696,62 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } -/* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, unsigned int action) +/* + * Restore the shadow timekeeper from the real timekeeper. + */ +static void timekeeping_restore_shadow(struct tk_data *tkd) +{ + lockdep_assert_held(&tkd->lock); + memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); +} + +static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { + struct timekeeper *tk = &tkd->shadow_timekeeper; + + lockdep_assert_held(&tkd->lock); + + /* + * Block out readers before running the updates below because that + * updates VDSO and other time related infrastructure. Not blocking + * the readers might let a reader see time going backwards when + * reading from the VDSO after the VDSO update and then reading in + * the kernel from the timekeeper before that got updated. 
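Several hunks above, e.g. pvclock_gtod_register_notifier() and pvclock_gtod_unregister_notifier(), convert open-coded lock/unlock pairs to guard() and scoped_guard(), which release the lock automatically when the scope ends. A minimal userspace approximation of that idea, built on the GCC/Clang cleanup attribute and a pthread mutex rather than the kernel's <linux/cleanup.h> machinery:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_counter;

static void unlock_cleanup(pthread_mutex_t **m)
{
        pthread_mutex_unlock(*m);
}

/* Lock now, and let the compiler run the unlock when the guard variable
 * goes out of scope, on every return path, like guard() in the kernel. */
#define GUARD(m) \
        pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup))) = \
                (pthread_mutex_lock(m), (m))

static int bump(void)
{
        GUARD(&lock);
        shared_counter++;
        return shared_counter;          /* unlock runs automatically here */
}

The kernel variant additionally encodes the lock type and the irqsave behaviour in the guard class, which is what guard(raw_spinlock_irqsave)(&tk_core.lock) spells out.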
+ */ + write_seqcount_begin(&tkd->seq); + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; + /* - * The mirroring of the data to the shadow-timekeeper needs - * to happen last here to ensure we don't over-write the - * timekeeper structure on the next update with stale data + * Update the real timekeeper. + * + * We could avoid this memcpy() by switching pointers, but that has + * the downside that the reader side does not longer benefit from + * the cacheline optimized data layout of the timekeeper and requires + * another indirection. */ - if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); + memcpy(&tkd->timekeeper, tk, sizeof(*tk)); + write_seqcount_end(&tkd->seq); } /** @@ -796,14 +767,21 @@ static void timekeeping_forward_now(struct timekeeper *tk) u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; - tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; - tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + while (delta > 0) { + u64 max = tk->tkr_mono.clock->max_cycles; + u64 incr = delta < max ? delta : max; - tk_normalize_xtime(tk); + tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; + tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; + tk_normalize_xtime(tk); + delta -= incr; + } + tk_update_coarse_nsecs(tk); } /** @@ -900,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -909,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -928,6 +906,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) unsigned int seq; ktime_t tconv; + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and + * tk_update_sleep_time(). + */ + return ktime_add(tmono, READ_ONCE(*offset)); + } + do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); @@ -1037,9 +1023,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. 
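timekeeping_update_from_shadow() above edits the shadow timekeeper under tk_core.lock and then publishes it into the live timekeeper inside a write_seqcount_begin()/end() section, so readers observe either the old or the new state and never a half-copied one. The userspace sketch below mirrors that structure with a mutex for writers and a sequence counter for readers; field-by-field atomics replace the kernel's memcpy() purely to keep the example race-free under the C11 memory model.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

struct tk_sketch { uint64_t base; uint64_t offs; };

static pthread_mutex_t tk_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tk_sketch shadow;                 /* writers only, under tk_lock */
static _Atomic unsigned int tk_seq;             /* odd while an update is in flight */
static _Atomic uint64_t live_base, live_offs;   /* reader-visible copy */

static void tk_update_from_shadow(uint64_t delta)
{
        pthread_mutex_lock(&tk_lock);
        /* Step 1: adjust the shadow copy at leisure */
        shadow.base += delta;
        shadow.offs += 1;

        /* Step 2: publish, blocking readers only for the short copy */
        atomic_fetch_add(&tk_seq, 1);
        atomic_store(&live_base, shadow.base);
        atomic_store(&live_offs, shadow.offs);
        atomic_fetch_add(&tk_seq, 1);
        pthread_mutex_unlock(&tk_lock);
}

static struct tk_sketch tk_read(void)
{
        struct tk_sketch snap;
        unsigned int seq;

        do {
                while ((seq = atomic_load(&tk_seq)) & 1)
                        ;                       /* writer is publishing: wait */
                snap.base = atomic_load(&live_base);
                snap.offs = atomic_load(&live_offs);
        } while (seq != atomic_load(&tk_seq));
        return snap;
}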
This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1058,6 +1049,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1072,6 +1064,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); @@ -1079,6 +1073,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); @@ -1180,17 +1175,121 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, } /* - * cycle_between - true if test occurs chronologically between before and after + * timestamp_in_interval - true if ts is chronologically in [start, end] + * + * True if ts occurs chronologically at or after start, and before or at end. */ -static bool cycle_between(u64 before, u64 test, u64 after) +static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { - if (test > before && test < after) + if (ts >= start && ts <= end) return true; - if (test < before && before > after) + if (start > end && (ts >= start || ts <= end)) return true; return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. 
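convert_clock() above scales a counter value between two clock domains by dividing first and scaling the remainder separately, which keeps the intermediate products small and preserves the fractional part. A standalone illustration of that two-step scaling; the 24 MHz to nanoseconds ratio is an assumption chosen for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool convert_clock_sketch(uint64_t *val, uint32_t num, uint32_t den)
{
        uint64_t q, r;

        if (!num || !den)
                return false;

        q = *val / den;
        r = *val % den;
        /* Naive (*val * num) / den would overflow for large values;
         * scaling quotient and remainder separately does not. */
        *val = q * num + (r * num) / den;
        return true;
}

int main(void)
{
        /* Convert 10,000,001 ticks of a 24 MHz clock to nanoseconds:
         * num/den = 1000000000 / 24000000 = 125 / 3 after reduction. */
        uint64_t v = 10000001;

        convert_clock_sketch(&v, 125, 3);
        printf("%llu\n", (unsigned long long)v);        /* 416666708 ns */
        return 0;
}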
Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. + */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1210,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; @@ -1232,11 +1331,12 @@ int get_device_system_crosststamp(int (*get_time_fn) return ret; /* - * Verify that the clocksource associated with the captured - * system counter value is the same as the currently installed - * timekeeper clocksource + * Verify that the clocksource ID associated with the captured + * system counter value is the same as for the currently + * installed timekeeper clocksource */ - if (tk->tkr_mono.clock != system_counterval.cs) + if (system_counterval.cs_id == CSID_GENERIC || + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1246,7 +1346,7 @@ int get_device_system_crosststamp(int (*get_time_fn) */ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; - if (!cycle_between(interval_start, cycles, now)) { + if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; @@ -1259,10 +1359,8 @@ int get_device_system_crosststamp(int (*get_time_fn) tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, - system_counterval.cycles); - nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, - system_counterval.cycles); + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } 
while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); @@ -1277,13 +1375,13 @@ int get_device_system_crosststamp(int (*get_time_fn) bool discontinuity; /* - * Check that the counter value occurs after the provided + * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || - !cycle_between(history_begin->cycles, - system_counterval.cycles, cycles) || + !timestamp_in_interval(history_begin->cycles, + cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; @@ -1304,6 +1402,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. + */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -1311,89 +1433,102 @@ EXPORT_SYMBOL_GPL(get_device_system_crosststamp); */ int do_settimeofday64(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; - unsigned long flags; - int ret = 0; if (!timespec64_valid_settod(ts)) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - - xt = tk_xtime(tk); - ts_delta = timespec64_sub(*ts, xt); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { - ret = -EINVAL; - goto out; - } + timekeeping_forward_now(tks); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + xt = tk_xtime(tks); + ts_delta = timespec64_sub(*ts, xt); - tk_set_xtime(tk, ts); -out: - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); + tk_set_xtime(tks, ts); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) { - audit_tk_injoffset(ts_delta); - add_device_randomness(ts, sizeof(*ts)); - } - - return ret; + audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + return 0; } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; 
+} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; + struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; - int ret = 0; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + timekeeping_forward_now(tks); - timekeeping_forward_now(tk); + if (timekeeper_is_core_tk(tks)) { + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), *ts); - if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - ret = -EINVAL; - goto error; + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tk_update_aux_offs(tks, offs); } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + return 0; +} -error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); - + if (!ret) + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1447,43 +1582,36 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *new, *old = NULL; - unsigned long flags; - bool change = false; - - new = (struct clocksource *) data; + struct clocksource *new = data, *old = NULL; /* - * If the cs is in module, get a module reference. Succeeds - * for built-in code (owner == NULL) as well. + * If the clocksource is in a module, get a module reference. + * Succeeds for built-in code (owner == NULL) as well. Abort if the + * reference can't be acquired. 
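The converted paths above all follow the same shadow-timekeeper discipline: take the timekeeper lock, forward the shadow timekeeper to "now", apply the change, then either commit it with timekeeping_update_from_shadow() or roll back with timekeeping_restore_shadow() when validation fails. A condensed sketch of that shape, not taken from the patch (example_shadow_update() and example_validate() are made-up stand-ins):

static int example_shadow_update(struct tk_data *tkd)
{
	struct timekeeper *tks = &tkd->shadow_timekeeper;

	guard(raw_spinlock_irqsave)(&tkd->lock);

	timekeeping_forward_now(tks);		/* catch the shadow copy up to "now" */

	if (!example_validate(tks)) {		/* stand-in for the real sanity checks */
		timekeeping_restore_shadow(tkd);
		return -EINVAL;
	}

	/* ... apply the change to the shadow timekeeper ... */

	timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
	return 0;
}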
*/ - if (try_module_get(new->owner)) { - if (!new->enable || new->enable(new) == 0) - change = true; - else - module_put(new->owner); - } + if (!try_module_get(new->owner)) + return 0; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + /* Abort if the device can't be enabled */ + if (new->enable && new->enable(new) != 0) { + module_put(new->owner); + return 0; + } - timekeeping_forward_now(tk); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - if (change) { - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); + timekeeping_forward_now(tks); + old = tks->tkr_mono.clock; + tk_setup_internals(tks, new); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_aux_update_clocksource(); if (old) { if (old->disable) old->disable(old); - module_put(old->owner); } @@ -1532,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1608,6 +1769,14 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) +{ + raw_spin_lock_init(&tkd->lock); + seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; +} + /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. 
* @@ -1632,9 +1801,11 @@ static bool persistent_clock_exists; void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; - struct timekeeper *tk = &tk_core.timekeeper; + struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - unsigned long flags; + + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1654,24 +1825,21 @@ void __init timekeeping_init(void) */ wall_to_mono = timespec64_sub(boot_offset, wall_time); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + guard(raw_spinlock_irqsave)(&tk_core.lock); + ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(tk, clock); + tk_setup_internals(tks, clock); - tk_set_xtime(tk, &wall_time); - tk->raw_sec = 0; + tk_set_xtime(tks, &wall_time); + tks->raw_sec = 0; - tk_set_wall_to_mono(tk, wall_to_mono); + tk_set_wall_to_mono(tks, wall_to_mono); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ @@ -1749,22 +1917,14 @@ bool timekeeping_rtc_skipsuspend(void) */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - suspend_timing_needed = false; + scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - timekeeping_forward_now(tk); - - __timekeeping_inject_sleeptime(tk, delta); - - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + suspend_timing_needed = false; + timekeeping_forward_now(tks); + __timekeeping_inject_sleeptime(tks, delta); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); @@ -1776,20 +1936,19 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) */ void timekeeping_resume(void) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr_mono.clock; - unsigned long flags; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; - u64 cycle_now, nsec; bool inject_sleeptime = false; + u64 cycle_now, nsec; + unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -1803,7 +1962,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk_clock_read(&tk->tkr_mono); + cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); @@ -1815,18 +1974,17 @@ void timekeeping_resume(void) if (inject_sleeptime) { suspend_timing_needed = false; - __timekeeping_inject_sleeptime(tk, &ts_delta); + __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ - tk->tkr_mono.cycle_last = cycle_now; - tk->tkr_raw.cycle_last = cycle_now; + tks->tkr_mono.cycle_last = cycle_now; + tks->tkr_raw.cycle_last = cycle_now; - tk->ntp_error = 0; + tks->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); @@ -1836,13 +1994,18 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 delta, delta_delta; - static struct timespec64 old_delta; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; struct clocksource *curr_clock; + unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); @@ -1857,9 +2020,8 @@ int timekeeping_suspend(void) suspend_timing_needed = true; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - timekeeping_forward_now(tk); + raw_spin_lock_irqsave(&tk_core.lock, flags); + timekeeping_forward_now(tks); timekeeping_suspended = 1; /* @@ -1867,8 +2029,8 @@ int timekeeping_suspend(void) * just read from the current clocksource. Save this to potentially * use in suspend timing. */ - curr_clock = tk->tkr_mono.clock; - cycle_now = tk->tkr_mono.cycle_last; + curr_clock = tks->tkr_mono.clock; + cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { @@ -1878,7 +2040,7 @@ int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. 
*/ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -1893,10 +2055,9 @@ int timekeeping_suspend(void) } } - timekeeping_update(tk, TK_MIRROR); - halt_fast_timekeeper(tk); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, 0); + halt_fast_timekeeper(tks); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); @@ -1905,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); @@ -2001,16 +2171,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ - if (likely(tk->ntp_tick == ntp_tick_length())) { + if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { - tk->ntp_tick = ntp_tick_length(); + tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } @@ -2081,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2147,30 +2318,25 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *real_tk = &tk_core.timekeeper; - struct timekeeper *tk = &shadow_timekeeper; - u64 offset; - int shift = 0, maxshift; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); + int shift = 0, maxshift; + u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - goto out; + return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), - tk->tkr_mono.cycle_last, tk->tkr_mono.mask); - + tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) - goto out; - - /* Do some additional sanity checking */ - 
timekeeping_check_update(tk, offset); + return false; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -2183,11 +2349,10 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, - &clock_set); + offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } @@ -2201,35 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&tk_core.seq); /* - * Update the real timekeeper. - * - * We could avoid this memcpy by switching pointers, but that - * requires changes to all other timekeeper usage sites as - * well, i.e. move the timekeeper pointer getter into the - * spinlocked/seqcount protected sections. And we trade this - * memcpy under the tk_core.seq against one before we start - * updating. + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time */ - timekeeping_update(tk, clock_set); - memcpy(real_tk, tk, sizeof(*tk)); - /* The memcpy must come last. Do not put anything here! */ - write_seqcount_end(&tk_core.seq); -out: - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2260,11 +2425,99 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. 
+ */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime_coarse(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. + */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. 
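Taken together, the two *_mg() helpers let a caller hand out cheap coarse-grained timestamps and only pay for a fine-grained read (which also advances the floor) when the coarse value would not move ctime forward. A hypothetical caller, not part of this patch (example_next_ctime() and @prev are illustrative names):

/*
 * Sketch only: pick a ctime candidate, preferring the coarse clock and
 * falling back to a fine-grained timestamp when the coarse value would
 * not be newer than @prev.
 */
static struct timespec64 example_next_ctime(const struct timespec64 *prev)
{
	struct timespec64 now;

	ktime_get_coarse_real_ts64_mg(&now);
	if (timespec64_compare(&now, prev) <= 0)
		ktime_get_real_ts64_mg(&now);
	return now;
}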
+ */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; @@ -2274,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); @@ -2339,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2398,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2416,88 +2683,431 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct timekeeper *tk = &tk_core.timekeeper; - struct audit_ntp_data ad; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; - unsigned long flags; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + + add_device_randomness(&ts, sizeof(ts)); + + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; + if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } - 
raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; +} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tk, tai); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - clock_set = true; - } - tk_update_leap_state(tk); +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - audit_ntp_log(&ad); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + audit_ntp_log(&result.ad); - if (clock_set) - clock_was_set(CLOCK_REALTIME); + if (result.clock_set) + clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. + */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - + guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. 
+ */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. 
tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. + */ + return __do_adjtimex(aux_tkd, txc, &result); +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, +}; + +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). 
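From userspace the intended flow is to enable the clock via sysfs and then steer and read it with the POSIX clock syscalls. A rough sketch, assuming the CLOCK_AUX clockid is exported by the kernel's UAPI headers; the sysfs path follows the layout created by tk_aux_sysfs_init() further down:

#include <fcntl.h>
#include <stdio.h>
#include <sys/timex.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	/* CLOCK_AUX is expected to come from the kernel UAPI time headers */
	struct timex tx = { .modes = ADJ_FREQUENCY, .freq = 10 << 16 };	/* +10 ppm */
	struct timespec ts;
	int fd;

	/* Enable auxiliary clock 0 (needs CAP_SYS_TIME) */
	fd = open("/sys/kernel/time/aux_clocks/0/aux_clock_enable", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}

	if (clock_adjtime(CLOCK_AUX, &tx) < 0)
		perror("clock_adjtime(CLOCK_AUX)");

	if (clock_gettime(CLOCK_AUX, &ts) == 0)
		printf("aux0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);

	return 0;
}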
+ */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; + + if (!tko) + return ret; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) + goto err_clean; + + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (ret) + goto err_clean; + } + return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; +} +late_initcall(tk_aux_sysfs_init); + +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d..badeb222eab9 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -17,6 +17,9 @@ #define NUM_BINS 32 +/* Incremented every time mg_floor is updated */ +DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_sleep_time_show(struct seq_file *s, void *data) @@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } +unsigned long timekeeping_get_mg_floor_swaps(void) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += 
data_race(per_cpu(timekeeping_mg_floor_swaps, cpu)); + + return sum; +} diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ca2787d1642..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -10,30 +10,42 @@ * timekeeping debug functions */ #ifdef CONFIG_DEBUG_FS + +DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ + this_cpu_inc(timekeeping_mg_floor_swaps); +} + extern void tk_debug_account_sleep_time(const struct timespec64 *t); + #else + #define tk_debug_account_sleep_time(x) + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ +} + #endif -#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) { u64 ret = (now - last) & mask; /* - * Prevent time going backwards by checking the MSB of mask in - * the result. If set, return 0. + * Prevent time going backwards by checking the result against + * @max_delta. If greater, return 0. */ - return ret & ~(mask >> 1) ? 0 : ret; -} -#else -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) -{ - return (now - last) & mask; + return ret > max_delta ? 0 : ret; } -#endif /* Semi public for serialization of non timekeeper VDSO updates. */ -extern raw_spinlock_t timekeeper_lock; +unsigned long timekeeper_lock_irqsave(void); +void timekeeper_unlock_irqrestore(unsigned long flags); + +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 63a8ce7177dd..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -37,7 +37,6 @@ #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> -#include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> @@ -53,6 +52,7 @@ #include <asm/io.h> #include "tick-internal.h" +#include "timer_migration.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -63,15 +63,15 @@ EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of - * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * - * The level granularity is: LVL_CLK_DIV ^ lvl + * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and - * therefor the granularity becomes. + * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading @@ -187,15 +187,66 @@ EXPORT_SYMBOL(jiffies_64); #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON -# define NR_BASES 2 -# define BASE_STD 0 -# define BASE_DEF 1 +/* + * If multiple bases need to be locked, use the base ordering for lock + * nesting, i.e. lowest number first. 
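As a quick numeric check of the level-granularity rule quoted earlier in this file's header comment (LVL_CLK_DIV ^ level), and assuming the upstream LVL_CLK_SHIFT of 3, the per-level granularity works out to 1, 8, 64, 512, ... jiffies, i.e. roughly 1 ms, 8 ms, 64 ms and 512 ms for HZ=1000. A throwaway helper expressing the same thing (example_lvl_granularity() is not part of the patch):

/* Sketch only: granularity of wheel level @lvl in jiffies (LVL_CLK_DIV = 8) */
static unsigned long example_lvl_granularity(unsigned int lvl)
{
	return 1UL << (3 * lvl);
}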
+ */ +# define NR_BASES 3 +# define BASE_LOCAL 0 +# define BASE_GLOBAL 1 +# define BASE_DEF 2 #else # define NR_BASES 1 -# define BASE_STD 0 +# define BASE_LOCAL 0 +# define BASE_GLOBAL 0 # define BASE_DEF 0 #endif +/** + * struct timer_base - Per CPU timer base (number of base depends on config) + * @lock: Lock protecting the timer_base + * @running_timer: When expiring timers, the lock is dropped. To make + * sure not to race against deleting/modifying a + * currently running timer, the pointer is set to the + * timer, which expires at the moment. If no timer is + * running, the pointer is NULL. + * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around + * timer expiry callback execution and when trying to + * delete a running timer and it wasn't successful in + * the first glance. It prevents priority inversion + * when callback was preempted on a remote CPU and a + * caller tries to delete the running timer. It also + * prevents a life lock, when the task which tries to + * delete a timer preempted the softirq thread which + * is running the timer callback function. + * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter + * waiting for the end of the timer callback function + * execution. + * @clk: clock of the timer base; is updated before enqueue + * of a timer; during expiry, it is 1 offset ahead of + * jiffies to avoid endless requeuing to current + * jiffies + * @next_expiry: expiry value of the first timer; it is updated when + * finding the next timer and during enqueue; the + * value is not valid, when next_expiry_recalc is set + * @cpu: Number of CPU the timer base belongs to + * @next_expiry_recalc: States, whether a recalculation of next_expiry is + * required. Value is set true, when a timer was + * deleted. + * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ + * code. This state is only used in standard + * base. Deferrable timers, which are enqueued remotely + * never wake up an idle CPU. So no matter of supporting it + * for this base. + * @timers_pending: Is set, when a timer is pending in the base. It is only + * reliable when next_expiry_recalc is not set. + * @pending_map: bitmap of the timer wheel; each bit reflects a + * bucket of the wheel. When a bit is set, at least a + * single timer is enqueued in the related bucket. + * @vectors: Array of lists; Each array member reflects a bucket + * of the timer wheel. The list contains all timers + * which are enqueued into a specific bucket. 
+ */ struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; @@ -237,7 +288,7 @@ static void timers_update_migration(void) } #ifdef CONFIG_SYSCTL -static int timer_migration_handler(struct ctl_table *table, int write, +static int timer_migration_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -250,7 +301,7 @@ static int timer_migration_handler(struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, @@ -260,7 +311,6 @@ static struct ctl_table timer_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init timer_sysctl_init(void) @@ -314,7 +364,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, rem = j % HZ; /* - * If the target jiffie is just after a whole second (which can happen + * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. @@ -336,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -433,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. 
- */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -571,26 +579,29 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!is_timers_nohz_active()) - return; - /* - * TODO: This wants some optimizing similar to the code below, but we - * will do that when we switch from push to pull for deferrable timers. + * Deferrable timers do not prevent the CPU from entering dynticks and + * are not taken into account on the idle/nohz_full path. An IPI when a + * new deferrable timer is enqueued will wake up the remote CPU but + * nothing will be done with the deferrable timer base. Therefore skip + * the remote IPI for deferrable timers completely. */ - if (timer->flags & TIMER_DEFERRABLE) { - if (tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; - } /* * We might have to IPI the remote CPU if the base is idle and the - * timer is not deferrable. If the other CPU is on the way to idle - * then it can't set base->is_idle as we hold the base lock: + * timer is pinned. If it is a non pinned timer, it is only queued + * on the remote CPU, when timer was running during queueing. Then + * everything is handled by remote CPU anyway. If the other CPU is + * on the way to idle then it can't set base->is_idle as we hold + * the base lock: */ - if (base->is_idle) + if (base->is_idle) { + WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || + tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); + } } /* @@ -606,7 +617,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); - trace_timer_start(timer, timer->expires, timer->flags); + trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. 
The @@ -618,7 +629,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ - base->next_expiry = bucket_expiry; + WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); @@ -682,7 +693,7 @@ static bool timer_is_static_object(void *addr) } /* - * fixup_init is called when: + * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) @@ -691,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -706,7 +717,7 @@ static void stub_timer(struct timer_list *unused) } /* - * fixup_activate is called when: + * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ @@ -728,7 +739,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) } /* - * fixup_free is called when: + * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) @@ -737,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -746,7 +757,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) } /* - * fixup_assert_init is called when: + * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) @@ -797,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -805,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -851,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags @@ -859,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. 
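Most users reach timer_init_key() through the timer_setup() wrapper; with the renamed delete helpers a typical lifecycle looks roughly like the sketch below (my_timer, my_timer_fn, my_init and my_exit are made-up names):

static struct timer_list my_timer;

static void my_timer_fn(struct timer_list *t)
{
	pr_info("timer fired\n");
}

static void my_init(void)
{
	timer_setup(&my_timer, my_timer_fn, 0);
	mod_timer(&my_timer, jiffies + HZ);	/* fire in about one second */
}

static void my_exit(void)
{
	/* Deactivate the timer and wait for a concurrently running callback */
	timer_delete_sync(&my_timer);
}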
*/ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -902,28 +913,30 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { - struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) @@ -931,42 +944,34 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - if (static_branch_likely(&timers_migration_enabled) && - !(tflags & TIMER_PINNED)) - return get_timer_cpu_base(tflags, get_nohz_timer_target()); -#endif - return get_timer_this_cpu_base(tflags); -} - -static inline void forward_timer_base(struct timer_base *base) +static inline void __forward_timer_base(struct timer_base *base, + unsigned long basej) { - unsigned long jnow = READ_ONCE(jiffies); - /* - * No need to forward if we are close enough below jiffies. - * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jiffies. + * Check whether we can forward the base. We can only do that when + * @basej is past base->clk otherwise we might rewind base->clk. */ - if ((long)(jnow - base->clk) < 1) + if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. 
*/ - if (time_after(base->next_expiry, jnow)) { - base->clk = jnow; + if (time_after(base->next_expiry, basej)) { + base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } + } +static inline void forward_timer_base(struct timer_base *base) +{ + __forward_timer_base(base, READ_ONCE(jiffies)); +} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -1093,7 +1098,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - new_base = get_target_base(base, timer->flags); + new_base = get_timer_this_cpu_base(timer->flags); if (base != new_base) { /* @@ -1165,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the @@ -1246,11 +1251,48 @@ void add_timer(struct timer_list *timer) EXPORT_SYMBOL(add_timer); /** + * add_timer_local() - Start a timer on the local CPU + * @timer: The timer to be started + * + * Same as add_timer() except that the timer flag TIMER_PINNED is set. + * + * See add_timer() for further details. + */ +void add_timer_local(struct timer_list *timer) +{ + if (WARN_ON_ONCE(timer_pending(timer))) + return; + timer->flags |= TIMER_PINNED; + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); +} +EXPORT_SYMBOL(add_timer_local); + +/** + * add_timer_global() - Start a timer without TIMER_PINNED flag set + * @timer: The timer to be started + * + * Same as add_timer() except that the timer flag TIMER_PINNED is unset. + * + * See add_timer() for further details. + */ +void add_timer_global(struct timer_list *timer) +{ + if (WARN_ON_ONCE(timer_pending(timer))) + return; + timer->flags &= ~TIMER_PINNED; + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); +} +EXPORT_SYMBOL(add_timer_global); + +/** * add_timer_on - Start a timer on a particular CPU * @timer: The timer to be started * @cpu: The CPU to start it on * - * Same as add_timer() except that it starts the timer on the given CPU. + * Same as add_timer() except that it starts the timer on the given CPU and + * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in + * the next round, add_timer_global() should be used instead as it unsets + * the TIMER_PINNED flag. * * See add_timer() for further details. */ @@ -1264,6 +1306,9 @@ void add_timer_on(struct timer_list *timer, int cpu) if (WARN_ON_ONCE(timer_pending(timer))) return; + /* Make sure timer flags have TIMER_PINNED flag set */ + timer->flags |= TIMER_PINNED; + new_base = get_timer_cpu_base(timer->flags, cpu); /* @@ -1324,7 +1369,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown) * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock - * aquisition. By taking the lock it is ensured that such a newly + * acquisition. 
By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * @@ -1413,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1424,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not @@ -1439,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1469,6 +1515,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) + __releases(&base->lock) __releases(&base->expiry_lock) + __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); @@ -1803,13 +1851,15 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. + * + * Store next expiry time in base->next_expiry. */ -static unsigned long __next_timer_interrupt(struct timer_base *base) +static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1834,7 +1884,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next - * expiring jiffie. So in case of: + * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 @@ -1870,10 +1920,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk += adj; } + WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); - - return next; + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON @@ -1900,7 +1949,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return basem; /* - * Round up to the next jiffie. High resolution timers are + * Round up to the next jiffy. 
High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. @@ -1910,63 +1959,357 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } +static unsigned long next_timer_interrupt(struct timer_base *base, + unsigned long basej) +{ + if (base->next_expiry_recalc) + timer_recalc_next_expiry(base); + + /* + * Move next_expiry for the empty base into the future to prevent an + * unnecessary raise of the timer softirq when the next_expiry value + * will be reached even if there is no timer pending. + * + * This update is also required to make timer_base::next_expiry values + * easy comparable to find out which base holds the first pending timer. + */ + if (!base->timers_pending) + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); + + return base->next_expiry; +} + +static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, + struct timer_base *base_local, + struct timer_base *base_global, + struct timer_events *tevt) +{ + unsigned long nextevt, nextevt_local, nextevt_global; + bool local_first; + + nextevt_local = next_timer_interrupt(base_local, basej); + nextevt_global = next_timer_interrupt(base_global, basej); + + local_first = time_before_eq(nextevt_local, nextevt_global); + + nextevt = local_first ? nextevt_local : nextevt_global; + + /* + * If the @nextevt is at max. one tick away, use @nextevt and store + * it in the local expiry value. The next global event is irrelevant in + * this case and can be left as KTIME_MAX. + */ + if (time_before_eq(nextevt, basej + 1)) { + /* If we missed a tick already, force 0 delta */ + if (time_before(nextevt, basej)) + nextevt = basej; + tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; + + /* + * This is required for the remote check only but it doesn't + * hurt, when it is done for both call sites: + * + * * The remote callers will only take care of the global timers + * as local timers will be handled by CPU itself. When not + * updating tevt->global with the already missed first global + * timer, it is possible that it will be missed completely. + * + * * The local callers will ignore the tevt->global anyway, when + * nextevt is max. one tick away. + */ + if (!local_first) + tevt->global = tevt->local; + return nextevt; + } + + /* + * Update tevt.* values: + * + * If the local queue expires first, then the global event can be + * ignored. If the global queue is empty, nothing to do either. + */ + if (!local_first && base_global->timers_pending) + tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; + + if (base_local->timers_pending) + tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; + + return nextevt; +} + +# ifdef CONFIG_SMP /** - * get_next_timer_interrupt - return the time (clock mono) of the next timer + * fetch_next_timer_interrupt_remote() - Store next timers into @tevt * @basej: base time jiffies * @basem: base time clock monotonic + * @tevt: Pointer to the storage for the expiry values + * @cpu: Remote CPU * - * Returns the tick aligned clock monotonic time of the next pending - * timer or KTIME_MAX if no timer is pending. + * Stores the next pending local and global timer expiry values in the + * struct pointed to by @tevt. If a queue is empty the corresponding + * field is set to KTIME_MAX. If local event expires before global + * event, global event is set to KTIME_MAX as well. 
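The deadline arithmetic in this path is compact enough to model in plain C: a wrap-safe jiffies comparison, a clamp that forces a zero delta for an already missed event, the conversion of the jiffies delta into a clock-monotonic value, and the round-up to the next tick done by cmp_next_hrtimer_event(). The sketch below is a userspace illustration only; it assumes HZ=1000 (a 1,000,000 ns tick) and all model_* names are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define MODEL_TICK_NSEC 1000000ULL	/* assumes HZ=1000 */

/* Wrap-safe "a before b", in the spirit of the jiffies helpers */
static int model_time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

/* fetch_next_timer_interrupt()-style jiffies -> clock monotonic conversion */
static uint64_t model_event_to_ns(unsigned long basej, uint64_t basem,
				  unsigned long nextevt)
{
	/* An event that was already missed is forced to a zero delta */
	if (model_time_before(nextevt, basej))
		nextevt = basej;

	return basem + (uint64_t)(nextevt - basej) * MODEL_TICK_NSEC;
}

/* cmp_next_hrtimer_event()-style round up to the next tick boundary */
static uint64_t model_round_to_tick(uint64_t expires)
{
	return ((expires + MODEL_TICK_NSEC - 1) / MODEL_TICK_NSEC) * MODEL_TICK_NSEC;
}

int main(void)
{
	unsigned long basej = 1000;
	uint64_t basem = 5000000000ULL;		/* "now" in nanoseconds */

	printf("%llu\n", (unsigned long long)model_event_to_ns(basej, basem, 1003));
	printf("%llu\n", (unsigned long long)model_event_to_ns(basej, basem, 998));
	printf("%llu\n", (unsigned long long)model_round_to_tick(basem + 1));
	return 0;
}

The detour through a signed type in model_time_before() is what keeps the comparison correct across jiffies wraparound, which is why the kernel uses its time_before()/time_after() helpers rather than a plain relational operator.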
+ * + * Caller needs to make sure timer base locks are held (use + * timer_lock_remote_bases() for this purpose). */ -u64 get_next_timer_interrupt(unsigned long basej, u64 basem) +void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, + struct timer_events *tevt, + unsigned int cpu) +{ + struct timer_base *base_local, *base_global; + + /* Preset local / global events */ + tevt->local = tevt->global = KTIME_MAX; + + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); + + lockdep_assert_held(&base_local->lock); + lockdep_assert_held(&base_global->lock); + + fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); +} + +/** + * timer_unlock_remote_bases - unlock timer bases of cpu + * @cpu: Remote CPU + * + * Unlocks the remote timer bases. + */ +void timer_unlock_remote_bases(unsigned int cpu) + __releases(timer_bases[BASE_LOCAL]->lock) + __releases(timer_bases[BASE_GLOBAL]->lock) +{ + struct timer_base *base_local, *base_global; + + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); + + raw_spin_unlock(&base_global->lock); + raw_spin_unlock(&base_local->lock); +} + +/** + * timer_lock_remote_bases - lock timer bases of cpu + * @cpu: Remote CPU + * + * Locks the remote timer bases. + */ +void timer_lock_remote_bases(unsigned int cpu) + __acquires(timer_bases[BASE_LOCAL]->lock) + __acquires(timer_bases[BASE_GLOBAL]->lock) +{ + struct timer_base *base_local, *base_global; + + base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); + base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); + + lockdep_assert_irqs_disabled(); + + raw_spin_lock(&base_local->lock); + raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); +} + +/** + * timer_base_is_idle() - Return whether timer base is set idle + * + * Returns value of local timer base is_idle value. + */ +bool timer_base_is_idle(void) +{ + return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); +} + +static void __run_timer_base(struct timer_base *base); + +/** + * timer_expire_remote() - expire global timers of cpu + * @cpu: Remote CPU + * + * Expire timers of global base of remote CPU. + */ +void timer_expire_remote(unsigned int cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); + + __run_timer_base(base); +} + +static void timer_use_tmigr(unsigned long basej, u64 basem, + unsigned long *nextevt, bool *tick_stop_path, + bool timer_base_idle, struct timer_events *tevt) +{ + u64 next_tmigr; + + if (timer_base_idle) + next_tmigr = tmigr_cpu_new_timer(tevt->global); + else if (tick_stop_path) + next_tmigr = tmigr_cpu_deactivate(tevt->global); + else + next_tmigr = tmigr_quick_check(tevt->global); + + /* + * If the CPU is the last going idle in timer migration hierarchy, make + * sure the CPU will wake up in time to handle remote timers. + * next_tmigr == KTIME_MAX if other CPUs are still active. + */ + if (next_tmigr < tevt->local) { + u64 tmp; + + /* If we missed a tick already, force 0 delta */ + if (next_tmigr < basem) + next_tmigr = basem; + + tmp = div_u64(next_tmigr - basem, TICK_NSEC); + + *nextevt = basej + (unsigned long)tmp; + tevt->local = next_tmigr; + } +} +# else +static void timer_use_tmigr(unsigned long basej, u64 basem, + unsigned long *nextevt, bool *tick_stop_path, + bool timer_base_idle, struct timer_events *tevt) +{ + /* + * Make sure first event is written into tevt->local to not miss a + * timer on !SMP systems. 
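timer_lock_remote_bases() above acquires the two per-CPU bases in a fixed order, local before global, and the local tick path uses the same order; it is this consistent ordering that keeps concurrent local and remote access deadlock-free, while the SINGLE_DEPTH_NESTING annotation merely keeps lockdep happy about the second lock of the same class. A tiny pthread sketch of the same discipline, purely as a userspace stand-in:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t base_local = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t base_global = PTHREAD_MUTEX_INITIALIZER;
static unsigned long next_local, next_global;

static void lock_both_bases(void)
{
	pthread_mutex_lock(&base_local);	/* always local first ... */
	pthread_mutex_lock(&base_global);	/* ... then global */
}

static void unlock_both_bases(void)
{
	pthread_mutex_unlock(&base_global);	/* release in reverse order */
	pthread_mutex_unlock(&base_local);
}

static void *remote_migrator(void *arg)
{
	(void)arg;
	lock_both_bases();
	next_global = 42;	/* stand-in for fetching/expiring remote timers */
	unlock_both_bases();
	return NULL;
}

int main(void)
{
	pthread_t thr;

	pthread_create(&thr, NULL, remote_migrator, NULL);

	lock_both_bases();	/* stand-in for the CPU-local path */
	next_local = 7;
	unlock_both_bases();

	pthread_join(thr, NULL);
	printf("%lu %lu\n", next_local, next_global);
	return 0;
}

Because both paths agree on the order, neither can hold the global base lock while waiting for the local one, which is the classic ABBA pattern the ordering rule exists to rule out.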
+ */ + tevt->local = min_t(u64, tevt->local, tevt->global); +} +# endif /* CONFIG_SMP */ + +static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, + bool *idle) { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - u64 expires = KTIME_MAX; + struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; + struct timer_base *base_local, *base_global; unsigned long nextevt; + bool idle_is_possible; /* - * Pretend that there is no timer pending if the cpu is offline. - * Possible pending timers will be migrated later to an active cpu. + * When the CPU is offline, the tick is cancelled and nothing is supposed + * to try to stop it. */ - if (cpu_is_offline(smp_processor_id())) - return expires; + if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { + if (idle) + *idle = true; + return tevt.local; + } - raw_spin_lock(&base->lock); - if (base->next_expiry_recalc) - base->next_expiry = __next_timer_interrupt(base); - nextevt = base->next_expiry; + base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); + base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); + + raw_spin_lock(&base_local->lock); + raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); + + nextevt = fetch_next_timer_interrupt(basej, basem, base_local, + base_global, &tevt); + + /* + * If the next event is only one jiffy ahead there is no need to call + * timer migration hierarchy related functions. The value for the next + * global timer in @tevt struct equals then KTIME_MAX. This is also + * true, when the timer base is idle. + * + * The proper timer migration hierarchy function depends on the callsite + * and whether timer base is idle or not. @nextevt will be updated when + * this CPU needs to handle the first timer migration hierarchy + * event. See timer_use_tmigr() for detailed information. + */ + idle_is_possible = time_after(nextevt, basej + 1); + if (idle_is_possible) + timer_use_tmigr(basej, basem, &nextevt, idle, + base_local->is_idle, &tevt); /* * We have a fresh next event. Check whether we can forward the - * base. We can only do that when @basej is past base->clk - * otherwise we might rewind base->clk. + * base. */ - if (time_after(basej, base->clk)) { - if (time_after(nextevt, basej)) - base->clk = basej; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; - } + __forward_timer_base(base_local, basej); + __forward_timer_base(base_global, basej); + + /* + * Set base->is_idle only when caller is timer_base_try_to_set_idle() + */ + if (idle) { + /* + * Bases are idle if the next event is more than a tick + * away. Caution: @nextevt could have changed by enqueueing a + * global timer into timer migration hierarchy. Therefore a new + * check is required here. + * + * If the base is marked idle then any timer add operation must + * forward the base clk itself to keep granularity small. This + * idle logic is only maintained for the BASE_LOCAL and + * BASE_GLOBAL base, deferrable timers may still see large + * granularity skew (by design). + */ + if (!base_local->is_idle && time_after(nextevt, basej + 1)) { + base_local->is_idle = true; + /* + * Global timers queued locally while running in a task + * in nohz_full mode need a self-IPI to kick reprogramming + * in IRQ tail. 
+ */ + if (tick_nohz_full_cpu(base_local->cpu)) + base_global->is_idle = true; + trace_timer_base_idle(true, base_local->cpu); + } + *idle = base_local->is_idle; - if (time_before_eq(nextevt, basej)) { - expires = basem; - base->is_idle = false; - } else { - if (base->timers_pending) - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle. - * Also the tick is stopped so any added timer must forward - * the base clk itself to keep granularity small. This idle - * logic is only maintained for the BASE_STD base, deferrable - * timers may still see large granularity skew (by design). + * When timer base is not set idle, undo the effect of + * tmigr_cpu_deactivate() to prevent inconsistent states - active + * timer base but inactive timer migration hierarchy. + * + * When timer base was already marked idle, nothing will be + * changed here. */ - if ((expires - basem) > TICK_NSEC) - base->is_idle = true; + if (!base_local->is_idle && idle_is_possible) + tmigr_cpu_activate(); } - raw_spin_unlock(&base->lock); - return cmp_next_hrtimer_event(basem, expires); + raw_spin_unlock(&base_global->lock); + raw_spin_unlock(&base_local->lock); + + return cmp_next_hrtimer_event(basem, tevt.local); +} + +/** + * get_next_timer_interrupt() - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending timer or + * KTIME_MAX if no timer is pending. If timer of global base was queued into + * timer migration hierarchy, first global timer is not taken into account. If + * it was the last CPU of timer migration hierarchy going idle, first global + * event is taken into account. + */ +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) +{ + return __get_next_timer_interrupt(basej, basem, NULL); +} + +/** + * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases + * @basej: base time jiffies + * @basem: base time clock monotonic + * @idle: pointer to store the value of timer_base->is_idle on return; + * *idle contains the information whether tick was already stopped + * + * Returns the tick aligned clock monotonic time of the next pending timer or + * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is + * returned as well. + */ +u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) +{ + if (*idle) + return KTIME_MAX; + + return __get_next_timer_interrupt(basej, basem, idle); } /** @@ -1976,15 +2319,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) */ void timer_clear_idle(void) { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - /* - * We do this unlocked. The worst outcome is a remote enqueue sending - * a pointless IPI, but taking the lock would just make the window for - * sending the IPI a few instructions smaller for the cost of taking - * the lock in the exit from idle path. + * We do this unlocked. The worst outcome is a remote pinned timer + * enqueue sending a pointless IPI, but taking the lock would just + * make the window for sending the IPI a few instructions smaller + * for the cost of taking the lock in the exit from idle + * path. Required for BASE_LOCAL only. 
*/ - base->is_idle = false; + __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); + if (tick_nohz_full_cpu(smp_processor_id())) + __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); + trace_timer_base_idle(false, smp_processor_id()); + + /* Activate without holding the timer_base->lock */ + tmigr_cpu_activate(); } #endif @@ -1997,11 +2345,10 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (time_before(jiffies, base->next_expiry)) - return; + lockdep_assert_held(&base->lock); - timer_base_lock_expiry(base); - raw_spin_lock_irq(&base->lock); + if (base->running_timer) + return; while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { @@ -2011,30 +2358,55 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); + /* + * While executing timers, base->clk is set 1 offset ahead of + * jiffies to avoid endless requeuing to current jiffies. + */ base->clk++; - base->next_expiry = __next_timer_interrupt(base); + timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); } +} + +static void __run_timer_base(struct timer_base *base) +{ + /* Can race against a remote CPU updating next_expiry under the lock */ + if (time_before(jiffies, READ_ONCE(base->next_expiry))) + return; + + timer_base_lock_expiry(base); + raw_spin_lock_irq(&base->lock); + __run_timers(base); raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } +static void run_timer_base(int index) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[index]); + + __run_timer_base(base); +} + /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + run_timer_base(BASE_LOCAL); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { + run_timer_base(BASE_GLOBAL); + run_timer_base(BASE_DEF); - __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) - __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); + if (is_timers_nohz_active()) + tmigr_handle_remote(); + } } /* @@ -2042,19 +2414,50 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) */ static void run_local_timers(void) { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); hrtimer_run_queues(); - /* Raise the softirq only if required. */ - if (time_before(jiffies, base->next_expiry)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) - return; - /* CPU is awake, so check the deferrable base. */ - base++; - if (time_before(jiffies, base->next_expiry)) + + for (int i = 0; i < NR_BASES; i++, base++) { + /* + * Raise the softirq only if required. + * + * timer_base::next_expiry can be written by a remote CPU while + * holding the lock. If this write happens at the same time than + * the lockless local read, sanity checker could complain about + * data corruption. + * + * There are two possible situations where + * timer_base::next_expiry is written by a remote CPU: + * + * 1. 
Remote CPU expires global timers of this CPU and updates + * timer_base::next_expiry of BASE_GLOBAL afterwards in + * next_timer_interrupt() or timer_recalc_next_expiry(). The + * worst outcome is a superfluous raise of the timer softirq + * when the not yet updated value is read. + * + * 2. A new first pinned timer is enqueued by a remote CPU + * and therefore timer_base::next_expiry of BASE_LOCAL is + * updated. When this update is missed, this isn't a + * problem, as an IPI is executed nevertheless when the CPU + * was idle before. When the CPU wasn't idle but the update + * is missed, then the timer would expire one jiffy late - + * bad luck. + * + * Those unlikely corner cases where the worst outcome is only a + * one jiffy delay or a superfluous raise of the softirq are + * not that expensive as doing the check always while holding + * the lock. + * + * Possible remote writers are using WRITE_ONCE(). Local reader + * uses therefore READ_ONCE(). + */ + if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || + (i == BASE_DEF && tmigr_requires_handle_remote())) { + raise_timer_softirq(TIMER_SOFTIRQ); return; + } } - raise_softirq(TIMER_SOFTIRQ); } /* @@ -2070,149 +2473,14 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif - scheduler_tick(); + sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } -/* - * Since schedule_timeout()'s timer is defined on the stack, it must store - * the target task on the stack as well. - */ -struct process_timer { - struct timer_list timer; - struct task_struct *task; -}; - -static void process_timeout(struct timer_list *t) -{ - struct process_timer *timeout = from_timer(timeout, t, timer); - - wake_up_process(timeout->task); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have elapsed. - * The function behavior depends on the current task state - * (see also set_current_state() description): - * - * %TASK_RUNNING - the scheduler is called, but the task does not sleep - * at all. That happens because sched_submit_work() does nothing for - * tasks in %TASK_RUNNING state. - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be %TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed - * to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct process_timer timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. 
We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - __set_current_state(TASK_RUNNING); - goto out; - } - } - - expire = timeout + jiffies; - - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); - schedule(); - del_timer_sync(&timer.timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* - * Like schedule_timeout_uninterruptible(), except this task will not contribute - * to load average. 
- */ -signed long __sched schedule_timeout_idle(signed long timeout) -{ - __set_current_state(TASK_IDLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_idle); - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { @@ -2235,7 +2503,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2290,7 +2558,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2303,65 +2571,9 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -/** - * usleep_range_state - Sleep for an approximate time in a given state - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - * @state: State of the current task that will be while sleeping - * - * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. 
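The msleep() implementation removed above keeps re-sleeping for whatever schedule_timeout_uninterruptible() reports as still outstanding until the whole timeout is consumed. A userspace analogue of that loop, with nanosleep()'s remaining-time output standing in for the schedule_timeout() return value (illustrative only):

#include <errno.h>
#include <stdio.h>
#include <time.h>

/* Sleep for at least @msecs, re-arming after every interruption */
static void sleep_msecs(unsigned int msecs)
{
	struct timespec req = {
		.tv_sec  = msecs / 1000,
		.tv_nsec = (long)(msecs % 1000) * 1000000L,
	};
	struct timespec rem;

	while (nanosleep(&req, &rem) == -1 && errno == EINTR)
		req = rem;	/* keep going for the remaining time */
}

int main(void)
{
	sleep_msecs(20);
	puts("slept at least ~20ms");
	return 0;
}

The extra jiffy in the removed code's msecs_to_jiffies(msecs) + 1 serves a related purpose: it compensates for the partially elapsed current tick so the sleep never comes up short of the requested time.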
- */ -void __sched usleep_range_state(unsigned long min, unsigned long max, - unsigned int state) -{ - ktime_t exp = ktime_add_us(ktime_get(), min); - u64 delta = (u64)(max - min) * NSEC_PER_USEC; - - for (;;) { - __set_current_state(state); - /* Do not return before the requested sleep time has elapsed */ - if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) - break; - } -} -EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ed7d6ad694fb..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,12 +98,10 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); @@ -147,11 +145,15 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) # define P_ns(x) \ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ (unsigned long long)(ktime_to_ns(ts->x))) +# define P_flag(x, f) \ + SEQ_printf(m, " .%-15s: %d\n", #x, !!(ts->flags & (f))) + { struct tick_sched *ts = tick_get_tick_sched(cpu); - P(nohz_mode); + P_flag(nohz, TS_FLAG_NOHZ); + P_flag(highres, TS_FLAG_HIGHRES); P_ns(last_tick); - P(tick_stopped); + P_flag(tick_stopped, TS_FLAG_STOPPED); P(idle_jiffies); P(idle_calls); P(idle_sleeps); @@ -256,7 +258,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.9\n"); + SEQ_printf(m, "Timer List Version: v0.10\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c new file mode 100644 index 000000000000..18dda1aa782d --- /dev/null +++ b/kernel/time/timer_migration.c @@ -0,0 +1,2023 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Infrastructure for migratable timers + * + * Copyright(C) 2022 linutronix GmbH + */ +#include <linux/cpuhotplug.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/timerqueue.h> +#include <trace/events/ipi.h> +#include <linux/sched/isolation.h> + +#include "timer_migration.h" +#include "tick-internal.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/timer_migration.h> + +/* + * The timer migration mechanism is built on a hierarchy of groups. The + * lowest level group contains CPUs, the next level groups of CPU groups + * and so forth. The CPU groups are kept per node so for the normal case + * lock contention won't happen across nodes. Depending on the number of + * CPUs per node even the next level might be kept as groups of CPU groups + * per node and only the levels above cross the node topology. 
+ * + * Example topology for a two node system with 24 CPUs each. + * + * LVL 2 [GRP2:0] + * GRP1:0 = GRP1:M + * + * LVL 1 [GRP1:0] [GRP1:1] + * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5 + * + * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5] + * CPUS 0-7 8-15 16-23 24-31 32-39 40-47 + * + * The groups hold a timer queue of events sorted by expiry time. These + * queues are updated when CPUs go in idle. When they come out of idle + * ignore flag of events is set. + * + * Each group has a designated migrator CPU/group as long as a CPU/group is + * active in the group. This designated role is necessary to avoid that all + * active CPUs in a group try to migrate expired timers from other CPUs, + * which would result in massive lock bouncing. + * + * When a CPU is awake, it checks in it's own timer tick the group + * hierarchy up to the point where it is assigned the migrator role or if + * no CPU is active, it also checks the groups where no migrator is set + * (TMIGR_NONE). + * + * If it finds expired timers in one of the group queues it pulls them over + * from the idle CPU and runs the timer function. After that it updates the + * group and the parent groups if required. + * + * CPUs which go idle arm their CPU local timer hardware for the next local + * (pinned) timer event. If the next migratable timer expires after the + * next local timer or the CPU has no migratable timer pending then the + * CPU does not queue an event in the LVL0 group. If the next migratable + * timer expires before the next local timer then the CPU queues that timer + * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0 + * group. + * + * When CPU comes out of idle and when a group has at least a single active + * child, the ignore flag of the tmigr_event is set. This indicates, that + * the event is ignored even if it is still enqueued in the parent groups + * timer queue. It will be removed when touching the timer queue the next + * time. This spares locking in active path as the lock protects (after + * setup) only event information. For more information about locking, + * please read the section "Locking rules". + * + * If the CPU is the migrator of the group then it delegates that role to + * the next active CPU in the group or sets migrator to TMIGR_NONE when + * there is no active CPU in the group. This delegation needs to be + * propagated up the hierarchy so hand over from other leaves can happen at + * all hierarchy levels w/o doing a search. + * + * When the last CPU in the system goes idle, then it drops all migrator + * duties up to the top level of the hierarchy (LVL2 in the example). It + * then has to make sure, that it arms it's own local hardware timer for + * the earliest event in the system. + * + * + * Lifetime rules: + * --------------- + * + * The groups are built up at init time or when CPUs come online. They are + * not destroyed when a group becomes empty due to offlining. The group + * just won't participate in the hierarchy management anymore. Destroying + * groups would result in interesting race conditions which would just make + * the whole mechanism slow and complex. + * + * + * Locking rules: + * -------------- + * + * For setting up new groups and handling events it's required to lock both + * child and parent group. The lock ordering is always bottom up. This also + * includes the per CPU locks in struct tmigr_cpu. 
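The group counts in the example follow from a fanout of eight children per group (one bit per child in the 8-bit group masks) combined with the per-node construction described above. A rough userspace model of that sizing for the 2 x 24 CPU example; the real setup code builds the groups incrementally as CPUs come online and covers more corner cases than this sketch:

#include <stdio.h>

#define MODEL_FANOUT 8	/* one bit per child in the 8-bit group masks */

static unsigned int ceil_div(unsigned int a, unsigned int b)
{
	return (a + b - 1) / b;
}

int main(void)
{
	unsigned int nodes = 2, cpus_per_node = 24;	/* the example above */
	unsigned int per_node = cpus_per_node;
	unsigned int level = 0, groups;

	/* Levels kept per node, until each node collapses to one group */
	do {
		per_node = ceil_div(per_node, MODEL_FANOUT);
		printf("LVL %u: %u groups (%u per node)\n",
		       level++, per_node * nodes, per_node);
	} while (per_node > 1);

	/* The levels above cross the node topology */
	for (groups = nodes; groups > 1; ) {
		groups = ceil_div(groups, MODEL_FANOUT);
		printf("LVL %u: %u group(s), cross-node\n", level++, groups);
	}
	return 0;
}

Running it prints 6 groups at LVL0, 2 at LVL1 and a single cross-node group at LVL2, matching the topology sketch.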
For updating the migrator and + * active CPU/group information atomic_try_cmpxchg() is used instead and only + * the per CPU tmigr_cpu->lock is held. + * + * During the setup of groups tmigr_level_list is required. It is protected by + * @tmigr_mutex. + * + * When @timer_base->lock as well as tmigr related locks are required, the lock + * ordering is: first @timer_base->lock, afterwards tmigr related locks. + * + * + * Protection of the tmigr group state information: + * ------------------------------------------------ + * + * The state information with the list of active children and migrator needs to + * be protected by a sequence counter. It prevents a race when updates in child + * groups are propagated in changed order. The state update is performed + * lockless and group wise. The following scenario describes what happens + * without updating the sequence counter: + * + * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well + * as GRP0:1 will not change during the scenario): + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:0, GRP0:1 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = CPU0 migrator = CPU2 + * active = CPU0 active = CPU2 + * / \ / \ + * CPUs 0 1 2 3 + * active idle active idle + * + * + * 1. CPU0 goes idle. As the update is performed group wise, in the first step + * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to + * walk the hierarchy. + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:0, GRP0:1 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * --> migrator = TMIGR_NONE migrator = CPU2 + * --> active = active = CPU2 + * / \ / \ + * CPUs 0 1 2 3 + * --> idle idle active idle + * + * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of + * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also + * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the + * hierarchy to perform the needed update from their point of view. The + * currently visible state looks the following: + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:0, GRP0:1 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * --> migrator = CPU1 migrator = CPU2 + * --> active = CPU1 active = CPU2 + * / \ / \ + * CPUs 0 1 2 3 + * idle --> active active idle + * + * 3. Here is the race condition: CPU1 managed to propagate its changes (from + * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The + * active members of GRP1:0 remain unchanged after the update since it is + * still valid from CPU1 current point of view: + * + * LVL 1 [GRP1:0] + * --> migrator = GRP0:1 + * --> active = GRP0:0, GRP0:1 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = CPU1 migrator = CPU2 + * active = CPU1 active = CPU2 + * / \ / \ + * CPUs 0 1 2 3 + * idle active active idle + * + * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0. + * + * LVL 1 [GRP1:0] + * --> migrator = GRP0:1 + * --> active = GRP0:1 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = CPU1 migrator = CPU2 + * active = CPU1 active = CPU2 + * / \ / \ + * CPUs 0 1 2 3 + * idle active active idle + * + * + * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is + * active and is correctly listed as active in GRP0:0. However GRP1:0 does not + * have GRP0:0 listed as active, which is wrong. The sequence counter has been + * added to avoid inconsistent states during updates. 
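A compressed userspace model makes the role of the sequence counter easy to see: every updater derives the new state from the value it just read and bumps the counter, so a stale writer's compare-and-exchange fails and is retried with fresh data. The kernel packs the fields into union tmigr_state and updates it with atomic_try_cmpxchg(); the field widths, packing and names below are purely illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_NONE 0xffu

struct model_state {
	uint8_t  migrator;	/* childmask of the migrator, or MODEL_NONE */
	uint8_t  active;	/* one bit per active child */
	uint16_t seq;		/* bumped on every successful update */
};

static uint32_t pack(struct model_state s)
{
	return (uint32_t)s.migrator | ((uint32_t)s.active << 8) |
	       ((uint32_t)s.seq << 16);
}

static struct model_state unpack(uint32_t v)
{
	return (struct model_state){ .migrator = v & 0xff,
				     .active = (v >> 8) & 0xff,
				     .seq = v >> 16 };
}

/* Active propagation: returns nonzero when the walk can stop here */
static int model_active_up(_Atomic uint32_t *group_state, uint8_t childmask)
{
	uint32_t cur = atomic_load(group_state);
	struct model_state new;
	int walk_done;

	do {
		new = unpack(cur);
		walk_done = 1;

		if (new.migrator == MODEL_NONE) {
			new.migrator = childmask;
			walk_done = 0;	/* change must propagate upwards */
		}

		new.active |= childmask;
		new.seq++;	/* makes concurrent stale updates fail */
	} while (!atomic_compare_exchange_weak(group_state, &cur, pack(new)));

	return walk_done;
}

int main(void)
{
	_Atomic uint32_t state =
		pack((struct model_state){ .migrator = MODEL_NONE });

	printf("walk_done=%d\n", model_active_up(&state, 0x01));
	printf("walk_done=%d\n", model_active_up(&state, 0x02));
	return 0;
}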
The state is updated + * atomically only if all members, including the sequence counter, match the + * expected value (compare-and-exchange). + * + * Looking back at the previous example with the addition of the sequence + * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed + * the sequence number during the update in step 3 so the expected old value (as + * seen by CPU0 before starting the walk) does not match. + * + * Prevent race between new event and last CPU going inactive + * ---------------------------------------------------------- + * + * When the last CPU is going idle and there is a concurrent update of a new + * first global timer of an idle CPU, the group and child states have to be read + * while holding the lock in tmigr_update_events(). The following scenario shows + * what happens, when this is not done. + * + * 1. Only CPU2 is active: + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:1 + * next_expiry = KTIME_MAX + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = CPU2 + * active = active = CPU2 + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX + * / \ / \ + * CPUs 0 1 2 3 + * idle idle active idle + * + * 2. Now CPU 2 goes idle (and has no global timer, that has to be handled) and + * propagates that to GRP0:1: + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:1 + * next_expiry = KTIME_MAX + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE --> migrator = TMIGR_NONE + * active = --> active = + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX + * / \ / \ + * CPUs 0 1 2 3 + * idle idle --> idle idle + * + * 3. Now the idle state is propagated up to GRP1:0. As this is now the last + * child going idle in top level group, the expiry of the next group event + * has to be handed back to make sure no event is lost. As there is no event + * enqueued, KTIME_MAX is handed back to CPU2. + * + * LVL 1 [GRP1:0] + * --> migrator = TMIGR_NONE + * --> active = + * next_expiry = KTIME_MAX + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = TMIGR_NONE + * active = active = + * next_expiry = KTIME_MAX next_expiry = KTIME_MAX + * / \ / \ + * CPUs 0 1 2 3 + * idle idle --> idle idle + * + * 4. CPU 0 has a new timer queued from idle and it expires at TIMER0. CPU0 + * propagates that to GRP0:0: + * + * LVL 1 [GRP1:0] + * migrator = TMIGR_NONE + * active = + * next_expiry = KTIME_MAX + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = TMIGR_NONE + * active = active = + * --> next_expiry = TIMER0 next_expiry = KTIME_MAX + * / \ / \ + * CPUs 0 1 2 3 + * idle idle idle idle + * + * 5. GRP0:0 is not active, so the new timer has to be propagated to + * GRP1:0. Therefore the GRP1:0 state has to be read. When the stalled value + * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is + * handed back to CPU0, as it seems that there is still an active child in + * top level group. 
+ * + * LVL 1 [GRP1:0] + * migrator = TMIGR_NONE + * active = + * --> next_expiry = TIMER0 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = TMIGR_NONE + * active = active = + * next_expiry = TIMER0 next_expiry = KTIME_MAX + * / \ / \ + * CPUs 0 1 2 3 + * idle idle idle idle + * + * This is prevented by reading the state when holding the lock (when a new + * timer has to be propagated from idle path):: + * + * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up()) + * -------------------------- --------------------------- + * // step 3: + * cmpxchg(&GRP1:0->state); + * tmigr_update_events() { + * spin_lock(&GRP1:0->lock); + * // ... update events ... + * // hand back first expiry when GRP1:0 is idle + * spin_unlock(&GRP1:0->lock); + * // ^^^ release state modification + * } + * tmigr_update_events() { + * spin_lock(&GRP1:0->lock) + * // ^^^ acquire state modification + * group_state = atomic_read(&GRP1:0->state) + * // .... update events ... + * // hand back first expiry when GRP1:0 is idle + * spin_unlock(&GRP1:0->lock) <3> + * // ^^^ makes state visible for other + * // callers of tmigr_new_timer_up() + * } + * + * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported + * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent + * update of the group state from active path is no problem, as the upcoming CPU + * will take care of the group events. + * + * Required event and timerqueue update after a remote expiry: + * ----------------------------------------------------------- + * + * After expiring timers of a remote CPU, a walk through the hierarchy and + * update of events and timerqueues is required. It is obviously needed if there + * is a 'new' global timer but also if there is no new global timer but the + * remote CPU is still idle. + * + * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same + * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is + * also idle and has no global timer pending. CPU2 is the only active CPU and + * thus also the migrator: + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:1 + * --> timerqueue = evt-GRP0:0 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = CPU2 + * active = active = CPU2 + * groupevt.ignore = false groupevt.ignore = true + * groupevt.cpu = CPU0 groupevt.cpu = + * timerqueue = evt-CPU0, timerqueue = + * evt-CPU1 + * / \ / \ + * CPUs 0 1 2 3 + * idle idle active idle + * + * 2. CPU2 starts to expire remote timers. It starts with LVL0 group + * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with + * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It + * looks at tmigr_event::cpu struct member and expires the pending timer(s) + * of CPU0. + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:1 + * --> timerqueue = + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = CPU2 + * active = active = CPU2 + * groupevt.ignore = false groupevt.ignore = true + * --> groupevt.cpu = CPU0 groupevt.cpu = + * timerqueue = evt-CPU0, timerqueue = + * evt-CPU1 + * / \ / \ + * CPUs 0 1 2 3 + * idle idle active idle + * + * 3. Some work has to be done after expiring the timers of CPU0. If we stop + * here, then CPU1's pending global timer(s) will not expire in time and the + * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just + * been processed. 
So it is required to walk the hierarchy from CPU0's point + * of view and update it accordingly. CPU0's event will be removed from the + * timerqueue because it has no pending timer. If CPU0 would have a timer + * pending then it has to expire after CPU1's first timer because all timers + * from this period were just expired. Either way CPU1's event will be first + * in GRP0:0's timerqueue and therefore set in the CPU field of the group + * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not + * active: + * + * LVL 1 [GRP1:0] + * migrator = GRP0:1 + * active = GRP0:1 + * --> timerqueue = evt-GRP0:0 + * / \ + * LVL 0 [GRP0:0] [GRP0:1] + * migrator = TMIGR_NONE migrator = CPU2 + * active = active = CPU2 + * groupevt.ignore = false groupevt.ignore = true + * --> groupevt.cpu = CPU1 groupevt.cpu = + * --> timerqueue = evt-CPU1 timerqueue = + * / \ / \ + * CPUs 0 1 2 3 + * idle idle active idle + * + * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the + * timer(s) of CPU1. + * + * The hierarchy walk in step 3 can be skipped if the migrator notices that a + * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care + * of the group as migrator and any needed updates within the hierarchy. + */ + +static DEFINE_MUTEX(tmigr_mutex); +static struct list_head *tmigr_level_list __read_mostly; + +static unsigned int tmigr_hierarchy_levels __read_mostly; +static unsigned int tmigr_crossnode_level __read_mostly; + +static struct tmigr_group *tmigr_root; + +static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); + +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + +#define TMIGR_NONE 0xFF +#define BIT_CNT 8 + +static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) +{ + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). 
+ */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); +} + +/* + * Returns true, when @childmask corresponds to the group migrator or when the + * group is not active - so no migrator is set. + */ +static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) +{ + union tmigr_state s; + + s.state = atomic_read(&group->migr_state); + + if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) + return true; + + return false; +} + +static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) +{ + bool lonely, migrator = false; + unsigned long active; + union tmigr_state s; + + s.state = atomic_read(&group->migr_state); + + if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) + migrator = true; + + active = s.active; + lonely = bitmap_weight(&active, BIT_CNT) <= 1; + + return (migrator && lonely); +} + +static bool tmigr_check_lonely(struct tmigr_group *group) +{ + unsigned long active; + union tmigr_state s; + + s.state = atomic_read(&group->migr_state); + + active = s.active; + + return bitmap_weight(&active, BIT_CNT) <= 1; +} + +/** + * struct tmigr_walk - data required for walking the hierarchy + * @nextexp: Next CPU event expiry information which is handed into + * the timer migration code by the timer code + * (get_next_timer_interrupt()) + * @firstexp: Contains the first event expiry information when + * hierarchy is completely idle. When CPU itself was the + * last going idle, information makes sure, that CPU will + * be back in time. When using this value in the remote + * expiry case, firstexp is stored in the per CPU tmigr_cpu + * struct of CPU which expires remote timers. It is updated + * in top level group only. Be aware, there could occur a + * new top level of the hierarchy between the 'top level + * call' in tmigr_update_events() and the check for the + * parent group in walk_groups(). Then @firstexp might + * contain a value != KTIME_MAX even if it was not the + * final top level. This is not a problem, as the worst + * outcome is a CPU which might wake up a little early. + * @evt: Pointer to tmigr_event which needs to be queued (of idle + * child group) + * @childmask: groupmask of child group + * @remote: Is set, when the new timer path is executed in + * tmigr_handle_remote_cpu() + * @basej: timer base in jiffies + * @now: timer base monotonic + * @check: is set if there is the need to handle remote timers; + * required in tmigr_requires_handle_remote() only + */ +struct tmigr_walk { + u64 nextexp; + u64 firstexp; + struct tmigr_event *evt; + u8 childmask; + bool remote; + unsigned long basej; + u64 now; + bool check; +}; + +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); + +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) +{ + do { + WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); + + if (up(group, child, data)) + break; + + child = group; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. 
+ */ + group = READ_ONCE(group->parent); + data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); + } while (group); +} + +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) +{ + lockdep_assert_held(&tmc->lock); + + __walk_groups(up, data, tmc); +} + +/* + * Returns the next event of the timerqueue @group->events + * + * Removes timers with ignore flag and update next_expiry of the group. Values + * of the group event are updated in tmigr_update_events() only. + */ +static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) +{ + struct timerqueue_node *node = NULL; + struct tmigr_event *evt = NULL; + + lockdep_assert_held(&group->lock); + + WRITE_ONCE(group->next_expiry, KTIME_MAX); + + while ((node = timerqueue_getnext(&group->events))) { + evt = container_of(node, struct tmigr_event, nextevt); + + if (!READ_ONCE(evt->ignore)) { + WRITE_ONCE(group->next_expiry, evt->nextevt.expires); + return evt; + } + + /* + * Remove next timers with ignore flag, because the group lock + * is held anyway + */ + if (!timerqueue_del(&group->events, node)) + break; + } + + return NULL; +} + +/* + * Return the next event (with the expiry equal or before @now) + * + * Event, which is returned, is also removed from the queue. + */ +static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, + u64 now) +{ + struct tmigr_event *evt = tmigr_next_groupevt(group); + + if (!evt || now < evt->nextevt.expires) + return NULL; + + /* + * The event is ready to expire. Remove it and update next group event. + */ + timerqueue_del(&group->events, &evt->nextevt); + tmigr_next_groupevt(group); + + return evt; +} + +static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) +{ + struct tmigr_event *evt; + + evt = tmigr_next_groupevt(group); + + if (!evt) + return KTIME_MAX; + else + return evt->nextevt.expires; +} + +static bool tmigr_active_up(struct tmigr_group *group, + struct tmigr_group *child, + struct tmigr_walk *data) +{ + union tmigr_state curstate, newstate; + bool walk_done; + u8 childmask; + + childmask = data->childmask; + /* + * No memory barrier is required here in contrast to + * tmigr_inactive_up(), as the group state change does not depend on the + * child state. + */ + curstate.state = atomic_read(&group->migr_state); + + do { + newstate = curstate; + walk_done = true; + + if (newstate.migrator == TMIGR_NONE) { + newstate.migrator = childmask; + + /* Changes need to be propagated */ + walk_done = false; + } + + newstate.active |= childmask; + newstate.seq++; + + } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); + + trace_tmigr_group_set_cpu_active(group, newstate, childmask); + + /* + * The group is active (again). The group event might be still queued + * into the parent group's timerqueue but can now be handled by the + * migrator of this group. Therefore the ignore flag for the group event + * is updated to reflect this. + * + * The update of the ignore flag in the active path is done lockless. In + * worst case the migrator of the parent group observes the change too + * late and expires remotely all events belonging to this group. The + * lock is held while updating the ignore flag in idle path. So this + * state change will not be lost. 
+ */ + WRITE_ONCE(group->groupevt.ignore, true); + + return walk_done; +} + +static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) +{ + struct tmigr_walk data; + + data.childmask = tmc->groupmask; + + trace_tmigr_cpu_active(tmc); + + tmc->cpuevt.ignore = true; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + walk_groups(&tmigr_active_up, &data, tmc); +} + +/** + * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy + * + * Call site timer_clear_idle() is called with interrupts disabled. + */ +void tmigr_cpu_activate(void) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + if (tmigr_is_not_available(tmc)) + return; + + if (WARN_ON_ONCE(!tmc->idle)) + return; + + raw_spin_lock(&tmc->lock); + tmc->idle = false; + __tmigr_cpu_activate(tmc); + raw_spin_unlock(&tmc->lock); +} + +/* + * Returns true, if there is nothing to be propagated to the next level + * + * @data->firstexp is set to expiry of first global event of the (top level of + * the) hierarchy, but only when hierarchy is completely idle. + * + * The child and group states need to be read under the lock, to prevent a race + * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See + * also section "Prevent race between new event and last CPU going inactive" in + * the documentation at the top. + * + * This is the only place where the group event expiry value is set. + */ +static +bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, + struct tmigr_walk *data) +{ + struct tmigr_event *evt, *first_childevt; + union tmigr_state childstate, groupstate; + bool remote = data->remote; + bool walk_done = false; + bool ignore; + u64 nextexp; + + if (child) { + raw_spin_lock(&child->lock); + raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); + + childstate.state = atomic_read(&child->migr_state); + groupstate.state = atomic_read(&group->migr_state); + + if (childstate.active) { + walk_done = true; + goto unlock; + } + + first_childevt = tmigr_next_groupevt(child); + nextexp = child->next_expiry; + evt = &child->groupevt; + + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); + } else { + nextexp = data->nextexp; + + first_childevt = evt = data->evt; + ignore = evt->ignore; + + /* + * Walking the hierarchy is required in any case when a + * remote expiry was done before. This ensures to not lose + * already queued events in non active groups (see section + * "Required event and timerqueue update after a remote + * expiry" in the documentation at the top). + * + * The two call sites which are executed without a remote expiry + * before, are not prevented from propagating changes through + * the hierarchy by the return: + * - When entering this path by tmigr_new_timer(), @evt->ignore + * is never set. + * - tmigr_inactive_up() takes care of the propagation by + * itself and ignores the return value. But an immediate + * return is possible if there is a parent, sparing group + * locking at this level, because the upper walking call to + * the parent will take care about removing this event from + * within the group and update next_expiry accordingly. 
+ * + * However if there is no parent, ie: the hierarchy has only a + * single level so @group is the top level group, make sure the + * first event information of the group is updated properly and + * also handled properly, so skip this fast return path. + */ + if (ignore && !remote && group->parent) + return true; + + raw_spin_lock(&group->lock); + + childstate.state = 0; + groupstate.state = atomic_read(&group->migr_state); + } + + /* + * If the child event is already queued in the group, remove it from the + * queue when the expiry time changed only or when it could be ignored. + */ + if (timerqueue_node_queued(&evt->nextevt)) { + if ((evt->nextevt.expires == nextexp) && !ignore) { + /* Make sure not to miss a new CPU event with the same expiry */ + evt->cpu = first_childevt->cpu; + goto check_toplvl; + } + + if (!timerqueue_del(&group->events, &evt->nextevt)) + WRITE_ONCE(group->next_expiry, KTIME_MAX); + } + + if (ignore) { + /* + * When the next child event could be ignored (nextexp is + * KTIME_MAX) and there was no remote timer handling before or + * the group is already active, there is no need to walk the + * hierarchy even if there is a parent group. + * + * The other way round: even if the event could be ignored, but + * if a remote timer handling was executed before and the group + * is not active, walking the hierarchy is required to not miss + * an enqueued timer in the non active group. The enqueued timer + * of the group needs to be propagated to a higher level to + * ensure it is handled. + */ + if (!remote || groupstate.active) + walk_done = true; + } else { + evt->nextevt.expires = nextexp; + evt->cpu = first_childevt->cpu; + + if (timerqueue_add(&group->events, &evt->nextevt)) + WRITE_ONCE(group->next_expiry, nextexp); + } + +check_toplvl: + if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { + walk_done = true; + + /* + * Nothing to do when update was done during remote timer + * handling. First timer in top level group which needs to be + * handled when top level group is not active, is calculated + * directly in tmigr_handle_remote_up(). + */ + if (remote) + goto unlock; + + /* + * The top level group is idle and it has to be ensured the + * global timers are handled in time. (This could be optimized + * by keeping track of the last global scheduled event and only + * arming it on the CPU if the new event is earlier. Not sure if + * its worth the complexity.) + */ + data->firstexp = tmigr_next_groupevt_expires(group); + } + + trace_tmigr_update_events(child, group, childstate, groupstate, + nextexp); + +unlock: + raw_spin_unlock(&group->lock); + + if (child) + raw_spin_unlock(&child->lock); + + return walk_done; +} + +static bool tmigr_new_timer_up(struct tmigr_group *group, + struct tmigr_group *child, + struct tmigr_walk *data) +{ + return tmigr_update_events(group, child, data); +} + +/* + * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is + * returned, if an active CPU will handle all the timer migration hierarchy + * timers. 
+ */ +static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) +{ + struct tmigr_walk data = { .nextexp = nextexp, + .firstexp = KTIME_MAX, + .evt = &tmc->cpuevt }; + + lockdep_assert_held(&tmc->lock); + + if (tmc->remote) + return KTIME_MAX; + + trace_tmigr_cpu_new_timer(tmc); + + tmc->cpuevt.ignore = false; + data.remote = false; + + walk_groups(&tmigr_new_timer_up, &data, tmc); + + /* If there is a new first global event, make sure it is handled */ + return data.firstexp; +} + +static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, + unsigned long jif) +{ + struct timer_events tevt; + struct tmigr_walk data; + struct tmigr_cpu *tmc; + + tmc = per_cpu_ptr(&tmigr_cpu, cpu); + + raw_spin_lock_irq(&tmc->lock); + + /* + * If the remote CPU is offline then the timers have been migrated to + * another CPU. + * + * If tmigr_cpu::remote is set, at the moment another CPU already + * expires the timers of the remote CPU. + * + * If tmigr_event::ignore is set, then the CPU returns from idle and + * takes care of its timers. + * + * If the next event expires in the future, then the event has been + * updated and there are no timers to expire right now. The CPU which + * updated the event takes care when hierarchy is completely + * idle. Otherwise the migrator does it as the event is enqueued. + */ + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || + now < tmc->cpuevt.nextevt.expires) { + raw_spin_unlock_irq(&tmc->lock); + return; + } + + trace_tmigr_handle_remote_cpu(tmc); + + tmc->remote = true; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* Drop the lock to allow the remote CPU to exit idle */ + raw_spin_unlock_irq(&tmc->lock); + + if (cpu != smp_processor_id()) + timer_expire_remote(cpu); + + /* + * Lock ordering needs to be preserved - timer_base locks before tmigr + * related locks (see section "Locking rules" in the documentation at + * the top). During fetching the next timer interrupt, also tmc->lock + * needs to be held. Otherwise there is a possible race window against + * the CPU itself when it comes out of idle, updates the first timer in + * the hierarchy and goes back to idle. + * + * timer base locks are dropped as fast as possible: After checking + * whether the remote CPU went offline in the meantime and after + * fetching the next remote timer interrupt. Dropping the locks as fast + * as possible keeps the locking region small and prevents holding + * several (unnecessary) locks during walking the hierarchy for updating + * the timerqueue and group events. + */ + local_irq_disable(); + timer_lock_remote_bases(cpu); + raw_spin_lock(&tmc->lock); + + /* + * When the CPU went offline in the meantime, no hierarchy walk has to + * be done for updating the queued events, because the walk was + * already done during marking the CPU offline in the hierarchy. + * + * When the CPU is no longer idle, the CPU takes care of the timers and + * also of the timers in the hierarchy. 
+ * + * (See also section "Required event and timerqueue update after a + * remote expiry" in the documentation at the top) + */ + if (!tmc->available || !tmc->idle) { + timer_unlock_remote_bases(cpu); + goto unlock; + } + + /* next event of CPU */ + fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); + timer_unlock_remote_bases(cpu); + + data.nextexp = tevt.global; + data.firstexp = KTIME_MAX; + data.evt = &tmc->cpuevt; + data.remote = true; + + /* + * The update is done even when there is no 'new' global timer pending + * on the remote CPU (see section "Required event and timerqueue update + * after a remote expiry" in the documentation at the top) + */ + walk_groups(&tmigr_new_timer_up, &data, tmc); + +unlock: + tmc->remote = false; + raw_spin_unlock_irq(&tmc->lock); +} + +static bool tmigr_handle_remote_up(struct tmigr_group *group, + struct tmigr_group *child, + struct tmigr_walk *data) +{ + struct tmigr_event *evt; + unsigned long jif; + u8 childmask; + u64 now; + + jif = data->basej; + now = data->now; + + childmask = data->childmask; + + trace_tmigr_handle_remote(group); +again: + /* + * Handle the group only if @childmask is the migrator or if the + * group has no migrator. Otherwise the group is active and is + * handled by its own migrator. + */ + if (!tmigr_check_migrator(group, childmask)) + return true; + + raw_spin_lock_irq(&group->lock); + + evt = tmigr_next_expired_groupevt(group, now); + + if (evt) { + unsigned int remote_cpu = evt->cpu; + + raw_spin_unlock_irq(&group->lock); + + tmigr_handle_remote_cpu(remote_cpu, now, jif); + + /* check if there is another event, that needs to be handled */ + goto again; + } + + /* + * Keep track of the expiry of the first event that needs to be handled + * (group->next_expiry was updated by tmigr_next_expired_groupevt(), + * next was set by tmigr_handle_remote_cpu()). + */ + data->firstexp = group->next_expiry; + + raw_spin_unlock_irq(&group->lock); + + return false; +} + +/** + * tmigr_handle_remote() - Handle global timers of remote idle CPUs + * + * Called from the timer soft interrupt with interrupts enabled. + */ +void tmigr_handle_remote(void) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_walk data; + + if (tmigr_is_not_available(tmc)) + return; + + data.childmask = tmc->groupmask; + data.firstexp = KTIME_MAX; + + /* + * NOTE: This is a doubled check because the migrator test will be done + * in tmigr_handle_remote_up() anyway. Keep this check to speed up the + * return when nothing has to be done. + */ + if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { + /* + * If this CPU was an idle migrator, make sure to clear its wakeup + * value so it won't chase timers that have already expired elsewhere. + * This avoids endless requeue from tmigr_new_timer(). + */ + if (READ_ONCE(tmc->wakeup) == KTIME_MAX) + return; + } + + data.now = get_jiffies_update(&data.basej); + + /* + * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to + * KTIME_MAX. Even if tmc->lock is not held during the whole remote + * handling, tmc->wakeup is fine to be stale as it is called in + * interrupt context and tick_nohz_next_event() is executed in interrupt + * exit path only after processing the last pending interrupt. 
+ */ + + __walk_groups(&tmigr_handle_remote_up, &data, tmc); + + raw_spin_lock_irq(&tmc->lock); + WRITE_ONCE(tmc->wakeup, data.firstexp); + raw_spin_unlock_irq(&tmc->lock); +} + +static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, + struct tmigr_group *child, + struct tmigr_walk *data) +{ + u8 childmask; + + childmask = data->childmask; + + /* + * Handle the group only if the child is the migrator or if the group + * has no migrator. Otherwise the group is active and is handled by its + * own migrator. + */ + if (!tmigr_check_migrator(group, childmask)) + return true; + /* + * The lock is required on 32bit architectures to read the variable + * consistently with a concurrent writer. On 64bit the lock is not + * required because the read operation is not split and so it is always + * consistent. + */ + if (IS_ENABLED(CONFIG_64BIT)) { + data->firstexp = READ_ONCE(group->next_expiry); + if (data->now >= data->firstexp) { + data->check = true; + return true; + } + } else { + raw_spin_lock(&group->lock); + data->firstexp = group->next_expiry; + if (data->now >= group->next_expiry) { + data->check = true; + raw_spin_unlock(&group->lock); + return true; + } + raw_spin_unlock(&group->lock); + } + + return false; +} + +/** + * tmigr_requires_handle_remote() - Check the need of remote timer handling + * + * Must be called with interrupts disabled. + */ +bool tmigr_requires_handle_remote(void) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_walk data; + unsigned long jif; + bool ret = false; + + if (tmigr_is_not_available(tmc)) + return ret; + + data.now = get_jiffies_update(&jif); + data.childmask = tmc->groupmask; + data.firstexp = KTIME_MAX; + data.check = false; + + /* + * If the CPU is active, walk the hierarchy to check whether a remote + * expiry is required. + * + * Check is done lockless as interrupts are disabled and @tmc->idle is + * set only by the local CPU. + */ + if (!tmc->idle) { + __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); + + return data.check; + } + + /* + * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock + * is required on 32bit architectures to read the variable consistently + * with a concurrent writer. On 64bit the lock is not required because + * the read operation is not split and so it is always consistent. + */ + if (IS_ENABLED(CONFIG_64BIT)) { + if (data.now >= READ_ONCE(tmc->wakeup)) + return true; + } else { + raw_spin_lock(&tmc->lock); + if (data.now >= tmc->wakeup) + ret = true; + raw_spin_unlock(&tmc->lock); + } + + return ret; +} + +/** + * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) + * @nextexp: Next expiry of global timer (or KTIME_MAX if not) + * + * The CPU is already deactivated in the timer migration + * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() + * and thereby the timer idle path is executed once more. @tmc->wakeup + * holds the first timer, when the timer migration hierarchy is + * completely idle. + * + * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if + * nothing needs to be done. 
+ */ +u64 tmigr_cpu_new_timer(u64 nextexp) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + u64 ret; + + if (tmigr_is_not_available(tmc)) + return nextexp; + + raw_spin_lock(&tmc->lock); + + ret = READ_ONCE(tmc->wakeup); + if (nextexp != KTIME_MAX) { + if (nextexp != tmc->cpuevt.nextevt.expires || + tmc->cpuevt.ignore) { + ret = tmigr_new_timer(tmc, nextexp); + /* + * Make sure the reevaluation of timers in idle path + * will not miss an event. + */ + WRITE_ONCE(tmc->wakeup, ret); + } + } + trace_tmigr_cpu_new_timer_idle(tmc, nextexp); + raw_spin_unlock(&tmc->lock); + return ret; +} + +static bool tmigr_inactive_up(struct tmigr_group *group, + struct tmigr_group *child, + struct tmigr_walk *data) +{ + union tmigr_state curstate, newstate, childstate; + bool walk_done; + u8 childmask; + + childmask = data->childmask; + childstate.state = 0; + + /* + * The memory barrier is paired with the cmpxchg() in tmigr_active_up() + * to make sure the updates of child and group states are ordered. The + * ordering is mandatory, as the group state change depends on the child + * state. + */ + curstate.state = atomic_read_acquire(&group->migr_state); + + for (;;) { + if (child) + childstate.state = atomic_read(&child->migr_state); + + newstate = curstate; + walk_done = true; + + /* Reset active bit when the child is no longer active */ + if (!childstate.active) + newstate.active &= ~childmask; + + if (newstate.migrator == childmask) { + /* + * Find a new migrator for the group, because the child + * group is idle! + */ + if (!childstate.active) { + unsigned long new_migr_bit, active = newstate.active; + + new_migr_bit = find_first_bit(&active, BIT_CNT); + + if (new_migr_bit != BIT_CNT) { + newstate.migrator = BIT(new_migr_bit); + } else { + newstate.migrator = TMIGR_NONE; + + /* Changes need to be propagated */ + walk_done = false; + } + } + } + + newstate.seq++; + + WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); + + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); + break; + } + + /* + * The memory barrier is paired with the cmpxchg() in + * tmigr_active_up() to make sure the updates of child and group + * states are ordered. It is required only when the above + * try_cmpxchg() fails. + */ + smp_mb__after_atomic(); + } + + data->remote = false; + + /* Event Handling */ + tmigr_update_events(group, child, data); + + return walk_done; +} + +static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) +{ + struct tmigr_walk data = { .nextexp = nextexp, + .firstexp = KTIME_MAX, + .evt = &tmc->cpuevt, + .childmask = tmc->groupmask }; + + /* + * If nextexp is KTIME_MAX, the CPU event will be ignored because the + * local timer expires before the global timer, no global timer is set + * or CPU goes offline. + */ + if (nextexp != KTIME_MAX) + tmc->cpuevt.ignore = false; + + walk_groups(&tmigr_inactive_up, &data, tmc); + return data.firstexp; +} + +/** + * tmigr_cpu_deactivate() - Put current CPU into inactive state + * @nextexp: The next global timer expiry of the current CPU + * + * Must be called with interrupts disabled. + * + * Return: the next event expiry of the current CPU or the next event expiry + * from the hierarchy if this CPU is the top level migrator or the hierarchy is + * completely idle. 
+ */ +u64 tmigr_cpu_deactivate(u64 nextexp) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + u64 ret; + + if (tmigr_is_not_available(tmc)) + return nextexp; + + raw_spin_lock(&tmc->lock); + + ret = __tmigr_cpu_deactivate(tmc, nextexp); + + tmc->idle = true; + + /* + * Make sure the reevaluation of timers in idle path will not miss an + * event. + */ + WRITE_ONCE(tmc->wakeup, ret); + + trace_tmigr_cpu_idle(tmc, nextexp); + raw_spin_unlock(&tmc->lock); + return ret; +} + +/** + * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to + * go idle + * @nextevt: The next global timer expiry of the current CPU + * + * Return: + * * KTIME_MAX - when it is probable that nothing has to be done (not + * the only one in the level 0 group; and if it is the + * only one in level 0 group, but there are more than a + * single group active on the way to top level) + * * nextevt - when CPU is offline and has to handle timer on its own + * or when on the way to top in every group only a single + * child is active but @nextevt is before the lowest + * next_expiry encountered while walking up to top level. + * * next_expiry - value of lowest expiry encountered while walking groups + * if only a single child is active on each and @nextevt + * is after this lowest expiry. + */ +u64 tmigr_quick_check(u64 nextevt) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_group *group = tmc->tmgroup; + + if (tmigr_is_not_available(tmc)) + return nextevt; + + if (WARN_ON_ONCE(tmc->idle)) + return nextevt; + + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) + return KTIME_MAX; + + do { + if (!tmigr_check_lonely(group)) + return KTIME_MAX; + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. + */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); + group = group->parent; + } while (group); + + return nextevt; +} + +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. 
+ */ +static long tmigr_trigger_active(void *unused) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + WARN_ON_ONCE(!tmc->available || tmc->idle); + + return 0; +} + +static int tmigr_clear_cpu_available(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + int migrator; + u64 firstexp; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * The CPU has to handle its local events on its own when on the way + * to offline; therefore the nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } + + if (firstexp != KTIME_MAX) { + migrator = cpumask_any(tmigr_available_cpumask); + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } + + return 0; +} + +static int tmigr_set_cpu_available(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + /* Check whether CPU data was successfully initialized */ + if (WARN_ON_ONCE(!tmc->tmgroup)) + return -EINVAL; + + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } + return 0; +} + +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. 
+ */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + return 0; +} + +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + +static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, + int node) +{ + union tmigr_state s; + + raw_spin_lock_init(&group->lock); + + group->level = lvl; + group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; + + group->num_children = 0; + + s.migrator = TMIGR_NONE; + s.active = 0; + s.seq = 0; + atomic_set(&group->migr_state, s.state); + + timerqueue_init_head(&group->events); + timerqueue_init(&group->groupevt.nextevt); + group->groupevt.nextevt.expires = KTIME_MAX; + WRITE_ONCE(group->next_expiry, KTIME_MAX); + group->groupevt.ignore = true; +} + +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) +{ + struct tmigr_group *tmp, *group = NULL; + + lockdep_assert_held(&tmigr_mutex); + + /* Try to attach to an existing group first */ + list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { + /* + * If @lvl is below the cross NUMA node level, check whether + * this group belongs to the same NUMA node. + */ + if (lvl < tmigr_crossnode_level && tmp->numa_node != node) + continue; + + /* Capacity left? */ + if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) + continue; + + /* + * TODO: A possible further improvement: Make sure that all CPU + * siblings end up in the same group of the lowest level of the + * hierarchy. Rely on the topology sibling mask would be a + * reasonable solution. + */ + + group = tmp; + break; + } + + if (group) + return group; + + /* Allocate and set up a new group */ + group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); + if (!group) + return ERR_PTR(-ENOMEM); + + tmigr_init_group(group, lvl, node); + + /* Setup successful. Add it to the hierarchy */ + list_add(&group->list, &tmigr_level_list[lvl]); + trace_tmigr_group_set(group); + return group; +} + +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. 
+ */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + +static void tmigr_connect_child_parent(struct tmigr_group *child, + struct tmigr_group *parent, + bool activate) +{ + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be initialized accordingly. + */ + parent->num_children = 1; + } + + /* Connecting old root to new root? */ + if (!parent->parent && activate) { + /* + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. + */ + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); + } else { + /* Common case adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } + + /* + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). + */ + smp_store_release(&child->parent, parent); + + trace_tmigr_connect_child_parent(child); +} + +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) +{ + struct tmigr_group *group, *child, **stack; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; + + stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); + if (!stack) + return -ENOMEM; + + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); + if (IS_ERR(group)) { + err = PTR_ERR(group); + i--; + break; + } + + top = i; + stack[i] = group; + + /* + * When fewer CPUs of a system are booted than are + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. 
+ * + * The loop is aborted as soon as the highest level, which might + * be different from tmigr_hierarchy_levels, contains only a + * single group, unless the nodes mismatch below tmigr_crossnode_level + */ + if (group->parent) + break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } + + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; + + for (; i >= start_lvl; i--) { + group = stack[i]; + + if (err < 0) { + list_del(&group->list); + kfree(group); + continue; + } + + WARN_ON_ONCE(i != group->level); + + /* + * Update tmc -> group / child -> group connection + */ + if (i == 0) { + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + + tmc->tmgroup = group; + tmc->groupmask = BIT(group->num_children++); + + tmigr_init_root(group, activate); + + trace_tmigr_connect_cpu_parent(tmc); + + /* There are no children that need to be connected */ + continue; + } else { + child = stack[i - 1]; + tmigr_connect_child_parent(child, group, activate); + } + } + + if (err < 0) + goto out; + + if (activate) { + struct tmigr_walk data; + union tmigr_state state; + + /* + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. + */ + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } + + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); + } + tmigr_root = group; + } +out: + kfree(stack); + + return err; +} + +static int tmigr_add_cpu(unsigned int cpu) +{ + struct tmigr_group *old_root = tmigr_root; + int node = cpu_to_node(cpu); + int ret; + + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. 
+ */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } + + return ret; +} + +static int tmigr_cpu_prepare(unsigned int cpu) +{ + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + int ret = 0; + + /* Not first online attempt? */ + if (tmc->tmgroup) + return ret; + + raw_spin_lock_init(&tmc->lock); + timerqueue_init(&tmc->cpuevt.nextevt); + tmc->cpuevt.nextevt.expires = KTIME_MAX; + tmc->cpuevt.ignore = true; + tmc->cpuevt.cpu = cpu; + tmc->remote = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + ret = tmigr_add_cpu(cpu); + if (ret < 0) + return ret; + + if (tmc->groupmask == 0) + return -EINVAL; + + return ret; +} + +static int __init tmigr_init(void) +{ + unsigned int cpulvl, nodelvl, cpus_per_node, i; + unsigned int nnodes = num_possible_nodes(); + unsigned int ncpus = num_possible_cpus(); + int ret = -ENOMEM; + + BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); + + /* Nothing to do if running on UP */ + if (ncpus == 1) + return 0; + + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + + /* + * Calculate the required hierarchy levels. Unfortunately there is no + * reliable information available, unless all possible CPUs have been + * brought up and all NUMA nodes are populated. + * + * Estimate the number of levels with the number of possible nodes and + * the number of possible CPUs. Assume CPUs are spread evenly across + * nodes. We cannot rely on cpumask_of_node() because it only works for + * online CPUs. + */ + cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); + + /* Calc the hierarchy levels required to hold the CPUs of a node */ + cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), + ilog2(TMIGR_CHILDREN_PER_GROUP)); + + /* Calculate the extra levels to connect all nodes */ + nodelvl = DIV_ROUND_UP(order_base_2(nnodes), + ilog2(TMIGR_CHILDREN_PER_GROUP)); + + tmigr_hierarchy_levels = cpulvl + nodelvl; + + /* + * If a NUMA node spawns more than one CPU level group then the next + * level(s) of the hierarchy contains groups which handle all CPU groups + * of the same NUMA node. The level above goes across NUMA nodes. Store + * this information for the setup code to decide in which level node + * matching is no longer required. 
+ */ + tmigr_crossnode_level = cpulvl; + + tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); + if (!tmigr_level_list) + goto err; + + for (i = 0; i < tmigr_hierarchy_levels; i++) + INIT_LIST_HEAD(&tmigr_level_list[i]); + + pr_info("Timer migration: %d hierarchy levels; %d children per group;" + " %d crossnode level\n", + tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, + tmigr_crossnode_level); + + ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", + tmigr_cpu_prepare, NULL); + if (ret) + goto err; + + ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", + tmigr_set_cpu_available, tmigr_clear_cpu_available); + if (ret) + goto err; + + return 0; + +err: + pr_err("Timer migration setup failed\n"); + return ret; +} +early_initcall(tmigr_init); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h new file mode 100644 index 000000000000..70879cde6fdd --- /dev/null +++ b/kernel/time/timer_migration.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _KERNEL_TIME_MIGRATION_H +#define _KERNEL_TIME_MIGRATION_H + +/* Per group capacity. Must be a power of 2! */ +#define TMIGR_CHILDREN_PER_GROUP 8 + +/** + * struct tmigr_event - a timer event associated to a CPU + * @nextevt: The node to enqueue an event in the parent group queue + * @cpu: The CPU to which this event belongs + * @ignore: Hint whether the event could be ignored; it is set when + * CPU or group is active; + */ +struct tmigr_event { + struct timerqueue_node nextevt; + unsigned int cpu; + bool ignore; +}; + +/** + * struct tmigr_group - timer migration hierarchy group + * @lock: Lock protecting the event information and group hierarchy + * information during setup + * @parent: Pointer to the parent group. Pointer is updated when a + * new hierarchy level is added because of a CPU coming + * online the first time. Once it is set, the pointer will + * not be removed or updated. When accessing parent pointer + * lock less to decide whether to abort a propagation or + * not, it is not a problem. The worst outcome is an + * unnecessary/early CPU wake up. But do not access parent + * pointer several times in the same 'action' (like + * activation, deactivation, check for remote expiry,...) + * without holding the lock as it is not ensured that value + * will not change. + * @groupevt: Next event of the group which is only used when the + * group is !active. The group event is then queued into + * the parent timer queue. + * Ignore bit of @groupevt is set when the group is active. + * @next_expiry: Base monotonic expiry time of the next event of the + * group; It is used for the racy lockless check whether a + * remote expiry is required; it is always reliable + * @events: Timer queue for child events queued in the group + * @migr_state: State of the group (see union tmigr_state) + * @level: Hierarchy level of the group; Required during setup + * @numa_node: Required for setup only to make sure CPU and low level + * group information is NUMA local. 
It is set to NUMA node + * as long as the group level is per NUMA node (level < + * tmigr_crossnode_level); otherwise it is set to + * NUMA_NO_NODE + * @num_children: Counter of group children to make sure the group is only + * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup + * only + * @groupmask: mask of the group in the parent group; is set during + * setup and will never change; can be read lockless + * @list: List head that is added to the per level + * tmigr_level_list; is required during setup when a + * new group needs to be connected to the existing + * hierarchy groups + */ +struct tmigr_group { + raw_spinlock_t lock; + struct tmigr_group *parent; + struct tmigr_event groupevt; + u64 next_expiry; + struct timerqueue_head events; + atomic_t migr_state; + unsigned int level; + int numa_node; + unsigned int num_children; + u8 groupmask; + struct list_head list; +}; + +/** + * struct tmigr_cpu - timer migration per CPU group + * @lock: Lock protecting the tmigr_cpu group information + * @available: Indicates whether the CPU is available in the timer + * migration hierarchy; In the deactivate path it is required + * to know whether the migrator in the top level group is to + * be set offline, while a timer is pending. Then another + * available CPU needs to be notified to take over the + * migrator role. Furthermore the information is required in + * the CPU hotplug path as the CPU is able to go idle before + * the timer migration hierarchy hotplug AP is reached. During + * this phase, the CPU has to handle the global timers on its + * own and must not act as a migrator. + * @idle: Indicates whether the CPU is idle in the timer migration + * hierarchy + * @remote: Is set when timers of the CPU are expired remotely + * @tmgroup: Pointer to the parent group + * @groupmask: mask of tmigr_cpu in the parent group + * @wakeup: Stores the first timer when the timer migration + * hierarchy is completely idle and remote expiry was done; + * is returned to timer code in the idle path and is only + * used in the idle path. + * @cpuevt: CPU event which could be enqueued into the parent group + */ +struct tmigr_cpu { + raw_spinlock_t lock; + bool available; + bool idle; + bool remote; + struct tmigr_group *tmgroup; + u8 groupmask; + u64 wakeup; + struct tmigr_event cpuevt; +}; + +/** + * union tmigr_state - state of tmigr_group + * @state: Combined version of the state - only used for atomic + * read/cmpxchg function + * &anon struct: Split version of the state - only use the struct members to + * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the beginning of + * timer_migration.c. 
+ */ +union tmigr_state { + u32 state; + struct { + u8 active; + u8 migrator; + u16 seq; + } __packed; +}; + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void tmigr_handle_remote(void); +extern bool tmigr_requires_handle_remote(void); +extern void tmigr_cpu_activate(void); +extern u64 tmigr_cpu_deactivate(u64 nextevt); +extern u64 tmigr_cpu_new_timer(u64 nextevt); +extern u64 tmigr_quick_check(u64 nextevt); +#else +static inline void tmigr_handle_remote(void) { } +static inline bool tmigr_requires_handle_remote(void) { return false; } +static inline void tmigr_cpu_activate(void) { } +#endif + +#endif diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index f0d5062d9cbc..aa59919b8f2c 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,23 +15,28 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) { + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) +{ + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -49,7 +54,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -59,19 +64,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -80,55 +86,95 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = 
clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata, tk); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; + + __arch_sync_vdso_time_data(vdata); +} + +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); + + vc->clock_mode = clock_mode; + + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->monotonic_to_aux.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdso_write_end_clock(vc); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } +#endif /** * vdso_update_begin - Start of a VDSO update section @@ -144,10 +190,9 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); - unsigned long flags; + struct vdso_time_data *vdata = vdso_k_time_data; + unsigned long flags = timekeeper_lock_irqsave(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); 
vdso_write_begin(vdata); + return flags; } @@ -162,9 +207,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + __arch_sync_vdso_time_data(vdata); + timekeeper_unlock_irqrestore(flags); }
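
The following is a minimal userspace sketch (not kernel code) of how the packed union tmigr_state shown in timer_migration.h behaves when a child group goes inactive, analogous to the migrator hand-off in tmigr_inactive_up(): the departing child's bit is cleared from the active mask and, if that child was the migrator, the role moves to the first remaining active child or to a "no migrator" sentinel. The helper names, the 0xff sentinel value and the plain (non-atomic) update are assumptions made for illustration only.

/*
 * Illustrative stand-in for union tmigr_state: active mask, migrator
 * mask and sequence count packed into one 32bit word, updated without
 * atomics or locking (unlike the kernel, which uses cmpxchg on it).
 */
#include <stdint.h>
#include <stdio.h>

#define NO_MIGRATOR	0xff	/* assumed sentinel, mirroring TMIGR_NONE */

union state {
	uint32_t word;
	struct {
		uint8_t  active;	/* one bit per active child */
		uint8_t  migrator;	/* bit of the child acting as migrator */
		uint16_t seq;		/* bumped on every state update */
	};
};

/* A child (childbit) leaves the active set of this group */
static union state child_inactive(union state s, uint8_t childbit)
{
	s.active &= (uint8_t)~childbit;

	/* Hand the migrator role to the first remaining active child */
	if (s.migrator == childbit)
		s.migrator = s.active ? (uint8_t)(s.active & -s.active) : NO_MIGRATOR;

	s.seq++;
	return s;
}

int main(void)
{
	/* children 0 and 2 active, child 0 is the migrator */
	union state s = { .active = 0x05, .migrator = 0x01, .seq = 0 };

	s = child_inactive(s, 0x01);
	printf("active=%#x migrator=%#x seq=%u\n",
	       s.active, s.migrator, (unsigned int)s.seq);
	/* prints: active=0x4 migrator=0x4 seq=1 */
	return 0;
}

In the real code, tmigr_inactive_up() additionally retries the update via atomic_try_cmpxchg() and only re-selects a migrator when the child is idle in its own group; the sketch leaves both aspects out.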

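As a worked example of the level calculation in tmigr_init(), the sketch below recomputes cpulvl, nodelvl and tmigr_crossnode_level for an assumed machine with 256 possible CPUs spread over 4 NUMA nodes and 8 children per group. DIV_ROUND_UP(), order_base_2() and ilog2() are replaced by plain C stand-ins; none of this is kernel code and the machine size is an assumption for illustration.

#include <stdio.h>

#define CHILDREN_PER_GROUP	8	/* mirrors TMIGR_CHILDREN_PER_GROUP */

static unsigned int div_round_up(unsigned int a, unsigned int b)
{
	return (a + b - 1) / b;	/* DIV_ROUND_UP() equivalent */
}

static unsigned int log2_ceil(unsigned int v)	/* order_base_2() equivalent */
{
	unsigned int r = 0;

	while ((1u << r) < v)
		r++;
	return r;
}

static unsigned int log2_floor(unsigned int v)	/* ilog2() equivalent */
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int ncpus = 256, nnodes = 4;
	unsigned int cpus_per_node = div_round_up(ncpus, nnodes);		/* 64 */
	unsigned int cpulvl  = div_round_up(log2_ceil(cpus_per_node),
					    log2_floor(CHILDREN_PER_GROUP));	/* ceil(6/3) = 2 */
	unsigned int nodelvl = div_round_up(log2_ceil(nnodes),
					    log2_floor(CHILDREN_PER_GROUP));	/* ceil(2/3) = 1 */

	printf("levels=%u crossnode_level=%u\n", cpulvl + nodelvl, cpulvl);
	/* prints: levels=3 crossnode_level=2 */
	return 0;
}

With these numbers, levels 0 and 1 stay within a NUMA node (level < tmigr_crossnode_level, as checked in tmigr_init_group()), and only the third level connects the nodes.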