summaryrefslogtreecommitdiff
path: root/kernel/time
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time')
-rw-r--r--kernel/time/Kconfig33
-rw-r--r--kernel/time/Makefile13
-rw-r--r--kernel/time/alarmtimer.c176
-rw-r--r--kernel/time/clockevents.c50
-rw-r--r--kernel/time/clocksource-wdtest.c17
-rw-r--r--kernel/time/clocksource.c280
-rw-r--r--kernel/time/hrtimer.c568
-rw-r--r--kernel/time/itimer.c21
-rw-r--r--kernel/time/jiffies.c130
-rw-r--r--kernel/time/namespace.c62
-rw-r--r--kernel/time/ntp.c888
-rw-r--r--kernel/time/ntp_internal.h17
-rw-r--r--kernel/time/posix-clock.c77
-rw-r--r--kernel/time/posix-cpu-timers.c372
-rw-r--r--kernel/time/posix-stubs.c47
-rw-r--r--kernel/time/posix-timers.c1122
-rw-r--r--kernel/time/posix-timers.h10
-rw-r--r--kernel/time/sched_clock.c79
-rw-r--r--kernel/time/sleep_timeout.c377
-rw-r--r--kernel/time/test_udelay.c3
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c32
-rw-r--r--kernel/time/tick-broadcast.c152
-rw-r--r--kernel/time/tick-common.c119
-rw-r--r--kernel/time/tick-internal.h20
-rw-r--r--kernel/time/tick-oneshot.c20
-rw-r--r--kernel/time/tick-sched.c818
-rw-r--r--kernel/time/tick-sched.h101
-rw-r--r--kernel/time/time.c174
-rw-r--r--kernel/time/time_test.c5
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c1582
-rw-r--r--kernel/time/timekeeping_debug.c13
-rw-r--r--kernel/time/timekeeping_internal.h36
-rw-r--r--kernel/time/timer.c984
-rw-r--r--kernel/time/timer_list.c16
-rw-r--r--kernel/time/timer_migration.c2023
-rw-r--r--kernel/time/timer_migration.h146
-rw-r--r--kernel/time/vsyscall.c119
38 files changed, 7429 insertions, 3275 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a41753be1a2b..7c6a52f7836c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA
config ARCH_CLOCKSOURCE_INIT
bool
-# Clocksources require validation of the clocksource against the last
-# cycle update - x86/TSC misfeature
-config CLOCKSOURCE_VALIDATE_LAST_CYCLE
- bool
-
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -39,6 +34,11 @@ config GENERIC_CLOCKEVENTS_BROADCAST
bool
depends on GENERIC_CLOCKEVENTS
+# Handle broadcast in default_idle_call()
+config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+ bool
+ depends on GENERIC_CLOCKEVENTS_BROADCAST
+
# Automatically adjust the min. reprogramming time for
# clock event device
config GENERIC_CLOCKEVENTS_MIN_ADJUST
@@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE
help
Tracks idle state on behalf of RCU.
-if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
+if GENERIC_CLOCKEVENTS
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independent of this.
@@ -197,13 +197,28 @@ config HIGH_RES_TIMERS
the size of the kernel image.
config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
- int "Clocksource watchdog maximum allowable skew (in μs)"
+ int "Clocksource watchdog maximum allowable skew (in microseconds)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
- default 100
+ default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
+endif
+
+config POSIX_AUX_CLOCKS
+ bool "Enable auxiliary POSIX clocks"
+ depends on POSIX_TIMERS
+ help
+ Auxiliary POSIX clocks are clocks which can be steered
+ independently of the core timekeeper, which controls the
+ MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
+ provide e.g. lockless time accessors to independent PTP clocks
+ and other clock domains, which are not correlated to the TAI/NTP
+ notion of time.
endmenu
-endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7e875e63ff3b..f7d52d9543cc 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += time.o timer.o hrtimer.o
+
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING
+endif
+
+obj-y += time.o timer.o hrtimer.o sleep_timeout.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o alarmtimer.o
@@ -17,7 +23,10 @@ endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
-obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
+ifeq ($(CONFIG_SMP),y)
+ obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
+endif
+obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5897828b9d7e..069d93bfb0c7 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -35,7 +35,7 @@
/**
* struct alarm_base - Alarm timer bases
- * @lock: Lock for syncrhonized access to the base
+ * @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
@@ -70,21 +70,17 @@ static DEFINE_SPINLOCK(rtcdev_lock);
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
- unsigned long flags;
struct rtc_device *ret;
- spin_lock_irqsave(&rtcdev_lock, flags);
+ guard(spinlock_irqsave)(&rtcdev_lock);
ret = rtcdev;
- spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
-static int alarmtimer_rtc_add_device(struct device *dev,
- struct class_interface *class_intf)
+static int alarmtimer_rtc_add_device(struct device *dev)
{
- unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
@@ -102,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
- spin_lock_irqsave(&rtcdev_lock, flags);
- if (!IS_ERR(pdev) && !rtcdev) {
- if (!try_module_get(rtc->owner)) {
+ scoped_guard(spinlock_irqsave, &rtcdev_lock) {
+ if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ pdev = NULL;
+ } else {
ret = -1;
- goto unlock;
}
-
- rtcdev = rtc;
- /* hold a reference so it doesn't go away */
- get_device(dev);
- pdev = NULL;
- } else {
- ret = -1;
}
-unlock:
- spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
-
return ret;
}
@@ -135,7 +124,7 @@ static struct class_interface alarmtimer_rtc_interface = {
static int alarmtimer_rtc_interface_setup(void)
{
- alarmtimer_rtc_interface.class = rtc_class;
+ alarmtimer_rtc_interface.class = &rtc_class;
return class_interface_register(&alarmtimer_rtc_interface);
}
static void alarmtimer_rtc_interface_remove(void)
@@ -198,28 +187,15 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- int ret = HRTIMER_NORESTART;
- int restart = ALARMTIMER_NORESTART;
- spin_lock_irqsave(&base->lock, flags);
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock)
+ alarmtimer_dequeue(base, alarm);
if (alarm->function)
- restart = alarm->function(alarm, base->get_ktime());
-
- spin_lock_irqsave(&base->lock, flags);
- if (restart != ALARMTIMER_NORESTART) {
- hrtimer_set_expires(&alarm->timer, alarm->node.expires);
- alarmtimer_enqueue(base, alarm);
- ret = HRTIMER_RESTART;
- }
- spin_unlock_irqrestore(&base->lock, flags);
+ alarm->function(alarm, base->get_ktime());
trace_alarmtimer_fired(alarm, base->get_ktime());
- return ret;
-
+ return HRTIMER_NORESTART;
}
ktime_t alarm_expires_remaining(const struct alarm *alarm)
@@ -242,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
- int i, ret, type;
struct rtc_device *rtc;
- unsigned long flags;
struct rtc_time tm;
+ int i, ret, type;
- spin_lock_irqsave(&freezer_delta_lock, flags);
- min = freezer_delta;
- expires = freezer_expires;
- type = freezer_alarmtype;
- freezer_delta = 0;
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ scoped_guard(spinlock_irqsave, &freezer_delta_lock) {
+ min = freezer_delta;
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
+ }
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
@@ -265,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev)
struct timerqueue_node *next;
ktime_t delta;
- spin_lock_irqsave(&base->lock, flags);
- next = timerqueue_getnext(&base->timerqueue);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock)
+ next = timerqueue_getnext(&base->timerqueue);
if (!next)
continue;
delta = ktime_sub(next->expires, base->get_ktime());
@@ -291,6 +265,17 @@ static int alarmtimer_suspend(struct device *dev)
rtc_timer_cancel(rtc, &rtctimer);
rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
+
+ /*
+ * If the RTC alarm timer only supports a limited time offset, set the
+ * alarm time to the maximum supported value.
+ * The system may wake up earlier (possibly much earlier) than expected
+ * when the alarmtimer runs. This is the best the kernel can do if
+ * the alarmtimer exceeds the time that the rtc device can be programmed
+ * for.
+ */
+ min = rtc_bound_alarmtime(rtc, min);
+
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
@@ -324,10 +309,9 @@ static int alarmtimer_resume(struct device *dev)
static void
__alarm_init(struct alarm *alarm, enum alarmtimer_type type,
- enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+ void (*function)(struct alarm *, ktime_t))
{
timerqueue_init(&alarm->node);
- alarm->timer.function = alarmtimer_fired;
alarm->function = function;
alarm->type = type;
alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -340,10 +324,10 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type,
* @function: callback that is run when the alarm fires
*/
void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
- enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+ void (*function)(struct alarm *, ktime_t))
{
- hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
- HRTIMER_MODE_ABS);
+ hrtimer_setup(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
__alarm_init(alarm, type, function);
}
EXPORT_SYMBOL_GPL(alarm_init);
@@ -356,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init);
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
+ }
trace_alarmtimer_start(alarm, base->get_ktime());
}
@@ -385,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
+ guard(spinlock_irqsave)(&base->lock);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
@@ -405,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart);
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
int ret;
- spin_lock_irqsave(&base->lock, flags);
- ret = hrtimer_try_to_cancel(&alarm->timer);
- if (ret >= 0)
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ }
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
@@ -483,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
- unsigned long flags;
ktime_t delta;
switch(type) {
@@ -502,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->get_ktime());
- spin_lock_irqsave(&freezer_delta_lock, flags);
+ guard(spinlock_irqsave)(&freezer_delta_lock);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
@@ -519,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
- if (clockid == CLOCK_BOOTTIME_ALARM)
- return ALARM_BOOTTIME;
- return -1;
+
+ WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM);
+ return ALARM_BOOTTIME;
}
/**
@@ -533,34 +511,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
*
* Return: whether the timer is to be restarted
*/
-static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
- ktime_t now)
+static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
{
- struct k_itimer *ptr = container_of(alarm, struct k_itimer,
- it.alarm.alarmtimer);
- enum alarmtimer_restart result = ALARMTIMER_NORESTART;
- unsigned long flags;
- int si_private = 0;
-
- spin_lock_irqsave(&ptr->it_lock, flags);
-
- ptr->it_active = 0;
- if (ptr->it_interval)
- si_private = ++ptr->it_requeue_pending;
+ struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer);
- if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
- /*
- * Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
- */
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
- ++ptr->it_requeue_pending;
- ptr->it_active = 1;
- result = ALARMTIMER_RESTART;
- }
- spin_unlock_irqrestore(&ptr->it_lock, flags);
-
- return result;
+ guard(spinlock_irqsave)(&ptr->it_lock);
+ posix_timer_queue_signal(ptr);
}
/**
@@ -721,18 +677,14 @@ static int alarm_timer_create(struct k_itimer *new_timer)
* @now: time at the timer expiration
*
* Wakes up the task that set the alarmtimer
- *
- * Return: ALARMTIMER_NORESTART
*/
-static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
- ktime_t now)
+static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now)
{
- struct task_struct *task = (struct task_struct *)alarm->data;
+ struct task_struct *task = alarm->data;
alarm->data = NULL;
if (task)
wake_up_process(task);
- return ALARMTIMER_NORESTART;
}
/**
@@ -784,10 +736,10 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
static void
alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type,
- enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+ void (*function)(struct alarm *, ktime_t))
{
- hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid,
- HRTIMER_MODE_ABS);
+ hrtimer_setup_on_stack(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
__alarm_init(alarm, type, function);
}
@@ -823,7 +775,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
- int ret = 0;
+ int ret;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 960143b183cd..a59bc75ab7c5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
-/* Limit min_delta to a jiffie */
+/* Limit min_delta to a jiffy */
#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
/**
@@ -337,13 +337,21 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
}
/*
- * Called after a notify add to make devices available which were
- * released from the notifier call.
+ * Called after a clockevent has been added which might
+ * have replaced a current regular or broadcast device. A
+ * released normal device might be a suitable replacement
+ * for the current broadcast device. Similarly a released
+ * broadcast device might be a suitable replacement for a
+ * normal device.
*/
static void clockevents_notify_released(void)
{
struct clock_event_device *dev;
+ /*
+ * Keep iterating as long as tick_check_new_device()
+ * replaces a device.
+ */
while (!list_empty(&clockevents_released)) {
dev = list_entry(clockevents_released.next,
struct clock_event_device, list);
@@ -610,39 +618,30 @@ void clockevents_resume(void)
#ifdef CONFIG_HOTPLUG_CPU
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
/**
- * tick_offline_cpu - Take CPU out of the broadcast mechanism
+ * tick_offline_cpu - Shutdown all clock events related
+ * to this CPU and take it out of the
+ * broadcast mechanism.
* @cpu: The outgoing CPU
*
- * Called on the outgoing CPU after it took itself offline.
+ * Called by the dying CPU during teardown.
*/
void tick_offline_cpu(unsigned int cpu)
{
- raw_spin_lock(&clockevents_lock);
- tick_broadcast_offline(cpu);
- raw_spin_unlock(&clockevents_lock);
-}
-# endif
-
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
- * @cpu: The dead CPU
- */
-void tick_cleanup_dead_cpu(int cpu)
-{
struct clock_event_device *dev, *tmp;
- unsigned long flags;
- raw_spin_lock_irqsave(&clockevents_lock, flags);
+ raw_spin_lock(&clockevents_lock);
+
+ tick_broadcast_offline(cpu);
+ tick_shutdown();
- tick_shutdown(cpu);
/*
* Unregister the clock event devices which were
- * released from the users in the notify chain.
+ * released above.
*/
list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
list_del(&dev->list);
+
/*
* Now check whether the CPU has left unused per cpu devices
*/
@@ -654,12 +653,13 @@ void tick_cleanup_dead_cpu(int cpu)
list_del(&dev->list);
}
}
- raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+
+ raw_spin_unlock(&clockevents_lock);
}
#endif
#ifdef CONFIG_SYSFS
-static struct bus_type clockevents_subsys = {
+static const struct bus_type clockevents_subsys = {
.name = "clockevents",
.dev_name = "clockevent",
};
@@ -677,7 +677,7 @@ static ssize_t current_device_show(struct device *dev,
raw_spin_lock_irq(&clockevents_lock);
td = tick_get_tick_dev(dev);
if (td && td->evtdev)
- count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
+ count = sysfs_emit(buf, "%s\n", td->evtdev->name);
raw_spin_unlock_irq(&clockevents_lock);
return count;
}
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index df922f49d171..38dae590b29f 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -22,6 +22,7 @@
#include "tick-internal.h"
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
@@ -104,8 +105,8 @@ static void wdtest_ktime_clocksource_reset(void)
static int wdtest_func(void *arg)
{
unsigned long j1, j2;
+ int i, max_retries;
char *s;
- int i;
schedule_timeout_uninterruptible(holdoff * HZ);
@@ -136,21 +137,23 @@ static int wdtest_func(void *arg)
udelay(1);
j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
- WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+ WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
+ "Expected at least 1000ns, got %lu.\n", j2 - j1);
/* Verify tsc-like stability with various numbers of errors injected. */
- for (i = 0; i <= max_cswd_read_retries + 1; i++) {
- if (i <= 1 && i < max_cswd_read_retries)
+ max_retries = clocksource_get_max_watchdog_retry();
+ for (i = 0; i <= max_retries + 1; i++) {
+ if (i <= 1 && i < max_retries)
s = "";
- else if (i <= max_cswd_read_retries)
+ else if (i <= max_retries)
s = ", expect message";
else
s = ", expect clock skew";
- pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+ pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
WRITE_ONCE(wdtest_ktime_read_ndelays, i);
schedule_timeout_uninterruptible(2 * HZ);
WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
- WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+ WARN_ON_ONCE((i <= max_retries) !=
!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
wdtest_ktime_clocksource_reset();
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9cf32ccda715..a1890a073196 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -20,6 +20,18 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
+static void clocksource_enqueue(struct clocksource *cs);
+
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
+{
+ u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
+
+ if (likely(delta < cs->max_cycles))
+ return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+ return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
@@ -96,8 +108,13 @@ static int finished_booting;
static u64 suspend_start;
/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
+
+/*
* Threshold: 0.0312s, when doubled: 0.0625s.
- * Also a default for cs->uncertainty_margin when registering clocks.
*/
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
@@ -106,13 +123,30 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
+ *
+ * In other words, by default, even if a clocksource is extremely
+ * precise (for example, with a sub-nanosecond period), the maximum
+ * permissible skew between the clocksource watchdog and the clocksource
+ * under test is not permitted to go below the 500ppm minimum defined
+ * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
+ * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
-#define MAX_SKEW_USEC 100
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
+/*
+ * Default for maximum permissible skew when cs->uncertainty_margin is
+ * not specified, and the lower bound even when cs->uncertainty_margin
+ * is specified. This is also the default that is used when registering
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
+ * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
+ */
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
@@ -126,6 +160,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -138,12 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
}
static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
static void clocksource_watchdog_work(struct work_struct *work)
{
@@ -163,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work)
kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
}
+static void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+}
+
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
@@ -205,9 +241,6 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-ulong max_cswd_read_retries = 2;
-module_param(max_cswd_read_retries, ulong, 0644);
-EXPORT_SYMBOL_GPL(max_cswd_read_retries);
static int verify_n_cpus = 8;
module_param(verify_n_cpus, int, 0644);
@@ -219,11 +252,13 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
- unsigned int nretries;
- u64 wd_end, wd_end2, wd_delta;
+ int64_t md = 2 * watchdog->uncertainty_margin;
+ unsigned int nretries, max_retries;
int64_t wd_delay, wd_seq_delay;
+ u64 wd_end, wd_end2;
- for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+ max_retries = clocksource_get_max_watchdog_retry();
+ for (nretries = 0; nretries <= max_retries; nretries++) {
local_irq_disable();
*wdnow = watchdog->read(watchdog);
*csnow = cs->read(cs);
@@ -231,11 +266,9 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
wd_end2 = watchdog->read(watchdog);
local_irq_enable();
- wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
- wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
- watchdog->shift);
- if (wd_delay <= WATCHDOG_MAX_SKEW) {
- if (nretries > 1 || nretries >= max_cswd_read_retries) {
+ wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
+ if (wd_delay <= md + cs->uncertainty_margin) {
+ if (nretries > 1 && nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}
@@ -247,18 +280,17 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
* there is too much external interferences that cause
* significant delay in reading both clocksource and watchdog.
*
- * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
- * report system busy, reinit the watchdog and skip the current
+ * If consecutive WD read-back delay > md, report
+ * system busy, reinit the watchdog and skip the current
* watchdog test.
*/
- wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
- wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
- if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
+ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
+ if (wd_seq_delay > md)
goto skip_test;
}
- pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
- smp_processor_id(), watchdog->name, wd_delay, nretries);
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -278,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
{
int cpu, i, n = verify_n_cpus;
- if (n < 0) {
+ if (n < 0 || n >= num_online_cpus()) {
/* Check all of the CPUs. */
cpumask_copy(&cpus_chosen, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
@@ -291,9 +323,7 @@ static void clocksource_verify_choose_cpus(void)
return;
/* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_first(cpu_online_mask);
- if (cpu == smp_processor_id())
- cpu = cpumask_next(cpu, cpu_online_mask);
+ cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
cpumask_set_cpu(cpu, &cpus_chosen);
@@ -310,10 +340,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
+ cpu = cpumask_random(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
@@ -341,16 +368,18 @@ void clocksource_verify_percpu(struct clocksource *cs)
cpumask_clear(&cpus_ahead);
cpumask_clear(&cpus_behind);
cpus_read_lock();
- preempt_disable();
+ migrate_disable();
clocksource_verify_choose_cpus();
if (cpumask_empty(&cpus_chosen)) {
- preempt_enable();
+ migrate_enable();
cpus_read_unlock();
pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
return;
}
testcpu = smp_processor_id();
- pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ preempt_disable();
for_each_cpu(cpu, &cpus_chosen) {
if (cpu == testcpu)
continue;
@@ -363,14 +392,14 @@ void clocksource_verify_percpu(struct clocksource *cs)
delta = (csnow_end - csnow_mid) & cs->mask;
if (delta < 0)
cpumask_set_cpu(cpu, &cpus_ahead);
- delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
if (cs_nsec > cs_nsec_max)
cs_nsec_max = cs_nsec;
if (cs_nsec < cs_nsec_min)
cs_nsec_min = cs_nsec;
}
preempt_enable();
+ migrate_enable();
cpus_read_unlock();
if (!cpumask_empty(&cpus_ahead))
pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
@@ -378,19 +407,28 @@ void clocksource_verify_percpu(struct clocksource *cs)
if (!cpumask_empty(&cpus_behind))
pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
- pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+ pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
static void clocksource_watchdog(struct timer_list *unused)
{
- u64 csnow, wdnow, cslast, wdlast, delta;
+ int64_t wd_nsec, cs_nsec, interval;
+ u64 csnow, wdnow, cslast, wdlast;
int next_cpu, reset_pending;
- int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +448,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
@@ -426,12 +481,8 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
- wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
- watchdog->shift);
-
- delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+ cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
@@ -440,15 +491,44 @@ static void clocksource_watchdog(struct timer_list *unused)
if (atomic_read(&watchdog_reset_pending))
continue;
+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
+ s64 cs_wd_msec;
+ s64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
@@ -503,16 +583,14 @@ static void clocksource_watchdog(struct timer_list *unused)
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
/*
* Arm timer if not already pending: could race with concurrent
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -533,18 +611,10 @@ static inline void clocksource_stop_watchdog(void)
{
if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
return;
- del_timer(&watchdog_timer);
+ timer_delete(&watchdog_timer);
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
@@ -630,7 +700,7 @@ static int __clocksource_watchdog_kthread(void)
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
list_del_init(&cs->wd_list);
- __clocksource_change_rating(cs, 0);
+ clocksource_change_rating(cs, 0);
select = 1;
}
if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -781,7 +851,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
*/
u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
{
- u64 now, delta, nsec = 0;
+ u64 now, nsec = 0;
if (!suspend_clocksource)
return 0;
@@ -796,12 +866,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
else
now = suspend_clocksource->read(suspend_clocksource);
- if (now > suspend_start) {
- delta = clocksource_delta(now, suspend_start,
- suspend_clocksource->mask);
- nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
- suspend_clocksource->shift);
- }
+ if (now > suspend_start)
+ nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
/*
* Disable the suspend timer to save power if current clocksource is
@@ -922,6 +988,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
cs->maxadj, cs->mask,
&cs->max_cycles);
+
+ /*
+ * Threshold for detecting negative motion in clocksource_delta().
+ *
+ * Allow for 0.875 of the counter width so that overly long idle
+ * sleeps, which go slightly over mask/2, do not trigger the
+ * negative motion detection.
+ */
+ cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
}
static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
@@ -1097,14 +1172,19 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
}
/*
- * If the uncertainty margin is not specified, calculate it.
- * If both scale and freq are non-zero, calculate the clock
- * period, but bound below at 2*WATCHDOG_MAX_SKEW. However,
- * if either of scale or freq is zero, be very conservative and
- * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
- * uncertainty margin. Allow stupidly small uncertainty margins
- * to be specified by the caller for testing purposes, but warn
- * to discourage production use of this capability.
+ * If the uncertainty margin is not specified, calculate it. If
+ * both scale and freq are non-zero, calculate the clock period, but
+ * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
+ * However, if either of scale or freq is zero, be very conservative
+ * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
+ * for the uncertainty margin. Allow stupidly small uncertainty
+ * margins to be specified by the caller for testing purposes,
+ * but warn to discourage production use of this capability.
+ *
+ * Bottom line: The sum of the uncertainty margins of the
+ * watchdog clocksource and the clocksource under test will be at
+ * least 500ppm by default. For more information, please see the
+ * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
*/
if (scale && freq && !cs->uncertainty_margin) {
cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
@@ -1187,34 +1267,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-static void __clocksource_change_rating(struct clocksource *cs, int rating)
-{
- list_del(&cs->list);
- cs->rating = rating;
- clocksource_enqueue(cs);
-}
-
-/**
- * clocksource_change_rating - Change the rating of a registered clocksource
- * @cs: clocksource to be changed
- * @rating: new rating
- */
-void clocksource_change_rating(struct clocksource *cs, int rating)
-{
- unsigned long flags;
-
- mutex_lock(&clocksource_mutex);
- clocksource_watchdog_lock(&flags);
- __clocksource_change_rating(cs, rating);
- clocksource_watchdog_unlock(&flags);
-
- clocksource_select();
- clocksource_select_watchdog(false);
- clocksource_suspend_select(false);
- mutex_unlock(&clocksource_mutex);
-}
-EXPORT_SYMBOL(clocksource_change_rating);
-
/*
* Unbind clocksource @cs. Called with clocksource_mutex held
*/
@@ -1285,7 +1337,7 @@ static ssize_t current_clocksource_show(struct device *dev,
ssize_t count = 0;
mutex_lock(&clocksource_mutex);
- count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ count = sysfs_emit(buf, "%s\n", curr_clocksource->name);
mutex_unlock(&clocksource_mutex);
return count;
@@ -1415,7 +1467,7 @@ static struct attribute *clocksource_attrs[] = {
};
ATTRIBUTE_GROUPS(clocksource);
-static struct bus_type clocksource_subsys = {
+static const struct bus_type clocksource_subsys = {
.name = "clocksource",
.dev_name = "clocksource",
};
@@ -1450,7 +1502,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strlcpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..f8ea8c8fc895 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -38,6 +38,7 @@
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>
@@ -57,6 +58,9 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
+
/*
* The timer bases:
*
@@ -73,55 +77,46 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
-static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
- /* Make sure we catch unsupported clockids */
- [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
-
- [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
- [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
- [CLOCK_TAI] = HRTIMER_BASE_TAI,
-};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
/*
* Functions and macros which are different for UP/SMP systems are kept in a
@@ -144,11 +139,6 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return base == &migration_base;
-}
-
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -164,6 +154,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
unsigned long *flags)
+ __acquires(&timer->base->lock)
{
struct hrtimer_clock_base *base;
@@ -181,27 +172,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -252,8 +270,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -262,8 +280,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -273,13 +290,9 @@ again:
#else /* CONFIG_SMP */
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return false;
-}
-
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
@@ -346,7 +359,7 @@ static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
- return ((struct hrtimer *) addr)->function;
+ return ACCESS_PRIVATE((struct hrtimer *)addr, function);
}
/*
@@ -414,6 +427,11 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
debug_object_init(timer, &hrtimer_debug_descr);
}
+static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
+{
+ debug_object_init_on_stack(timer, &hrtimer_debug_descr);
+}
+
static inline void debug_hrtimer_activate(struct hrtimer *timer,
enum hrtimer_mode mode)
{
@@ -425,28 +443,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
-static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode);
-
-void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode)
-{
- debug_object_init_on_stack(timer, &hrtimer_debug_descr);
- __hrtimer_init(timer, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
-
-static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode);
-
-void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
-{
- debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
- __hrtimer_init_sleeper(sl, clock_id, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
-
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -456,17 +452,23 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
+static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
-static inline void
-debug_init(struct hrtimer *timer, clockid_t clockid,
- enum hrtimer_mode mode)
+static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
{
debug_hrtimer_init(timer);
- trace_hrtimer_init(timer, clockid, mode);
+ trace_hrtimer_setup(timer, clockid, mode);
+}
+
+static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
+ enum hrtimer_mode mode)
+{
+ debug_hrtimer_init_on_stack(timer);
+ trace_hrtimer_setup(timer, clockid, mode);
}
static inline void debug_activate(struct hrtimer *timer,
@@ -641,17 +643,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* Is the high resolution mode active ?
*/
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
cpu_base->hres_active : 0;
}
-static inline int hrtimer_hres_active(void)
-{
- return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
-}
-
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *next_timer,
ktime_t expires_next)
@@ -675,7 +672,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
* set. So we'd effectively block all timers until the T2 event
* fires.
*/
- if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(expires_next, 1);
@@ -727,8 +724,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -744,7 +739,7 @@ static void hrtimer_switch_to_hres(void)
base->hres_active = 1;
hrtimer_resolution = HIGH_RES_NSEC;
- tick_setup_sched_timer();
+ tick_setup_sched_timer(true);
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
}
@@ -785,13 +780,13 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!__hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
- if (__hrtimer_hres_active(base))
+ if (hrtimer_hres_active(base))
hrtimer_force_reprogram(base, 0);
else
hrtimer_update_next_event(base);
@@ -948,7 +943,7 @@ void clock_was_set(unsigned int bases)
cpumask_var_t mask;
int cpu;
- if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1013,26 +1008,29 @@ void hrtimers_resume_local(void)
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
/**
- * hrtimer_forward - forward the timer expiry
+ * hrtimer_forward() - forward the timer expiry
* @timer: hrtimer to forward
* @now: forward past this time
* @interval: the interval to forward
*
* Forward the timer expiry so it will expire in the future.
- * Returns the number of overruns.
*
- * Can be safely called from the callback function of @timer. If
- * called from other contexts @timer must neither be enqueued nor
- * running the callback and the caller needs to take care of
- * serialization.
+ * .. note::
+ * This only updates the timer expiry value and does not requeue the timer.
+ *
+ * There is also a variant of the function hrtimer_forward_now().
+ *
+ * Context: Can be safely called from the callback function of @timer. If called
+ * from other contexts @timer must neither be enqueued nor running the
+ * callback and the caller needs to take care of serialization.
*
- * Note: This only updates the timer expiry value and does not requeue
- * the timer.
+ * Return: The number of overruns are returned.
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
@@ -1075,13 +1073,13 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* The timer is inserted in expiry order. Insertion into the
* red black tree is O(log(n)). Must hold the base lock.
*
- * Returns 1 when the new timer is the leftmost timer in the tree.
+ * Returns true when the new timer is the leftmost timer in the tree.
*/
-static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- enum hrtimer_mode mode)
+static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
@@ -1175,7 +1173,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
/*
* CONFIG_TIME_LOW_RES indicates that the system has no way to return
* granular time values. For relative timers we add hrtimer_resolution
- * (i.e. one jiffie) to prevent short timeouts.
+ * (i.e. one jiffy) to prevent short timeouts.
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
@@ -1213,6 +1211,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1224,10 +1223,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on a unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1241,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, base->get_time());
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
tim = hrtimer_update_lowres(timer, tim, mode);
@@ -1256,8 +1261,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
@@ -1347,11 +1371,13 @@ static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
}
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
+ __acquires(&base->softirq_expiry_lock)
{
spin_lock(&base->softirq_expiry_lock);
}
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
+ __releases(&base->softirq_expiry_lock)
{
spin_unlock(&base->softirq_expiry_lock);
}
@@ -1374,13 +1400,25 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
}
}
+#ifdef CONFIG_SMP
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+#else
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+#endif
+
/*
* This function is called on PREEMPT_RT kernels when the fast path
* deletion of a timer failed because the timer callback function was
* running.
*
* This prevents priority inversion: if the soft irq thread is preempted
- * in the middle of a timer callback, then calling del_timer_sync() can
+ * in the middle of a timer callback, then calling hrtimer_cancel() can
* lead to two issues:
*
* - If the caller is on a remote CPU then it has to spin wait for the timer
@@ -1484,7 +1522,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!__hrtimer_hres_active(cpu_base))
+ if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1507,7 +1545,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (__hrtimer_hres_active(cpu_base)) {
+ if (hrtimer_hres_active(cpu_base)) {
unsigned int active;
if (!cpu_base->softirq_activated) {
@@ -1528,18 +1566,47 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- if (likely(clock_id < MAX_CLOCKS)) {
- int base = hrtimer_clock_to_base_table[clock_id];
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
+ case CLOCK_BOOTTIME:
+ return HRTIMER_BASE_BOOTTIME;
+ case CLOCK_TAI:
+ return HRTIMER_BASE_TAI;
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
+ }
+}
- if (likely(base != HRTIMER_MAX_CLOCK_BASES))
- return base;
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
+{
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return ktime_get();
+ case CLOCK_REALTIME:
+ return ktime_get_real();
+ case CLOCK_BOOTTIME:
+ return ktime_get_boottime();
+ case CLOCK_TAI:
+ return ktime_get_clocktai();
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return ktime_get();
}
- WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
- return HRTIMER_BASE_MONOTONIC;
}
-static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode)
+ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
+{
+ return __hrtimer_cb_get_time(timer->base->clockid);
+}
+EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
+
+static void __hrtimer_setup(struct hrtimer *timer,
+ enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t clock_id, enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
struct hrtimer_cpu_base *cpu_base;
@@ -1572,11 +1639,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
+
+ if (WARN_ON_ONCE(!function))
+ ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
+ else
+ ACCESS_PRIVATE(timer, function) = function;
}
/**
- * hrtimer_init - initialize a timer to the given clock
+ * hrtimer_setup - initialize a timer to the given clock
* @timer: the timer to be initialized
+ * @function: the callback function
* @clock_id: the clock to be used
* @mode: The modes which are relevant for initialization:
* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
@@ -1586,13 +1659,32 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* but the PINNED bit is ignored as pinning happens
* when the hrtimer is started
*/
-void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode)
+void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t clock_id, enum hrtimer_mode mode)
+{
+ debug_setup(timer, clock_id, mode);
+ __hrtimer_setup(timer, function, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup);
+
+/**
+ * hrtimer_setup_on_stack - initialize a timer on stack memory
+ * @timer: The timer to be initialized
+ * @function: the callback function
+ * @clock_id: The clock to be used
+ * @mode: The timer mode
+ *
+ * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
+ * memory.
+ */
+void hrtimer_setup_on_stack(struct hrtimer *timer,
+ enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t clock_id, enum hrtimer_mode mode)
{
- debug_init(timer, clock_id, mode);
- __hrtimer_init(timer, clock_id, mode);
+ debug_setup_on_stack(timer, clock_id, mode);
+ __hrtimer_setup(timer, function, clock_id, mode);
}
-EXPORT_SYMBOL_GPL(hrtimer_init);
+EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);
/*
* A timer is active, when it is enqueued into the rbtree or the
@@ -1663,7 +1755,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
- fn = timer->function;
+ fn = ACCESS_PRIVATE(timer, function);
/*
* Clear the 'is relative' flag for the TIME_LOW_RES case. If the
@@ -1753,7 +1845,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
}
}
-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+static __latent_entropy void hrtimer_run_softirq(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
@@ -1805,7 +1897,7 @@ retry:
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_timer_softirq(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1868,25 +1960,7 @@ retry:
tick_program_event(expires_next, 1);
pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
-
-/* called with interrupts disabled */
-static inline void __hrtimer_peek_ahead_timers(void)
-{
- struct tick_device *td;
-
- if (!hrtimer_hres_active())
- return;
-
- td = this_cpu_ptr(&tick_cpu_device);
- if (td && td->evtdev)
- hrtimer_interrupt(td->evtdev);
-}
-
-#else /* CONFIG_HIGH_RES_TIMERS */
-
-static inline void __hrtimer_peek_ahead_timers(void) { }
-
-#endif /* !CONFIG_HIGH_RES_TIMERS */
+#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
* Called from run_local_timers in hardirq context every jiffy
@@ -1897,7 +1971,7 @@ void hrtimer_run_queues(void)
unsigned long flags;
ktime_t now;
- if (__hrtimer_hres_active(cpu_base))
+ if (hrtimer_hres_active(cpu_base))
return;
/*
@@ -1918,7 +1992,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_timer_softirq(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1956,7 +2030,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
* Make the enqueue delivery mode check work on RT. If the sleeper
* was initialized for hard interrupt delivery, force the mode bit.
* This is a special case for hrtimer_sleepers because
- * hrtimer_init_sleeper() determines the delivery mode on RT so the
+ * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
* fiddling with this decision is avoided at the call sites.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
@@ -1966,8 +2040,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
-static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
{
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
@@ -1989,29 +2063,27 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* expiry.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
mode |= HRTIMER_MODE_HARD;
}
- __hrtimer_init(&sl->timer, clock_id, mode);
- sl->timer.function = hrtimer_wakeup;
+ __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
sl->task = current;
}
/**
- * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
* @sl: sleeper to be initialized
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
*/
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
- enum hrtimer_mode mode)
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
{
- debug_init(&sl->timer, clock_id, mode);
- __hrtimer_init_sleeper(sl, clock_id, mode);
-
+ debug_setup_on_stack(&sl->timer, clock_id, mode);
+ __hrtimer_setup_sleeper(sl, clock_id, mode);
}
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
+EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
@@ -2072,9 +2144,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct hrtimer_sleeper t;
int ret;
- hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
- HRTIMER_MODE_ABS);
- hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+ hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
+ hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -2086,14 +2157,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- u64 slack;
-
- slack = current->timer_slack_ns;
- if (dl_task(current) || rt_task(current))
- slack = 0;
- hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
+ hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -2106,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
restart = &current->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid;
- restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
@@ -2126,6 +2192,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2147,6 +2214,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2171,6 +2239,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
+ hrtimer_cpu_base_init_expiry_lock(cpu_base);
+ return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+ /* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
cpu_base->hres_active = 0;
cpu_base->hang_detected = 0;
@@ -2178,7 +2255,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
- hrtimer_cpu_base_init_expiry_lock(cpu_base);
+ cpu_base->online = 1;
return 0;
}
@@ -2214,48 +2291,33 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
{
+ int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
+ old_base = this_cpu_ptr(&hrtimer_bases);
+ new_base = &per_cpu(hrtimer_bases, ncpu);
/*
- * this BH disable ensures that raise_softirq_irqoff() does
- * not wakeup ksoftirqd (and acquire the pi-lock) while
- * holding the cpu_base lock
- */
- local_bh_disable();
- local_irq_disable();
- old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = this_cpu_ptr(&hrtimer_bases);
- /*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&old_base->lock);
+ raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
&new_base->clock_base[i]);
}
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- hrtimer_update_softirq_timer(new_base, false);
+ /* Tell the other CPU to retrigger the next event */
+ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
- raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
+ raw_spin_unlock(&old_base->lock);
- /* Check, if we got expired work to do */
- __hrtimer_peek_ahead_timers();
- local_irq_enable();
- local_bh_enable();
return 0;
}
@@ -2264,124 +2326,6 @@ int hrtimers_dead_cpu(unsigned int scpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ hrtimers_cpu_starting(smp_processor_id());
open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
-
-/**
- * schedule_hrtimeout_range_clock - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode
- * @clock_id: timer clock to be used
- */
-int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode, clockid_t clock_id)
-{
- struct hrtimer_sleeper t;
-
- /*
- * Optimize when a zero timeout value is given. It does not
- * matter whether this is an absolute or a relative time.
- */
- if (expires && *expires == 0) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
- /*
- * A NULL parameter means "infinite"
- */
- if (!expires) {
- schedule();
- return -EINTR;
- }
-
- hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
- hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
- hrtimer_sleeper_start_expires(&t, mode);
-
- if (likely(t.task))
- schedule();
-
- hrtimer_cancel(&t.timer);
- destroy_hrtimer_on_stack(&t.timer);
-
- __set_current_state(TASK_RUNNING);
-
- return !t.task ? 0 : -EINTR;
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
-
-/**
- * schedule_hrtimeout_range - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode
- *
- * Make the current task sleep until the given expiry time has
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
- * The kernel give the normal best effort behavior for "@expires+@delta",
- * but may decide to fire the timer earlier, but no earlier than @expires.
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Returns 0 when the timer has expired. If the task was woken before the
- * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
- * by an explicit wakeup, it returns -EINTR.
- */
-int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode)
-{
- return schedule_hrtimeout_range_clock(expires, delta, mode,
- CLOCK_MONOTONIC);
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
-
-/**
- * schedule_hrtimeout - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @mode: timer mode
- *
- * Make the current task sleep until the given expiry time has
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Returns 0 when the timer has expired. If the task was woken before the
- * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
- * by an explicit wakeup, it returns -EINTR.
- */
-int __sched schedule_hrtimeout(ktime_t *expires,
- const enum hrtimer_mode mode)
-{
- return schedule_hrtimeout_range(expires, 0, mode);
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 00629e658ca1..7c6110e964e7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -151,7 +151,26 @@ COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
#endif
/*
- * The timer is automagically restarted, when interval != 0
+ * Invoked from dequeue_signal() when SIG_ALRM is delivered.
+ *
+ * Restart the ITIMER_REAL timer if it is armed as periodic timer. Doing
+ * this in the signal delivery path instead of self rearming prevents a DoS
+ * with small increments in the high reolution timer case and reduces timer
+ * noise in general.
+ */
+void posixtimer_rearm_itimer(struct task_struct *tsk)
+{
+ struct hrtimer *tmr = &tsk->signal->real_timer;
+
+ if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
+ hrtimer_forward_now(tmr, tsk->signal->it_real_incr);
+ hrtimer_restart(tmr);
+ }
+}
+
+/*
+ * Interval timers are restarted in the signal delivery path. See
+ * posixtimer_rearm_itimer().
*/
enum hrtimer_restart it_real_fn(struct hrtimer *timer)
{
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index bc4db9e5ab70..d31a6d40d38d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void)
static struct clocksource refined_jiffies;
-int register_refined_jiffies(long cycles_per_second)
+void __init register_refined_jiffies(long cycles_per_second)
{
u64 nsec_per_tick, shift_hz;
long cycles_per_tick;
-
-
refined_jiffies = clocksource_jiffies;
refined_jiffies.name = "refined-jiffies";
refined_jiffies.rating++;
@@ -100,5 +98,129 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
__clocksource_register(&refined_jiffies);
- return 0;
}
+
+#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ)
+#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ)
+
+static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ)
+static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ)
+static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t)
+static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies)
+static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs)
+
+static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz,
+ sysctl_kern_to_user_int_conv_hz, false)
+static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies,
+ sysctl_user_to_kern_int_conv_userhz,
+ sysctl_kern_to_user_int_conv_userhz, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, false)
+static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax,
+ sysctl_user_to_kern_int_conv_ms,
+ sysctl_kern_to_user_int_conv_ms, true)
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ)
+ return -EINVAL;
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_userhz_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies_minmax);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos,
+ HZ, 1000l);
+}
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 0775b9ec952a..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -130,7 +129,7 @@ fail:
*
* Return: timens_for_children namespace or ERR_PTR.
*/
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns, struct time_namespace *old_ns)
{
if (!(flags & CLONE_NEWTIME))
@@ -165,26 +164,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off)
* HVCLOCK
* VVAR
*
- * The check for vdso_data->clock_mode is in the unlikely path of
+ * The check for vdso_clock->clock_mode is in the unlikely path of
* the seq begin magic. So for the non-timens case most of the time
* 'seq' is even, so the branch is not taken.
*
* If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
* update to finish and for 'seq' to become even anyway.
*
- * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
* enforces the time namespace handling path.
*/
-static void timens_setup_vdso_data(struct vdso_data *vdata,
- struct time_namespace *ns)
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
{
- struct timens_offset *offset = vdata->offset;
+ struct timens_offset *offset = vc->offset;
struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
- vdata->seq = 1;
- vdata->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
offset[CLOCK_MONOTONIC] = monotonic;
offset[CLOCK_MONOTONIC_RAW] = monotonic;
offset[CLOCK_MONOTONIC_COARSE] = monotonic;
@@ -219,7 +218,8 @@ static DEFINE_MUTEX(offset_lock);
static void timens_set_vvar_page(struct task_struct *task,
struct time_namespace *ns)
{
- struct vdso_data *vdata;
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
unsigned int i;
if (ns == &init_time_ns)
@@ -235,10 +235,16 @@ static void timens_set_vvar_page(struct task_struct *task,
goto out;
ns->frozen_offsets = true;
- vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_data(&vdata[i], ns);
+ timens_setup_vdso_clock_data(&vc[i], ns);
+
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
out:
mutex_unlock(&offset_lock);
@@ -246,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -459,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -469,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -477,9 +478,12 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 406dccb79c2b..97fa99b96dd0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,26 +18,86 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/audit.h>
+#include <linux/timekeeper_internal.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
-
-/*
- * NTP timekeeping variables:
+/**
+ * struct ntp_data - Structure holding all NTP related state
+ * @tick_usec: USER_HZ period in microseconds
+ * @tick_length: Adjusted tick length
+ * @tick_length_base: Base value for @tick_length
+ * @time_state: State of the clock synchronization
+ * @time_status: Clock status bits
+ * @time_offset: Time adjustment in nanoseconds
+ * @time_constant: PLL time constant
+ * @time_maxerror: Maximum error in microseconds holding the NTP sync distance
+ * (NTP dispersion + delay / 2)
+ * @time_esterror: Estimated error in microseconds holding NTP dispersion
+ * @time_freq: Frequency offset scaled nsecs/secs
+ * @time_reftime: Time at last adjustment in seconds
+ * @time_adjust: Adjustment value
+ * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled)
+ * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap
*
- * Note: All of the NTP state is protected by the timekeeping locks.
+ * @pps_valid: PPS signal watchdog counter
+ * @pps_tf: PPS phase median filter
+ * @pps_jitter: PPS current jitter in nanoseconds
+ * @pps_fbase: PPS beginning of the last freq interval
+ * @pps_shift: PPS current interval duration in seconds (shift value)
+ * @pps_intcnt: PPS interval counter
+ * @pps_freq: PPS frequency offset in scaled ns/s
+ * @pps_stabil: PPS current stability in scaled ns/s
+ * @pps_calcnt: PPS monitor: calibration intervals
+ * @pps_jitcnt: PPS monitor: jitter limit exceeded
+ * @pps_stbcnt: PPS monitor: stability limit exceeded
+ * @pps_errcnt: PPS monitor: calibration errors
+ *
+ * Protected by the timekeeping locks.
*/
+struct ntp_data {
+ unsigned long tick_usec;
+ u64 tick_length;
+ u64 tick_length_base;
+ int time_state;
+ int time_status;
+ s64 time_offset;
+ long time_constant;
+ long time_maxerror;
+ long time_esterror;
+ s64 time_freq;
+ time64_t time_reftime;
+ long time_adjust;
+ s64 ntp_tick_adj;
+ time64_t ntp_next_leap_sec;
+#ifdef CONFIG_NTP_PPS
+ int pps_valid;
+ long pps_tf[3];
+ long pps_jitter;
+ struct timespec64 pps_fbase;
+ int pps_shift;
+ int pps_intcnt;
+ s64 pps_freq;
+ long pps_stabil;
+ long pps_calcnt;
+ long pps_jitcnt;
+ long pps_stbcnt;
+ long pps_errcnt;
+#endif
+};
-
-/* USER_HZ period (usecs): */
-unsigned long tick_usec = USER_TICK_USEC;
-
-/* SHIFTED_HZ period (nsecs): */
-unsigned long tick_nsec;
-
-static u64 tick_length;
-static u64 tick_length_base;
+static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = {
+ [ 0 ... TIMEKEEPERS_MAX - 1 ] = {
+ .tick_usec = USER_TICK_USEC,
+ .time_state = TIME_OK,
+ .time_status = STA_UNSYNC,
+ .time_constant = 2,
+ .time_maxerror = NTP_PHASE_LIMIT,
+ .time_esterror = NTP_PHASE_LIMIT,
+ .ntp_next_leap_sec = TIME64_MAX,
+ },
+};
#define SECS_PER_DAY 86400
#define MAX_TICKADJ 500LL /* usecs */
@@ -45,46 +105,6 @@ static u64 tick_length_base;
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
#define MAX_TAI_OFFSET 100000
-/*
- * phase-lock loop variables
- */
-
-/*
- * clock synchronization status
- *
- * (TIME_ERROR prevents overwriting the CMOS clock)
- */
-static int time_state = TIME_OK;
-
-/* clock status bits: */
-static int time_status = STA_UNSYNC;
-
-/* time adjustment (nsecs): */
-static s64 time_offset;
-
-/* pll time constant: */
-static long time_constant = 2;
-
-/* maximum error (usecs): */
-static long time_maxerror = NTP_PHASE_LIMIT;
-
-/* estimated error (usecs): */
-static long time_esterror = NTP_PHASE_LIMIT;
-
-/* frequency offset (scaled nsecs/secs): */
-static s64 time_freq;
-
-/* time at last adjustment (secs): */
-static time64_t time_reftime;
-
-static long time_adjust;
-
-/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
-static s64 ntp_tick_adj;
-
-/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
-static time64_t ntp_next_leap_sec = TIME64_MAX;
-
#ifdef CONFIG_NTP_PPS
/*
@@ -101,128 +121,115 @@ static time64_t ntp_next_leap_sec = TIME64_MAX;
intervals to decrease it */
#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
-static int pps_valid; /* signal watchdog counter */
-static long pps_tf[3]; /* phase median filter */
-static long pps_jitter; /* current jitter (ns) */
-static struct timespec64 pps_fbase; /* beginning of the last freq interval */
-static int pps_shift; /* current interval duration (s) (shift) */
-static int pps_intcnt; /* interval counter */
-static s64 pps_freq; /* frequency offset (scaled ns/s) */
-static long pps_stabil; /* current stability (scaled ns/s) */
-
/*
- * PPS signal quality monitors
- */
-static long pps_calcnt; /* calibration intervals */
-static long pps_jitcnt; /* jitter limit exceeded */
-static long pps_stbcnt; /* stability limit exceeded */
-static long pps_errcnt; /* calibration errors */
-
-
-/* PPS kernel consumer compensates the whole phase error immediately.
+ * PPS kernel consumer compensates the whole phase error immediately.
* Otherwise, reduce the offset by a fixed factor times the time constant.
*/
-static inline s64 ntp_offset_chunk(s64 offset)
+static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset)
{
- if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL)
return offset;
else
- return shift_right(offset, SHIFT_PLL + time_constant);
+ return shift_right(offset, SHIFT_PLL + ntpdata->time_constant);
}
-static inline void pps_reset_freq_interval(void)
+static inline void pps_reset_freq_interval(struct ntp_data *ntpdata)
{
- /* the PPS calibration interval may end
- surprisingly early */
- pps_shift = PPS_INTMIN;
- pps_intcnt = 0;
+ /* The PPS calibration interval may end surprisingly early */
+ ntpdata->pps_shift = PPS_INTMIN;
+ ntpdata->pps_intcnt = 0;
}
/**
* pps_clear - Clears the PPS state variables
+ * @ntpdata: Pointer to ntp data
*/
-static inline void pps_clear(void)
+static inline void pps_clear(struct ntp_data *ntpdata)
{
- pps_reset_freq_interval();
- pps_tf[0] = 0;
- pps_tf[1] = 0;
- pps_tf[2] = 0;
- pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
- pps_freq = 0;
+ pps_reset_freq_interval(ntpdata);
+ ntpdata->pps_tf[0] = 0;
+ ntpdata->pps_tf[1] = 0;
+ ntpdata->pps_tf[2] = 0;
+ ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0;
+ ntpdata->pps_freq = 0;
}
-/* Decrease pps_valid to indicate that another second has passed since
- * the last PPS signal. When it reaches 0, indicate that PPS signal is
- * missing.
+/*
+ * Decrease pps_valid to indicate that another second has passed since the
+ * last PPS signal. When it reaches 0, indicate that PPS signal is missing.
*/
-static inline void pps_dec_valid(void)
+static inline void pps_dec_valid(struct ntp_data *ntpdata)
{
- if (pps_valid > 0)
- pps_valid--;
- else {
- time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
- STA_PPSWANDER | STA_PPSERROR);
- pps_clear();
+ if (ntpdata->pps_valid > 0) {
+ ntpdata->pps_valid--;
+ } else {
+ ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ pps_clear(ntpdata);
}
}
-static inline void pps_set_freq(s64 freq)
+static inline void pps_set_freq(struct ntp_data *ntpdata)
{
- pps_freq = freq;
+ ntpdata->pps_freq = ntpdata->time_freq;
}
-static inline int is_error_status(int status)
+static inline bool is_error_status(int status)
{
return (status & (STA_UNSYNC|STA_CLOCKERR))
- /* PPS signal lost when either PPS time or
- * PPS frequency synchronization requested
+ /*
+ * PPS signal lost when either PPS time or PPS frequency
+ * synchronization requested
*/
|| ((status & (STA_PPSFREQ|STA_PPSTIME))
&& !(status & STA_PPSSIGNAL))
- /* PPS jitter exceeded when
- * PPS time synchronization requested */
+ /*
+ * PPS jitter exceeded when PPS time synchronization
+ * requested
+ */
|| ((status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
- /* PPS wander exceeded or calibration error when
- * PPS frequency synchronization requested
+ /*
+ * PPS wander exceeded or calibration error when PPS
+ * frequency synchronization requested
*/
|| ((status & STA_PPSFREQ)
&& (status & (STA_PPSWANDER|STA_PPSERROR)));
}
-static inline void pps_fill_timex(struct __kernel_timex *txc)
+static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc)
{
- txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+ txc->ppsfreq = shift_right((ntpdata->pps_freq >> PPM_SCALE_INV_SHIFT) *
PPM_SCALE_INV, NTP_SCALE_SHIFT);
- txc->jitter = pps_jitter;
- if (!(time_status & STA_NANO))
- txc->jitter = pps_jitter / NSEC_PER_USEC;
- txc->shift = pps_shift;
- txc->stabil = pps_stabil;
- txc->jitcnt = pps_jitcnt;
- txc->calcnt = pps_calcnt;
- txc->errcnt = pps_errcnt;
- txc->stbcnt = pps_stbcnt;
+ txc->jitter = ntpdata->pps_jitter;
+ if (!(ntpdata->time_status & STA_NANO))
+ txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC;
+ txc->shift = ntpdata->pps_shift;
+ txc->stabil = ntpdata->pps_stabil;
+ txc->jitcnt = ntpdata->pps_jitcnt;
+ txc->calcnt = ntpdata->pps_calcnt;
+ txc->errcnt = ntpdata->pps_errcnt;
+ txc->stbcnt = ntpdata->pps_stbcnt;
}
#else /* !CONFIG_NTP_PPS */
-static inline s64 ntp_offset_chunk(s64 offset)
+static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset)
{
- return shift_right(offset, SHIFT_PLL + time_constant);
+ return shift_right(offset, SHIFT_PLL + ntpdata->time_constant);
}
-static inline void pps_reset_freq_interval(void) {}
-static inline void pps_clear(void) {}
-static inline void pps_dec_valid(void) {}
-static inline void pps_set_freq(s64 freq) {}
+static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {}
+static inline void pps_clear(struct ntp_data *ntpdata) {}
+static inline void pps_dec_valid(struct ntp_data *ntpdata) {}
+static inline void pps_set_freq(struct ntp_data *ntpdata) {}
-static inline int is_error_status(int status)
+static inline bool is_error_status(int status)
{
return status & (STA_UNSYNC|STA_CLOCKERR);
}
-static inline void pps_fill_timex(struct __kernel_timex *txc)
+static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc)
{
/* PPS is not implemented, so these are zero */
txc->ppsfreq = 0;
@@ -237,158 +244,149 @@ static inline void pps_fill_timex(struct __kernel_timex *txc)
#endif /* CONFIG_NTP_PPS */
-
-/**
- * ntp_synced - Returns 1 if the NTP status is not UNSYNC
- *
- */
-static inline int ntp_synced(void)
-{
- return !(time_status & STA_UNSYNC);
-}
-
-
-/*
- * NTP methods:
- */
-
/*
- * Update (tick_length, tick_length_base, tick_nsec), based
- * on (tick_usec, ntp_tick_adj, time_freq):
+ * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and
+ * time_freq:
*/
-static void ntp_update_frequency(void)
+static void ntp_update_frequency(struct ntp_data *ntpdata)
{
- u64 second_length;
- u64 new_base;
+ u64 second_length, new_base, tick_usec = (u64)ntpdata->tick_usec;
- second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
- << NTP_SCALE_SHIFT;
+ second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT;
- second_length += ntp_tick_adj;
- second_length += time_freq;
+ second_length += ntpdata->ntp_tick_adj;
+ second_length += ntpdata->time_freq;
- tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
/*
- * Don't wait for the next second_overflow, apply
- * the change to the tick length immediately:
+ * Don't wait for the next second_overflow, apply the change to the
+ * tick length immediately:
*/
- tick_length += new_base - tick_length_base;
- tick_length_base = new_base;
+ ntpdata->tick_length += new_base - ntpdata->tick_length_base;
+ ntpdata->tick_length_base = new_base;
}
-static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
+static inline s64 ntp_update_offset_fll(struct ntp_data *ntpdata, s64 offset64, long secs)
{
- time_status &= ~STA_MODE;
+ ntpdata->time_status &= ~STA_MODE;
if (secs < MINSEC)
return 0;
- if (!(time_status & STA_FLL) && (secs <= MAXSEC))
+ if (!(ntpdata->time_status & STA_FLL) && (secs <= MAXSEC))
return 0;
- time_status |= STA_MODE;
+ ntpdata->time_status |= STA_MODE;
return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
}
-static void ntp_update_offset(long offset)
+static void ntp_update_offset(struct ntp_data *ntpdata, long offset)
{
- s64 freq_adj;
- s64 offset64;
- long secs;
+ s64 freq_adj, offset64;
+ long secs, real_secs;
- if (!(time_status & STA_PLL))
+ if (!(ntpdata->time_status & STA_PLL))
return;
- if (!(time_status & STA_NANO)) {
+ if (!(ntpdata->time_status & STA_NANO)) {
/* Make sure the multiplication below won't overflow */
offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC);
offset *= NSEC_PER_USEC;
}
- /*
- * Scale the phase adjustment and
- * clamp to the operating range.
- */
+ /* Scale the phase adjustment and clamp to the operating range. */
offset = clamp(offset, -MAXPHASE, MAXPHASE);
/*
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- secs = (long)(__ktime_get_real_seconds() - time_reftime);
- if (unlikely(time_status & STA_FREQHOLD))
+ real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
+ secs = (long)(real_secs - ntpdata->time_reftime);
+ if (unlikely(ntpdata->time_status & STA_FREQHOLD))
secs = 0;
- time_reftime = __ktime_get_real_seconds();
+ ntpdata->time_reftime = real_secs;
offset64 = offset;
- freq_adj = ntp_update_offset_fll(offset64, secs);
+ freq_adj = ntp_update_offset_fll(ntpdata, offset64, secs);
/*
* Clamp update interval to reduce PLL gain with low
* sampling rate (e.g. intermittent network connection)
* to avoid instability.
*/
- if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
- secs = 1 << (SHIFT_PLL + 1 + time_constant);
+ if (unlikely(secs > 1 << (SHIFT_PLL + 1 + ntpdata->time_constant)))
+ secs = 1 << (SHIFT_PLL + 1 + ntpdata->time_constant);
freq_adj += (offset64 * secs) <<
- (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+ (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + ntpdata->time_constant));
- freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
+ freq_adj = min(freq_adj + ntpdata->time_freq, MAXFREQ_SCALED);
- time_freq = max(freq_adj, -MAXFREQ_SCALED);
+ ntpdata->time_freq = max(freq_adj, -MAXFREQ_SCALED);
- time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+ ntpdata->time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
}
-/**
- * ntp_clear - Clears the NTP state variables
- */
-void ntp_clear(void)
+static void __ntp_clear(struct ntp_data *ntpdata)
{
- time_adjust = 0; /* stop active adjtime() */
- time_status |= STA_UNSYNC;
- time_maxerror = NTP_PHASE_LIMIT;
- time_esterror = NTP_PHASE_LIMIT;
+ /* Stop active adjtime() */
+ ntpdata->time_adjust = 0;
+ ntpdata->time_status |= STA_UNSYNC;
+ ntpdata->time_maxerror = NTP_PHASE_LIMIT;
+ ntpdata->time_esterror = NTP_PHASE_LIMIT;
- ntp_update_frequency();
+ ntp_update_frequency(ntpdata);
- tick_length = tick_length_base;
- time_offset = 0;
+ ntpdata->tick_length = ntpdata->tick_length_base;
+ ntpdata->time_offset = 0;
- ntp_next_leap_sec = TIME64_MAX;
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
/* Clear PPS state variables */
- pps_clear();
+ pps_clear(ntpdata);
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ * @tkid: Timekeeper ID to be able to select proper ntp data array member
+ */
+void ntp_clear(unsigned int tkid)
+{
+ __ntp_clear(&tk_ntp_data[tkid]);
}
-u64 ntp_tick_length(void)
+u64 ntp_tick_length(unsigned int tkid)
{
- return tick_length;
+ return tk_ntp_data[tkid].tick_length;
}
/**
* ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ * @tkid: Timekeeper ID
*
- * Provides the time of the next leapsecond against CLOCK_REALTIME in
- * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next
+ * leap second against CLOCK_REALTIME in a ktime_t format if a
+ * leap second is pending. KTIME_MAX otherwise.
*/
-ktime_t ntp_get_next_leap(void)
+ktime_t ntp_get_next_leap(unsigned int tkid)
{
- ktime_t ret;
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+ if (tkid != TIMEKEEPER_CORE)
+ return KTIME_MAX;
- if ((time_state == TIME_INS) && (time_status & STA_INS))
- return ktime_set(ntp_next_leap_sec, 0);
- ret = KTIME_MAX;
- return ret;
+ if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS))
+ return ktime_set(ntpdata->ntp_next_leap_sec, 0);
+
+ return KTIME_MAX;
}
/*
- * this routine handles the overflow of the microsecond field
+ * This routine handles the overflow of the microsecond field
*
* The tricky bits of code to handle the accurate clock support
* were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
@@ -397,8 +395,9 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(time64_t secs)
+int second_overflow(unsigned int tkid, time64_t secs)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
s64 delta;
int leap = 0;
s32 rem;
@@ -408,87 +407,84 @@ int second_overflow(time64_t secs)
* day, the system clock is set back one second; if in leap-delete
* state, the system clock is set ahead one second.
*/
- switch (time_state) {
+ switch (ntpdata->time_state) {
case TIME_OK:
- if (time_status & STA_INS) {
- time_state = TIME_INS;
+ if (ntpdata->time_status & STA_INS) {
+ ntpdata->time_state = TIME_INS;
div_s64_rem(secs, SECS_PER_DAY, &rem);
- ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
- } else if (time_status & STA_DEL) {
- time_state = TIME_DEL;
+ ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
+ } else if (ntpdata->time_status & STA_DEL) {
+ ntpdata->time_state = TIME_DEL;
div_s64_rem(secs + 1, SECS_PER_DAY, &rem);
- ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
+ ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
}
break;
case TIME_INS:
- if (!(time_status & STA_INS)) {
- ntp_next_leap_sec = TIME64_MAX;
- time_state = TIME_OK;
- } else if (secs == ntp_next_leap_sec) {
+ if (!(ntpdata->time_status & STA_INS)) {
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
+ ntpdata->time_state = TIME_OK;
+ } else if (secs == ntpdata->ntp_next_leap_sec) {
leap = -1;
- time_state = TIME_OOP;
- printk(KERN_NOTICE
- "Clock: inserting leap second 23:59:60 UTC\n");
+ ntpdata->time_state = TIME_OOP;
+ pr_notice("Clock: inserting leap second 23:59:60 UTC\n");
}
break;
case TIME_DEL:
- if (!(time_status & STA_DEL)) {
- ntp_next_leap_sec = TIME64_MAX;
- time_state = TIME_OK;
- } else if (secs == ntp_next_leap_sec) {
+ if (!(ntpdata->time_status & STA_DEL)) {
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
+ ntpdata->time_state = TIME_OK;
+ } else if (secs == ntpdata->ntp_next_leap_sec) {
leap = 1;
- ntp_next_leap_sec = TIME64_MAX;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE
- "Clock: deleting leap second 23:59:59 UTC\n");
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
+ ntpdata->time_state = TIME_WAIT;
+ pr_notice("Clock: deleting leap second 23:59:59 UTC\n");
}
break;
case TIME_OOP:
- ntp_next_leap_sec = TIME64_MAX;
- time_state = TIME_WAIT;
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
+ ntpdata->time_state = TIME_WAIT;
break;
case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
+ if (!(ntpdata->time_status & (STA_INS | STA_DEL)))
+ ntpdata->time_state = TIME_OK;
break;
}
-
/* Bump the maxerror field */
- time_maxerror += MAXFREQ / NSEC_PER_USEC;
- if (time_maxerror > NTP_PHASE_LIMIT) {
- time_maxerror = NTP_PHASE_LIMIT;
- time_status |= STA_UNSYNC;
+ ntpdata->time_maxerror += MAXFREQ / NSEC_PER_USEC;
+ if (ntpdata->time_maxerror > NTP_PHASE_LIMIT) {
+ ntpdata->time_maxerror = NTP_PHASE_LIMIT;
+ ntpdata->time_status |= STA_UNSYNC;
}
/* Compute the phase adjustment for the next second */
- tick_length = tick_length_base;
+ ntpdata->tick_length = ntpdata->tick_length_base;
- delta = ntp_offset_chunk(time_offset);
- time_offset -= delta;
- tick_length += delta;
+ delta = ntp_offset_chunk(ntpdata, ntpdata->time_offset);
+ ntpdata->time_offset -= delta;
+ ntpdata->tick_length += delta;
/* Check PPS signal */
- pps_dec_valid();
+ pps_dec_valid(ntpdata);
- if (!time_adjust)
+ if (!ntpdata->time_adjust)
goto out;
- if (time_adjust > MAX_TICKADJ) {
- time_adjust -= MAX_TICKADJ;
- tick_length += MAX_TICKADJ_SCALED;
+ if (ntpdata->time_adjust > MAX_TICKADJ) {
+ ntpdata->time_adjust -= MAX_TICKADJ;
+ ntpdata->tick_length += MAX_TICKADJ_SCALED;
goto out;
}
- if (time_adjust < -MAX_TICKADJ) {
- time_adjust += MAX_TICKADJ;
- tick_length -= MAX_TICKADJ_SCALED;
+ if (ntpdata->time_adjust < -MAX_TICKADJ) {
+ ntpdata->time_adjust += MAX_TICKADJ;
+ ntpdata->tick_length -= MAX_TICKADJ_SCALED;
goto out;
}
- tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
- << NTP_SCALE_SHIFT;
- time_adjust = 0;
+ ntpdata->tick_length += (s64)(ntpdata->time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+ << NTP_SCALE_SHIFT;
+ ntpdata->time_adjust = 0;
out:
return leap;
@@ -611,6 +607,15 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
}
#endif
+/**
+ * ntp_synced - Tells whether the NTP status is not UNSYNC
+ * Returns: true if not UNSYNC, false otherwise
+ */
+static inline bool ntp_synced(void)
+{
+ return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
+}
+
/*
* If we have an externally synchronized Linux clock, then update RTC clock
* accordingly every ~11 minutes. Generally RTCs can only store second
@@ -660,9 +665,17 @@ rearm:
sched_sync_hw_clock(offset_nsec, res != 0);
}
-void ntp_notify_cmos_timer(void)
+void ntp_notify_cmos_timer(bool offset_set)
{
/*
+ * If the time jumped (using ADJ_SETOFFSET) cancels sync timer,
+ * which may have been running if the time was synchronized
+ * prior to the ADJ_SETOFFSET call.
+ */
+ if (offset_set)
+ hrtimer_cancel(&sync_hrtimer);
+
+ /*
* When the work is currently executed but has not yet the timer
* rearmed this queues the work immediately again. No big issue,
* just a pointless work scheduled.
@@ -673,8 +686,7 @@ void ntp_notify_cmos_timer(void)
static void __init ntp_init_cmos_sync(void)
{
- hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- sync_hrtimer.function = sync_timer_callback;
+ hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS);
}
#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
static inline void __init ntp_init_cmos_sync(void) { }
@@ -683,163 +695,156 @@ static inline void __init ntp_init_cmos_sync(void) { }
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(const struct __kernel_timex *txc)
+static inline void process_adj_status(struct ntp_data *ntpdata, const struct __kernel_timex *txc)
{
- if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
- time_state = TIME_OK;
- time_status = STA_UNSYNC;
- ntp_next_leap_sec = TIME64_MAX;
- /* restart PPS frequency calibration */
- pps_reset_freq_interval();
+ if ((ntpdata->time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+ ntpdata->time_state = TIME_OK;
+ ntpdata->time_status = STA_UNSYNC;
+ ntpdata->ntp_next_leap_sec = TIME64_MAX;
+ /* Restart PPS frequency calibration */
+ pps_reset_freq_interval(ntpdata);
}
/*
* If we turn on PLL adjustments then reset the
* reference time to current time.
*/
- if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
- time_reftime = __ktime_get_real_seconds();
+ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL))
+ ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
/* only set allowed bits */
- time_status &= STA_RONLY;
- time_status |= txc->status & ~STA_RONLY;
+ ntpdata->time_status &= STA_RONLY;
+ ntpdata->time_status |= txc->status & ~STA_RONLY;
}
-
-static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
+static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct __kernel_timex *txc,
s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
- process_adj_status(txc);
+ process_adj_status(ntpdata, txc);
if (txc->modes & ADJ_NANO)
- time_status |= STA_NANO;
+ ntpdata->time_status |= STA_NANO;
if (txc->modes & ADJ_MICRO)
- time_status &= ~STA_NANO;
+ ntpdata->time_status &= ~STA_NANO;
if (txc->modes & ADJ_FREQUENCY) {
- time_freq = txc->freq * PPM_SCALE;
- time_freq = min(time_freq, MAXFREQ_SCALED);
- time_freq = max(time_freq, -MAXFREQ_SCALED);
- /* update pps_freq */
- pps_set_freq(time_freq);
+ ntpdata->time_freq = txc->freq * PPM_SCALE;
+ ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED);
+ ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED);
+ /* Update pps_freq */
+ pps_set_freq(ntpdata);
}
if (txc->modes & ADJ_MAXERROR)
- time_maxerror = txc->maxerror;
+ ntpdata->time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT);
if (txc->modes & ADJ_ESTERROR)
- time_esterror = txc->esterror;
+ ntpdata->time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT);
if (txc->modes & ADJ_TIMECONST) {
- time_constant = txc->constant;
- if (!(time_status & STA_NANO))
- time_constant += 4;
- time_constant = min(time_constant, (long)MAXTC);
- time_constant = max(time_constant, 0l);
+ ntpdata->time_constant = clamp(txc->constant, 0, MAXTC);
+ if (!(ntpdata->time_status & STA_NANO))
+ ntpdata->time_constant += 4;
+ ntpdata->time_constant = clamp(ntpdata->time_constant, 0, MAXTC);
}
- if (txc->modes & ADJ_TAI &&
- txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
+ if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
*time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
- ntp_update_offset(txc->offset);
+ ntp_update_offset(ntpdata, txc->offset);
if (txc->modes & ADJ_TICK)
- tick_usec = txc->tick;
+ ntpdata->tick_usec = txc->tick;
if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
- ntp_update_frequency();
+ ntp_update_frequency(ntpdata);
}
-
/*
- * adjtimex mainly allows reading (and writing, if superuser) of
+ * adjtimex() mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad)
+int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
int result;
if (txc->modes & ADJ_ADJTIME) {
- long save_adjust = time_adjust;
+ long save_adjust = ntpdata->time_adjust;
if (!(txc->modes & ADJ_OFFSET_READONLY)) {
/* adjtime() is independent from ntp_adjtime() */
- time_adjust = txc->offset;
- ntp_update_frequency();
+ ntpdata->time_adjust = txc->offset;
+ ntp_update_frequency(ntpdata);
audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust);
- audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust);
+ audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, ntpdata->time_adjust);
}
txc->offset = save_adjust;
} else {
/* If there are input parameters, then process them: */
if (txc->modes) {
- audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset);
- audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq);
- audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status);
+ audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset);
+ audit_ntp_set_old(ad, AUDIT_NTP_FREQ, ntpdata->time_freq);
+ audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status);
audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai);
- audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec);
+ audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec);
- process_adjtimex_modes(txc, time_tai);
+ process_adjtimex_modes(ntpdata, txc, time_tai);
- audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset);
- audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq);
- audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status);
+ audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset);
+ audit_ntp_set_new(ad, AUDIT_NTP_FREQ, ntpdata->time_freq);
+ audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status);
audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai);
- audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec);
+ audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec);
}
- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
- NTP_SCALE_SHIFT);
- if (!(time_status & STA_NANO))
- txc->offset = (u32)txc->offset / NSEC_PER_USEC;
+ txc->offset = shift_right(ntpdata->time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT);
+ if (!(ntpdata->time_status & STA_NANO))
+ txc->offset = div_s64(txc->offset, NSEC_PER_USEC);
}
- result = time_state; /* mostly `TIME_OK' */
- /* check for errors */
- if (is_error_status(time_status))
+ result = ntpdata->time_state;
+ if (is_error_status(ntpdata->time_status))
result = TIME_ERROR;
- txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+ txc->freq = shift_right((ntpdata->time_freq >> PPM_SCALE_INV_SHIFT) *
PPM_SCALE_INV, NTP_SCALE_SHIFT);
- txc->maxerror = time_maxerror;
- txc->esterror = time_esterror;
- txc->status = time_status;
- txc->constant = time_constant;
+ txc->maxerror = ntpdata->time_maxerror;
+ txc->esterror = ntpdata->time_esterror;
+ txc->status = ntpdata->time_status;
+ txc->constant = ntpdata->time_constant;
txc->precision = 1;
txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
- txc->tick = tick_usec;
+ txc->tick = ntpdata->tick_usec;
txc->tai = *time_tai;
- /* fill PPS status fields */
- pps_fill_timex(txc);
+ /* Fill PPS status fields */
+ pps_fill_timex(ntpdata, txc);
txc->time.tv_sec = ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
- if (!(time_status & STA_NANO))
+ if (!(ntpdata->time_status & STA_NANO))
txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
/* Handle leapsec adjustments */
- if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
- if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ if (unlikely(ts->tv_sec >= ntpdata->ntp_next_leap_sec)) {
+ if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) {
result = TIME_OOP;
txc->tai++;
txc->time.tv_sec--;
}
- if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ if ((ntpdata->time_state == TIME_DEL) && (ntpdata->time_status & STA_DEL)) {
result = TIME_WAIT;
txc->tai--;
txc->time.tv_sec++;
}
- if ((time_state == TIME_OOP) &&
- (ts->tv_sec == ntp_next_leap_sec)) {
+ if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntpdata->ntp_next_leap_sec))
result = TIME_WAIT;
- }
}
return result;
@@ -847,17 +852,21 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
#ifdef CONFIG_NTP_PPS
-/* actually struct pps_normtime is good old struct timespec, but it is
+/*
+ * struct pps_normtime is basically a struct timespec, but it is
* semantically different (and it is the reason why it was invented):
* pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
- * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC)
+ */
struct pps_normtime {
s64 sec; /* seconds */
long nsec; /* nanoseconds */
};
-/* normalize the timestamp so that nsec is in the
- ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+/*
+ * Normalize the timestamp so that nsec is in the
+ * [ -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval
+ */
static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
{
struct pps_normtime norm = {
@@ -873,54 +882,57 @@ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
return norm;
}
-/* get current phase correction and jitter */
-static inline long pps_phase_filter_get(long *jitter)
+/* Get current phase correction and jitter */
+static inline long pps_phase_filter_get(struct ntp_data *ntpdata, long *jitter)
{
- *jitter = pps_tf[0] - pps_tf[1];
+ *jitter = ntpdata->pps_tf[0] - ntpdata->pps_tf[1];
if (*jitter < 0)
*jitter = -*jitter;
/* TODO: test various filters */
- return pps_tf[0];
+ return ntpdata->pps_tf[0];
}
-/* add the sample to the phase filter */
-static inline void pps_phase_filter_add(long err)
+/* Add the sample to the phase filter */
+static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err)
{
- pps_tf[2] = pps_tf[1];
- pps_tf[1] = pps_tf[0];
- pps_tf[0] = err;
+ ntpdata->pps_tf[2] = ntpdata->pps_tf[1];
+ ntpdata->pps_tf[1] = ntpdata->pps_tf[0];
+ ntpdata->pps_tf[0] = err;
}
-/* decrease frequency calibration interval length.
- * It is halved after four consecutive unstable intervals.
+/*
+ * Decrease frequency calibration interval length. It is halved after four
+ * consecutive unstable intervals.
*/
-static inline void pps_dec_freq_interval(void)
+static inline void pps_dec_freq_interval(struct ntp_data *ntpdata)
{
- if (--pps_intcnt <= -PPS_INTCOUNT) {
- pps_intcnt = -PPS_INTCOUNT;
- if (pps_shift > PPS_INTMIN) {
- pps_shift--;
- pps_intcnt = 0;
+ if (--ntpdata->pps_intcnt <= -PPS_INTCOUNT) {
+ ntpdata->pps_intcnt = -PPS_INTCOUNT;
+ if (ntpdata->pps_shift > PPS_INTMIN) {
+ ntpdata->pps_shift--;
+ ntpdata->pps_intcnt = 0;
}
}
}
-/* increase frequency calibration interval length.
- * It is doubled after four consecutive stable intervals.
+/*
+ * Increase frequency calibration interval length. It is doubled after
+ * four consecutive stable intervals.
*/
-static inline void pps_inc_freq_interval(void)
+static inline void pps_inc_freq_interval(struct ntp_data *ntpdata)
{
- if (++pps_intcnt >= PPS_INTCOUNT) {
- pps_intcnt = PPS_INTCOUNT;
- if (pps_shift < PPS_INTMAX) {
- pps_shift++;
- pps_intcnt = 0;
+ if (++ntpdata->pps_intcnt >= PPS_INTCOUNT) {
+ ntpdata->pps_intcnt = PPS_INTCOUNT;
+ if (ntpdata->pps_shift < PPS_INTMAX) {
+ ntpdata->pps_shift++;
+ ntpdata->pps_intcnt = 0;
}
}
}
-/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+/*
+ * Update clock frequency based on MONOTONIC_RAW clock PPS signal
* timestamps
*
* At the end of the calibration interval the difference between the
@@ -929,90 +941,88 @@ static inline void pps_inc_freq_interval(void)
* too long, the data are discarded.
* Returns the difference between old and new frequency values.
*/
-static long hardpps_update_freq(struct pps_normtime freq_norm)
+static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime freq_norm)
{
long delta, delta_mod;
s64 ftemp;
- /* check if the frequency interval was too long */
- if (freq_norm.sec > (2 << pps_shift)) {
- time_status |= STA_PPSERROR;
- pps_errcnt++;
- pps_dec_freq_interval();
- printk_deferred(KERN_ERR
- "hardpps: PPSERROR: interval too long - %lld s\n",
- freq_norm.sec);
+ /* Check if the frequency interval was too long */
+ if (freq_norm.sec > (2 << ntpdata->pps_shift)) {
+ ntpdata->time_status |= STA_PPSERROR;
+ ntpdata->pps_errcnt++;
+ pps_dec_freq_interval(ntpdata);
+ printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n",
+ freq_norm.sec);
return 0;
}
- /* here the raw frequency offset and wander (stability) is
- * calculated. If the wander is less than the wander threshold
- * the interval is increased; otherwise it is decreased.
+ /*
+ * Here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold the
+ * interval is increased; otherwise it is decreased.
*/
ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
freq_norm.sec);
- delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
- pps_freq = ftemp;
+ delta = shift_right(ftemp - ntpdata->pps_freq, NTP_SCALE_SHIFT);
+ ntpdata->pps_freq = ftemp;
if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
- printk_deferred(KERN_WARNING
- "hardpps: PPSWANDER: change=%ld\n", delta);
- time_status |= STA_PPSWANDER;
- pps_stbcnt++;
- pps_dec_freq_interval();
- } else { /* good sample */
- pps_inc_freq_interval();
+ printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta);
+ ntpdata->time_status |= STA_PPSWANDER;
+ ntpdata->pps_stbcnt++;
+ pps_dec_freq_interval(ntpdata);
+ } else {
+ /* Good sample */
+ pps_inc_freq_interval(ntpdata);
}
- /* the stability metric is calculated as the average of recent
- * frequency changes, but is used only for performance
- * monitoring
+ /*
+ * The stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance monitoring
*/
delta_mod = delta;
if (delta_mod < 0)
delta_mod = -delta_mod;
- pps_stabil += (div_s64(((s64)delta_mod) <<
- (NTP_SCALE_SHIFT - SHIFT_USEC),
- NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
-
- /* if enabled, the system clock frequency is updated */
- if ((time_status & STA_PPSFREQ) != 0 &&
- (time_status & STA_FREQHOLD) == 0) {
- time_freq = pps_freq;
- ntp_update_frequency();
+ ntpdata->pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC),
+ NSEC_PER_USEC) - ntpdata->pps_stabil) >> PPS_INTMIN;
+
+ /* If enabled, the system clock frequency is updated */
+ if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) {
+ ntpdata->time_freq = ntpdata->pps_freq;
+ ntp_update_frequency(ntpdata);
}
return delta;
}
-/* correct REALTIME clock phase error against PPS signal */
-static void hardpps_update_phase(long error)
+/* Correct REALTIME clock phase error against PPS signal */
+static void hardpps_update_phase(struct ntp_data *ntpdata, long error)
{
long correction = -error;
long jitter;
- /* add the sample to the median filter */
- pps_phase_filter_add(correction);
- correction = pps_phase_filter_get(&jitter);
+ /* Add the sample to the median filter */
+ pps_phase_filter_add(ntpdata, correction);
+ correction = pps_phase_filter_get(ntpdata, &jitter);
- /* Nominal jitter is due to PPS signal noise. If it exceeds the
+ /*
+ * Nominal jitter is due to PPS signal noise. If it exceeds the
* threshold, the sample is discarded; otherwise, if so enabled,
* the time offset is updated.
*/
- if (jitter > (pps_jitter << PPS_POPCORN)) {
- printk_deferred(KERN_WARNING
- "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
- jitter, (pps_jitter << PPS_POPCORN));
- time_status |= STA_PPSJITTER;
- pps_jitcnt++;
- } else if (time_status & STA_PPSTIME) {
- /* correct the time using the phase offset */
- time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
- NTP_INTERVAL_FREQ);
- /* cancel running adjtime() */
- time_adjust = 0;
+ if (jitter > (ntpdata->pps_jitter << PPS_POPCORN)) {
+ printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (ntpdata->pps_jitter << PPS_POPCORN));
+ ntpdata->time_status |= STA_PPSJITTER;
+ ntpdata->pps_jitcnt++;
+ } else if (ntpdata->time_status & STA_PPSTIME) {
+ /* Correct the time using the phase offset */
+ ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+ NTP_INTERVAL_FREQ);
+ /* Cancel running adjtime() */
+ ntpdata->time_adjust = 0;
}
- /* update jitter */
- pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+ /* Update jitter */
+ ntpdata->pps_jitter += (jitter - ntpdata->pps_jitter) >> PPS_INTMIN;
}
/*
@@ -1029,68 +1039,70 @@ static void hardpps_update_phase(long error)
*/
void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
struct pps_normtime pts_norm, freq_norm;
pts_norm = pps_normalize_ts(*phase_ts);
- /* clear the error bits, they will be set again if needed */
- time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+ /* Clear the error bits, they will be set again if needed */
+ ntpdata->time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
/* indicate signal presence */
- time_status |= STA_PPSSIGNAL;
- pps_valid = PPS_VALID;
+ ntpdata->time_status |= STA_PPSSIGNAL;
+ ntpdata->pps_valid = PPS_VALID;
- /* when called for the first time,
- * just start the frequency interval */
- if (unlikely(pps_fbase.tv_sec == 0)) {
- pps_fbase = *raw_ts;
+ /*
+ * When called for the first time, just start the frequency
+ * interval
+ */
+ if (unlikely(ntpdata->pps_fbase.tv_sec == 0)) {
+ ntpdata->pps_fbase = *raw_ts;
return;
}
- /* ok, now we have a base for frequency calculation */
- freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
-
- /* check that the signal is in the range
- * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
- if ((freq_norm.sec == 0) ||
- (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
- (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
- time_status |= STA_PPSJITTER;
- /* restart the frequency calibration interval */
- pps_fbase = *raw_ts;
+ /* Ok, now we have a base for frequency calculation */
+ freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, ntpdata->pps_fbase));
+
+ /*
+ * Check that the signal is in the range
+ * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it
+ */
+ if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+ (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+ ntpdata->time_status |= STA_PPSJITTER;
+ /* Restart the frequency calibration interval */
+ ntpdata->pps_fbase = *raw_ts;
printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
return;
}
- /* signal is ok */
-
- /* check if the current frequency interval is finished */
- if (freq_norm.sec >= (1 << pps_shift)) {
- pps_calcnt++;
- /* restart the frequency calibration interval */
- pps_fbase = *raw_ts;
- hardpps_update_freq(freq_norm);
+ /* Signal is ok. Check if the current frequency interval is finished */
+ if (freq_norm.sec >= (1 << ntpdata->pps_shift)) {
+ ntpdata->pps_calcnt++;
+ /* Restart the frequency calibration interval */
+ ntpdata->pps_fbase = *raw_ts;
+ hardpps_update_freq(ntpdata, freq_norm);
}
- hardpps_update_phase(pts_norm.nsec);
+ hardpps_update_phase(ntpdata, pts_norm.nsec);
}
#endif /* CONFIG_NTP_PPS */
static int __init ntp_tick_adj_setup(char *str)
{
- int rc = kstrtos64(str, 0, &ntp_tick_adj);
+ int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj);
if (rc)
return rc;
- ntp_tick_adj <<= NTP_SCALE_SHIFT;
+ tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}
-
__setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
- ntp_clear();
+ for (int id = 0; id < TIMEKEEPERS_MAX; id++)
+ __ntp_clear(tk_ntp_data + id);
ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 23d1b74c3065..7084d839c207 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,20 +3,19 @@
#define _LINUX_NTP_INTERNAL_H
extern void ntp_init(void);
-extern void ntp_clear(void);
+extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
-extern u64 ntp_tick_length(void);
-extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct __kernel_timex *txc,
- const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad);
+extern u64 ntp_tick_length(unsigned int tkid);
+extern ktime_t ntp_get_next_leap(unsigned int tkid);
+extern int second_overflow(unsigned int tkid, time64_t secs);
+extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
-extern void ntp_notify_cmos_timer(void);
+extern void ntp_notify_cmos_timer(bool offset_set);
#else
-static inline void ntp_notify_cmos_timer(void) { }
+static inline void ntp_notify_cmos_timer(bool offset_set) { }
#endif
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..101a0f7c43e0 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
*/
static struct posix_clock *get_posix_clock(struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk = pccontext->clk;
down_read(&clk->rwsem);
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
static ssize_t posix_clock_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -EINVAL;
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return -ENODEV;
if (clk->ops.read)
- err = clk->ops.read(clk, fp->f_flags, buf, count);
+ err = clk->ops.read(pccontext, fp->f_flags, buf, count);
put_posix_clock(clk);
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
__poll_t result = 0;
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
return EPOLLERR;
if (clk->ops.poll)
- result = clk->ops.poll(clk, fp, wait);
+ result = clk->ops.poll(pccontext, fp, wait);
put_posix_clock(clk);
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
static long posix_clock_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -79,37 +83,19 @@ static long posix_clock_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
return err;
}
-#ifdef CONFIG_COMPAT
-static long posix_clock_compat_ioctl(struct file *fp,
- unsigned int cmd, unsigned long arg)
-{
- struct posix_clock *clk = get_posix_clock(fp);
- int err = -ENOTTY;
-
- if (!clk)
- return -ENODEV;
-
- if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
-
- put_posix_clock(clk);
-
- return err;
-}
-#endif
-
static int posix_clock_open(struct inode *inode, struct file *fp)
{
int err;
struct posix_clock *clk =
container_of(inode->i_cdev, struct posix_clock, cdev);
+ struct posix_clock_context *pccontext;
down_read(&clk->rwsem);
@@ -117,15 +103,24 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = -ENODEV;
goto out;
}
- if (clk->ops.open)
- err = clk->ops.open(clk, fp->f_mode);
- else
- err = 0;
-
- if (!err) {
- get_device(clk->dev);
- fp->private_data = clk;
+ pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+ if (!pccontext) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pccontext->clk = clk;
+ pccontext->fp = fp;
+ if (clk->ops.open) {
+ err = clk->ops.open(pccontext, fp->f_mode);
+ if (err) {
+ kfree(pccontext);
+ goto out;
+ }
}
+
+ fp->private_data = pccontext;
+ get_device(clk->dev);
+ err = 0;
out:
up_read(&clk->rwsem);
return err;
@@ -133,14 +128,20 @@ out:
static int posix_clock_release(struct inode *inode, struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk;
int err = 0;
+ if (!pccontext)
+ return -ENODEV;
+ clk = pccontext->clk;
+
if (clk->ops.release)
- err = clk->ops.release(clk);
+ err = clk->ops.release(pccontext);
put_device(clk->dev);
+ kfree(pccontext);
fp->private_data = NULL;
return err;
@@ -148,15 +149,12 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
static const struct file_operations posix_clock_file_operations = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
+ .compat_ioctl = posix_clock_ioctl,
.open = posix_clock_open,
.release = posix_clock_release,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = posix_clock_compat_ioctl,
-#endif
};
int posix_clock_register(struct posix_clock *clk, struct device *dev)
@@ -232,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
if (err)
return err;
- if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) {
err = -EACCES;
goto out;
}
@@ -290,6 +288,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
struct posix_clock_desc cd;
int err;
+ if (!timespec64_valid_strict(ts))
+ return -EINVAL;
+
err = get_clock_desc(id, &cd);
if (err)
return err;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb925e8ef9a8..0de2bb7cbec0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- u64 curr_cputime;
-retry:
- curr_cputime = atomic64_read(cputime);
- if (sum_cputime > curr_cputime) {
- if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
- goto retry;
- }
+ u64 curr_cputime = atomic64_read(cputime);
+
+ do {
+ if (sum_cputime <= curr_cputime)
+ return;
+ } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
@@ -494,19 +493,28 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
*/
WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
} else {
- if (timer->it.cpu.firing)
+ if (timer->it.cpu.firing) {
+ /*
+ * Prevent signal delivery. The timer cannot be dequeued
+ * because it is on the firing list which is not protected
+ * by sighand->lock. The delivery path is waiting for
+ * the timer lock. So go back, unlock and retry.
+ */
+ timer->it.cpu.firing = false;
ret = TIMER_RETRY;
- else
+ } else {
disarm_timer(timer, p);
-
+ }
unlock_task_sighand(p, &flags);
}
out:
rcu_read_unlock();
- if (!ret)
- put_pid(ctmr->pid);
+ if (!ret) {
+ put_pid(ctmr->pid);
+ timer->it_status = POSIX_TIMER_DISARMED;
+ }
return ret;
}
@@ -560,6 +568,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
struct cpu_timer *ctmr = &timer->it.cpu;
u64 newexp = cpu_timer_getexpires(ctmr);
+ timer->it_status = POSIX_TIMER_ARMED;
if (!cpu_timer_enqueue(&base->tqhead, ctmr))
return;
@@ -585,36 +594,25 @@ static void cpu_timer_fire(struct k_itimer *timer)
{
struct cpu_timer *ctmr = &timer->it.cpu;
- if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
- /*
- * User don't want any signal.
- */
- cpu_timer_setexpires(ctmr, 0);
- } else if (unlikely(timer->sigq == NULL)) {
+ timer->it_status = POSIX_TIMER_DISARMED;
+
+ if (unlikely(ctmr->nanosleep)) {
/*
* This a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
*/
wake_up_process(timer->it_process);
cpu_timer_setexpires(ctmr, 0);
- } else if (!timer->it_interval) {
- /*
- * One-shot timer. Clear it as soon as it's fired.
- */
- posix_timer_event(timer, 0);
- cpu_timer_setexpires(ctmr, 0);
- } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
- /*
- * The signal did not get queued because the signal
- * was ignored, so we won't get any callback to
- * reload the timer. But we need to keep it
- * ticking in case the signal is deliverable next time.
- */
- posix_cpu_timer_rearm(timer);
- ++timer->it_requeue_pending;
+ } else {
+ posix_timer_queue_signal(timer);
+ /* Disable oneshot timers */
+ if (!timer->it_interval)
+ cpu_timer_setexpires(ctmr, 0);
}
}
+static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now);
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -624,9 +622,10 @@ static void cpu_timer_fire(struct k_itimer *timer)
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
struct itimerspec64 *new, struct itimerspec64 *old)
{
+ bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- u64 old_expires, new_expires, old_incr, val;
struct cpu_timer *ctmr = &timer->it.cpu;
+ u64 old_expires, new_expires, now;
struct sighand_struct *sighand;
struct task_struct *p;
unsigned long flags;
@@ -663,168 +662,136 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
return -ESRCH;
}
- /*
- * Disarm any old timer after extracting its expiry time.
- */
- old_incr = timer->it_interval;
+ /* Retrieve the current expiry time before disarming the timer */
old_expires = cpu_timer_getexpires(ctmr);
if (unlikely(timer->it.cpu.firing)) {
- timer->it.cpu.firing = -1;
+ /*
+ * Prevent signal delivery. The timer cannot be dequeued
+ * because it is on the firing list which is not protected
+ * by sighand->lock. The delivery path is waiting for
+ * the timer lock. So go back, unlock and retry.
+ */
+ timer->it.cpu.firing = false;
ret = TIMER_RETRY;
} else {
cpu_timer_dequeue(ctmr);
+ timer->it_status = POSIX_TIMER_DISARMED;
}
/*
- * We need to sample the current value to convert the new
- * value from to relative and absolute, and to convert the
- * old value from absolute to relative. To set a process
- * timer, we need a sample to balance the thread expiry
- * times (in arm_timer). With an absolute time, we must
- * check if it's already passed. In short, we need a sample.
+ * Sample the current clock for saving the previous setting
+ * and for rearming the timer.
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock))
- val = cpu_clock_sample(clkid, p);
+ now = cpu_clock_sample(clkid, p);
else
- val = cpu_clock_sample_group(clkid, p, true);
+ now = cpu_clock_sample_group(clkid, p, !sigev_none);
+ /* Retrieve the previous expiry value if requested. */
if (old) {
- if (old_expires == 0) {
- old->it_value.tv_sec = 0;
- old->it_value.tv_nsec = 0;
- } else {
- /*
- * Update the timer in case it has overrun already.
- * If it has, we'll report it as having overrun and
- * with the next reloaded timer already ticking,
- * though we are swallowing that pending
- * notification here to install the new setting.
- */
- u64 exp = bump_cpu_timer(timer, val);
-
- if (val < exp) {
- old_expires = exp - val;
- old->it_value = ns_to_timespec64(old_expires);
- } else {
- old->it_value.tv_nsec = 1;
- old->it_value.tv_sec = 0;
- }
- }
+ old->it_value = (struct timespec64){ };
+ if (old_expires)
+ __posix_cpu_timer_get(timer, old, now);
}
+ /* Retry if the timer expiry is running concurrently */
if (unlikely(ret)) {
- /*
- * We are colliding with the timer actually firing.
- * Punt after filling in the timer's old value, and
- * disable this firing since we are already reporting
- * it as an overrun (thanks to bump_cpu_timer above).
- */
unlock_task_sighand(p, &flags);
goto out;
}
- if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
- new_expires += val;
- }
+ /* Convert relative expiry time to absolute */
+ if (new_expires && !(timer_flags & TIMER_ABSTIME))
+ new_expires += now;
+
+ /* Set the new expiry time (might be 0) */
+ cpu_timer_setexpires(ctmr, new_expires);
/*
- * Install the new expiry time (or zero).
- * For a timer with no notification action, we don't actually
- * arm the timer (we'll just fake it for timer_gettime).
+ * Arm the timer if it is not disabled, the new expiry value has
+ * not yet expired and the timer requires signal delivery.
+ * SIGEV_NONE timers are never armed. In case the timer is not
+ * armed, enforce the reevaluation of the timer base so that the
+ * process wide cputime counter can be disabled eventually.
*/
- cpu_timer_setexpires(ctmr, new_expires);
- if (new_expires != 0 && val < new_expires) {
- arm_timer(timer, p);
+ if (likely(!sigev_none)) {
+ if (new_expires && now < new_expires)
+ arm_timer(timer, p);
+ else
+ trigger_base_recalc_expires(timer, p);
}
unlock_task_sighand(p, &flags);
+
+ posix_timer_set_common(timer, new);
+
/*
- * Install the new reload setting, and
- * set up the signal and overrun bookkeeping.
+ * If the new expiry time was already in the past the timer was not
+ * queued. Fire it immediately even if the thread never runs to
+ * accumulate more time on this clock.
*/
- timer->it_interval = timespec64_to_ktime(new->it_interval);
+ if (!sigev_none && new_expires && now >= new_expires)
+ cpu_timer_fire(timer);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now)
+{
+ bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
+ u64 expires, iv = timer->it_interval;
/*
- * This acts as a modification timestamp for the timer,
- * so any automatic reload attempt will punt on seeing
- * that we have reset the timer manually.
+ * Make sure that interval timers are moved forward for the
+ * following cases:
+ * - SIGEV_NONE timers which are never armed
+ * - Timers which expired, but the signal has not yet been
+ * delivered
*/
- timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
- ~REQUEUE_PENDING;
- timer->it_overrun_last = 0;
- timer->it_overrun = -1;
-
- if (val >= new_expires) {
- if (new_expires != 0) {
- /*
- * The designated time already passed, so we notify
- * immediately, even if the thread never runs to
- * accumulate more time on this clock.
- */
- cpu_timer_fire(timer);
- }
+ if (iv && timer->it_status != POSIX_TIMER_ARMED)
+ expires = bump_cpu_timer(timer, now);
+ else
+ expires = cpu_timer_getexpires(&timer->it.cpu);
+ /*
+ * Expired interval timers cannot have a remaining time <= 0.
+ * The kernel has to move them forward so that the next
+ * timer expiry is > @now.
+ */
+ if (now < expires) {
+ itp->it_value = ns_to_timespec64(expires - now);
+ } else {
/*
- * Make sure we don't keep around the process wide cputime
- * counter or the tick dependency if they are not necessary.
+ * A single shot SIGEV_NONE timer must return 0, when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
- sighand = lock_task_sighand(p, &flags);
- if (!sighand)
- goto out;
-
- if (!cpu_timer_queued(ctmr))
- trigger_base_recalc_expires(timer, p);
-
- unlock_task_sighand(p, &flags);
+ if (!sigev_none)
+ itp->it_value.tv_nsec = 1;
}
- out:
- rcu_read_unlock();
- if (old)
- old->it_interval = ns_to_timespec64(old_incr);
-
- return ret;
}
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- struct cpu_timer *ctmr = &timer->it.cpu;
- u64 now, expires = cpu_timer_getexpires(ctmr);
struct task_struct *p;
+ u64 now;
rcu_read_lock();
p = cpu_timer_task_rcu(timer);
- if (!p)
- goto out;
-
- /*
- * Easy part: convert the reload time.
- */
- itp->it_interval = ktime_to_timespec64(timer->it_interval);
-
- if (!expires)
- goto out;
+ if (p && cpu_timer_getexpires(&timer->it.cpu)) {
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
- /*
- * Sample the clock to take the difference with the expiry time.
- */
- if (CPUCLOCK_PERTHREAD(timer->it_clock))
- now = cpu_clock_sample(clkid, p);
- else
- now = cpu_clock_sample_group(clkid, p, false);
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ now = cpu_clock_sample(clkid, p);
+ else
+ now = cpu_clock_sample_group(clkid, p, false);
- if (now < expires) {
- itp->it_value = ns_to_timespec64(expires - now);
- } else {
- /*
- * The timer should have expired already, but the firing
- * hasn't taken place yet. Say it's just about to expire.
- */
- itp->it_value.tv_nsec = 1;
- itp->it_value.tv_sec = 0;
+ __posix_cpu_timer_get(timer, itp, now);
}
-out:
rcu_read_unlock();
}
@@ -846,7 +813,9 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
if (++i == MAX_COLLECTED || now < expires)
return expires;
- ctmr->firing = 1;
+ ctmr->firing = true;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1162,7 +1131,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1178,6 +1189,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1256,6 +1268,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1344,7 +1368,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
* timer call will interfere.
*/
list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
- int cpu_firing;
+ bool cpu_firing;
/*
* spin_lock() is sufficient here even independent of the
@@ -1356,14 +1380,16 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
- timer->it.cpu.firing = 0;
+ timer->it.cpu.firing = false;
/*
- * The firing flag is -1 if we collided with a reset
- * of the timer, which already reported this
- * almost-firing as an overrun. So don't generate an event.
+ * If the firing flag is cleared then this raced with a
+ * timer rearm/delete operation. So don't generate an
+ * event.
*/
- if (likely(cpu_firing >= 0))
+ if (likely(cpu_firing))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1380,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
@@ -1457,6 +1492,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
timer.it_overrun = -1;
error = posix_cpu_timer_create(&timer);
timer.it_process = current;
+ timer.it.cpu.nanosleep = true;
if (!error) {
static struct itimerspec64 zero_it;
@@ -1498,23 +1534,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1528,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* Report back to the user the time still remaining.
*/
restart = &current->restart_block;
- restart->nanosleep.expires = expires;
+ restart->nanosleep.expires = ns_to_ktime(expires);
if (restart->nanosleep.type != TT_NONE)
error = nanosleep_copyout(restart, &it.it_value);
}
@@ -1570,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec64 t;
- t = ns_to_timespec64(restart_block->nanosleep.expires);
+ t = ktime_to_timespec64(restart_block->nanosleep.expires);
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
@@ -1624,6 +1653,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 90ea5f373e50..9b6fcb8d85e7 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,40 +17,6 @@
#include <linux/time_namespace.h>
#include <linux/compat.h>
-#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
-/* Architectures may override SYS_NI and COMPAT_SYS_NI */
-#include <asm/syscall_wrapper.h>
-#endif
-
-asmlinkage long sys_ni_posix_timers(void)
-{
- pr_err_once("process %d (%s) attempted a POSIX timer syscall "
- "while CONFIG_POSIX_TIMERS is not set\n",
- current->pid, current->comm);
- return -ENOSYS;
-}
-
-#ifndef SYS_NI
-#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
-#endif
-
-#ifndef COMPAT_SYS_NI
-#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
-#endif
-
-SYS_NI(timer_create);
-SYS_NI(timer_gettime);
-SYS_NI(timer_getoverrun);
-SYS_NI(timer_settime);
-SYS_NI(timer_delete);
-SYS_NI(clock_adjtime);
-SYS_NI(getitimer);
-SYS_NI(setitimer);
-SYS_NI(clock_adjtime32);
-#ifdef __ARCH_WANT_SYS_ALARM
-SYS_NI(alarm);
-#endif
-
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
* as it is easy to remain compatible with little code. CLOCK_BOOTTIME
@@ -147,6 +113,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -157,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
which_clock);
}
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_NI(timer_create);
-#endif
-
-#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
-#endif
-
#ifdef CONFIG_COMPAT_32BIT_TIME
-SYS_NI(timer_settime32);
-SYS_NI(timer_gettime32);
SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
struct old_timespec32 __user *, tp)
@@ -240,6 +196,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5dead89308b7..80a8a09a21a0 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -9,164 +9,188 @@
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
-#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-#include <linux/sched/task.h>
-
-#include <linux/uaccess.h>
#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
+#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
-#include <linux/wait.h>
-#include <linux/workqueue.h>
-#include <linux/export.h>
-#include <linux/hashtable.h>
-#include <linux/compat.h>
-#include <linux/nospec.h>
+#include <linux/time.h>
#include <linux/time_namespace.h>
+#include <linux/uaccess.h>
#include "timekeeping.h"
#include "posix-timers.h"
/*
- * Management arrays for POSIX timers. Timers are now kept in static hash table
- * with 512 entries.
- * Timer ids are allocated by local routine, which selects proper hash head by
- * key, constructed from current->signal address and per signal struct counter.
- * This keeps timer ids unique per process, but now they can intersect between
- * processes.
+ * Timers are managed in a hash table for lockless lookup. The hash key is
+ * constructed from current::signal and the timer ID and the timer is
+ * matched against current::signal and the timer ID when walking the hash
+ * bucket list.
+ *
+ * This allows checkpoint/restore to reconstruct the exact timer IDs for
+ * a process.
*/
+struct timer_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head head;
+};
-/*
- * Lets keep our timers in a slab cache :-)
- */
-static struct kmem_cache *posix_timers_cache;
+static struct {
+ struct timer_hash_bucket *buckets;
+ unsigned long mask;
+ struct kmem_cache *cache;
+} __timer_data __ro_after_init __aligned(4*sizeof(long));
-static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
-static DEFINE_SPINLOCK(hash_lock);
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
+#define posix_timers_cache (__timer_data.cache)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
-/*
- * we assume that the new SIGEV_THREAD_ID shares no bits with the other
- * SIGEV values. Here we put out an error if this assumption fails.
- */
+#define TIMER_ANY_ID INT_MIN
+
+/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
- ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-/*
- * The timer ID is turned into a timer address by idr_find().
- * Verifying a valid ID consists of:
- *
- * a) checking that idr_find() returns other than -1.
- * b) checking that the timer id matches the one in the timer itself.
- * c) that the timer owner is in the callers thread group.
- */
-
-/*
- * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
- * to implement others. This structure defines the various
- * clocks.
- *
- * RESOLUTION: Clock resolution is used to round up timer and interval
- * times, NOT to report clock times, which are reported with as
- * much resolution as the system can muster. In some cases this
- * resolution may depend on the underlying clock hardware and
- * may not be quantifiable until run time, and only then is the
- * necessary code is written. The standard says we should say
- * something about this issue in the documentation...
- *
- * FUNCTIONS: The CLOCKs structure defines possible functions to
- * handle various clock functions.
- *
- * The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for
- * the timer. 2.) The list, it_lock, it_clock, it_id and
- * it_pid fields are not modified by timer code.
- *
- * Permissions: It is assumed that the clock_settime() function defined
- * for each clock will take care of permission checks. Some
- * clocks may be set able by any user (i.e. local process
- * clocks) others not. Currently the only set able clock we
- * have is CLOCK_REALTIME and its high res counter part, both of
- * which we beg off on and pass to do_sys_settimeofday().
- */
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id);
-#define lock_timer(tid, flags) \
-({ struct k_itimer *__timr; \
- __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
- __timr; \
+#define lock_timer(tid) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \
+ __timr; \
})
-static int hash(struct signal_struct *sig, unsigned int nr)
+static inline void unlock_timer(struct k_itimer *timr)
{
- return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+ if (likely((timr)))
+ spin_unlock_irq(&timr->it_lock);
}
-static struct k_itimer *__posix_timers_find(struct hlist_head *head,
- struct signal_struct *sig,
- timer_t id)
+#define scoped_timer_get_or_fail(_id) \
+ scoped_cond_guard(lock_timer, return -EINVAL, _id)
+
+#define scoped_timer (scope)
+
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id);
+DEFINE_CLASS_IS_COND_GUARD(lock_timer);
+
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash,
- lockdep_is_held(&hash_lock)) {
- if ((timer->it_signal == sig) && (timer->it_id == id))
+ hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
+ /* timer->it_signal can be set concurrently */
+ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
}
return NULL;
}
-static struct k_itimer *posix_timer_by_id(timer_t id)
+static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ unsigned long val = (unsigned long)timer->it_signal;
+
+ /*
+ * Mask out bit 0, which acts as invalid marker to prevent
+ * posix_timer_by_id() detecting it as valid.
+ */
+ return (struct signal_struct *)(val & ~1UL);
+}
+
+static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig,
+ timer_t id)
+{
+ struct hlist_head *head = &bucket->head;
+ struct k_itimer *timer;
- return __posix_timers_find(head, sig, id);
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) {
+ if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
+ return true;
+ }
+ return false;
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
{
- struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
- struct hlist_head *head;
- int ret = -ENOENT;
-
- do {
- spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
- hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
+
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
- spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ return false;
}
-static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
- spin_unlock_irqrestore(&timr->it_lock, flags);
+ struct signal_struct *sig = current->signal;
+
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
+ for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
+ /* Get the next timer ID and clamp it to positive space */
+ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
+
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
+ cond_resched();
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
-/* Get clock_realtime */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
@@ -178,7 +202,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
return ktime_get_real();
}
-/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
{
@@ -191,9 +214,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
return do_adjtimex(t);
}
-/*
- * Get monotonic time for posix timers
- */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
@@ -206,9 +226,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
return ktime_get();
}
-/*
- * Get monotonic-raw time for posix timers
- */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
@@ -216,7 +233,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_coarse_real_ts64(tp);
@@ -268,164 +284,115 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
}
/*
- * Initialize everything, well, just everything in Posix clocks/timers ;)
- */
-static __init int init_posix_timers(void)
-{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
- return 0;
-}
-__initcall(init_posix_timers);
-
-/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
*/
-static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
+static inline int timer_overrun_to_int(struct k_itimer *timr)
{
- s64 sum = timr->it_overrun_last + (s64)baseval;
+ if (timr->it_overrun_last > (s64)INT_MAX)
+ return INT_MAX;
- return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
+ return (int)timr->it_overrun_last;
}
static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
- timr->it_interval);
+ timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
hrtimer_restart(timer);
}
+static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
+{
+ guard(spinlock)(&timr->it_lock);
+
+ /*
+ * Check if the timer is still alive or whether it got modified
+ * since the signal was queued. In either case, don't rearm and
+ * drop the signal.
+ */
+ if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr)))
+ return false;
+
+ if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
+ return true;
+
+ timr->kclock->timer_rearm(timr);
+ timr->it_status = POSIX_TIMER_ARMED;
+ timr->it_overrun_last = timr->it_overrun;
+ timr->it_overrun = -1LL;
+ ++timr->it_signal_seq;
+ info->si_overrun = timer_overrun_to_int(timr);
+ return true;
+}
+
/*
- * This function is exported for use by the signal deliver code. It is
- * called just prior to the info block being released and passes that
- * block to us. It's function is to update the overrun entry AND to
- * restart the timer. It should only be called if the timer is to be
- * restarted (i.e. we have flagged this in the sys_private entry of the
- * info block).
- *
- * To protect against the timer going away while the interrupt is queued,
- * we require that the it_requeue_pending flag be set.
+ * This function is called from the signal delivery code. It decides
+ * whether the signal should be dropped and rearms interval timers. The
+ * timer can be unconditionally accessed as there is a reference held on
+ * it.
*/
-void posixtimer_rearm(struct kernel_siginfo *info)
+bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq)
{
- struct k_itimer *timr;
- unsigned long flags;
+ struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq);
+ bool ret;
- timr = lock_timer(info->si_tid, &flags);
- if (!timr)
- return;
-
- if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
- timr->kclock->timer_rearm(timr);
+ /*
+ * Release siglock to ensure proper locking order versus
+ * timr::it_lock. Keep interrupts disabled.
+ */
+ spin_unlock(&current->sighand->siglock);
- timr->it_active = 1;
- timr->it_overrun_last = timr->it_overrun;
- timr->it_overrun = -1LL;
- ++timr->it_requeue_pending;
+ ret = __posixtimer_deliver_signal(info, timr);
- info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
- }
+ /* Drop the reference which was acquired when the signal was queued */
+ posixtimer_putref(timr);
- unlock_timer(timr, flags);
+ spin_lock(&current->sighand->siglock);
+ return ret;
}
-int posix_timer_event(struct k_itimer *timr, int si_private)
+void posix_timer_queue_signal(struct k_itimer *timr)
{
- enum pid_type type;
- int ret;
- /*
- * FIXME: if ->sigq is queued we can race with
- * dequeue_signal()->posixtimer_rearm().
- *
- * If dequeue_signal() sees the "right" value of
- * si_sys_private it calls posixtimer_rearm().
- * We re-queue ->sigq and drop ->it_lock().
- * posixtimer_rearm() locks the timer
- * and re-schedules it while ->sigq is pending.
- * Not really bad, but not that we want.
- */
- timr->sigq->info.si_sys_private = si_private;
+ lockdep_assert_held(&timr->it_lock);
+
+ if (!posixtimer_valid(timr))
+ return;
- type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
- ret = send_sigqueue(timr->sigq, timr->it_pid, type);
- /* If we failed to send the signal the timer stops. */
- return ret > 0;
+ timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
+ posixtimer_send_sigqueue(timr);
}
/*
- * This function gets called when a POSIX.1b interval timer expires. It
- * is used as a callback from the kernel internal timer. The
- * run_timer_list code ALWAYS calls with interrupts on.
-
- * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ * This function gets called when a POSIX.1b interval timer expires from
+ * the HRTIMER interrupt (soft interrupt on RT kernels).
+ *
+ * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
+ * based timers.
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
- struct k_itimer *timr;
- unsigned long flags;
- int si_private = 0;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
-
- timr = container_of(timer, struct k_itimer, it.real.timer);
- spin_lock_irqsave(&timr->it_lock, flags);
+ struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer);
- timr->it_active = 0;
- if (timr->it_interval != 0)
- si_private = ++timr->it_requeue_pending;
-
- if (posix_timer_event(timr, si_private)) {
- /*
- * signal was not sent because of sig_ignor
- * we will not get a call back to restart it AND
- * it should be restarted.
- */
- if (timr->it_interval != 0) {
- ktime_t now = hrtimer_cb_get_time(timer);
-
- /*
- * FIXME: What we really want, is to stop this
- * timer completely and restart it in case the
- * SIG_IGN is removed. This is a non trivial
- * change which involves sighand locking
- * (sigh !), which we don't want to do late in
- * the release cycle.
- *
- * For now we just let timers with an interval
- * less than a jiffie expire every jiffie to
- * avoid softirq starvation in case of SIG_IGN
- * and a very small interval, which would put
- * the timer right back on the softirq pending
- * list. By moving now ahead of time we trick
- * hrtimer_forward() to expire the timer
- * later, while we still maintain the overrun
- * accuracy, but have some inconsistency in
- * the timer_gettime() case. This is at least
- * better than a starved softirq. A more
- * complex fix which solves also another related
- * inconsistency is already in the pipeline.
- */
-#ifdef CONFIG_HIGH_RES_TIMERS
- {
- ktime_t kj = NSEC_PER_SEC / HZ;
+ guard(spinlock_irqsave)(&timr->it_lock);
+ posix_timer_queue_signal(timr);
+ return HRTIMER_NORESTART;
+}
- if (timr->it_interval < kj)
- now = ktime_add(now, kj);
- }
-#endif
- timr->it_overrun += hrtimer_forward(timer, now,
- timr->it_interval);
- ret = HRTIMER_RESTART;
- ++timr->it_requeue_pending;
- timr->it_active = 1;
- }
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_TIMER_CREATE_RESTORE_IDS_OFF:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_ON:
+ current->signal->timer_create_restore_ids = 1;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_GET:
+ return current->signal->timer_create_restore_ids;
}
-
- unlock_timer(timr, flags);
- return ret;
+ return -EINVAL;
}
static struct pid *good_sigevent(sigevent_t * event)
@@ -452,45 +419,45 @@ static struct pid *good_sigevent(sigevent_t * event)
}
}
-static struct k_itimer * alloc_posix_timer(void)
+static struct k_itimer *alloc_posix_timer(void)
{
struct k_itimer *tmr;
+
+ if (unlikely(!posix_timers_cache))
+ return NULL;
+
tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
if (!tmr)
return tmr;
- if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+
+ if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) {
kmem_cache_free(posix_timers_cache, tmr);
return NULL;
}
- clear_siginfo(&tmr->sigq->info);
+ rcuref_init(&tmr->rcuref, 1);
return tmr;
}
-static void k_itimer_rcu_free(struct rcu_head *head)
+void posixtimer_free_timer(struct k_itimer *tmr)
{
- struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
-
- kmem_cache_free(posix_timers_cache, tmr);
+ put_pid(tmr->it_pid);
+ if (tmr->sigq.ucounts)
+ dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING);
+ kfree_rcu(tmr, rcu);
}
-#define IT_ID_SET 1
-#define IT_ID_NOT_SET 0
-static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- if (it_id_set) {
- unsigned long flags;
- spin_lock_irqsave(&hash_lock, flags);
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
+
+ scoped_guard (spinlock, &bucket->lock)
hlist_del_rcu(&tmr->t_hash);
- spin_unlock_irqrestore(&hash_lock, flags);
- }
- put_pid(tmr->it_pid);
- sigqueue_free(tmr->sigq);
- call_rcu(&tmr->rcu, k_itimer_rcu_free);
+ posixtimer_putref(tmr);
}
static int common_timer_create(struct k_itimer *new_timer)
{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0);
return 0;
}
@@ -499,78 +466,107 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
- int it_id_set = IT_ID_NOT_SET;
if (!kc)
return -EINVAL;
if (!kc->timer_create)
return -EOPNOTSUPP;
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
new_timer = alloc_posix_timer();
if (unlikely(!new_timer))
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
- new_timer_id = posix_timer_add(new_timer);
+
+ /*
+ * Add the timer to the hash table. The timer is not yet valid
+ * after insertion, but has a unique ID allocated.
+ */
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
- error = new_timer_id;
- goto out;
+ posixtimer_free_timer(new_timer);
+ return new_timer_id;
}
- it_id_set = IT_ID_SET;
- new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
if (event) {
- rcu_read_lock();
- new_timer->it_pid = get_pid(good_sigevent(event));
- rcu_read_unlock();
+ scoped_guard (rcu)
+ new_timer->it_pid = get_pid(good_sigevent(event));
if (!new_timer->it_pid) {
error = -EINVAL;
goto out;
}
new_timer->it_sigev_notify = event->sigev_notify;
- new_timer->sigq->info.si_signo = event->sigev_signo;
- new_timer->sigq->info.si_value = event->sigev_value;
+ new_timer->sigq.info.si_signo = event->sigev_signo;
+ new_timer->sigq.info.si_value = event->sigev_value;
} else {
new_timer->it_sigev_notify = SIGEV_SIGNAL;
- new_timer->sigq->info.si_signo = SIGALRM;
- memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
- new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
+ new_timer->sigq.info.si_signo = SIGALRM;
+ new_timer->sigq.info.si_value.sival_int = new_timer->it_id;
new_timer->it_pid = get_pid(task_tgid(current));
}
- new_timer->sigq->info.si_tid = new_timer->it_id;
- new_timer->sigq->info.si_code = SI_TIMER;
+ if (new_timer->it_sigev_notify & SIGEV_THREAD_ID)
+ new_timer->it_pid_type = PIDTYPE_PID;
+ else
+ new_timer->it_pid_type = PIDTYPE_TGID;
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
+ new_timer->sigq.info.si_tid = new_timer->it_id;
+ new_timer->sigq.info.si_code = SI_TIMER;
+
+ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
-
+ /*
+ * After successful copy out, the timer ID is visible to user space
+ * now but not yet valid because new_timer::signal low order bit is 1.
+ *
+ * Complete the initialization with the clock specific create
+ * callback.
+ */
error = kc->timer_create(new_timer);
if (error)
goto out;
- spin_lock_irq(&current->sighand->siglock);
- new_timer->it_signal = current->signal;
- list_add(&new_timer->list, &current->signal->posix_timers);
- spin_unlock_irq(&current->sighand->siglock);
-
- return 0;
/*
- * In the case of the timer belonging to another task, after
- * the task is unlocked, the timer is owned by the other task
- * and may cease to exist at any time. Don't use or modify
- * new_timer after the unlock call.
+ * timer::it_lock ensures that __lock_timer() observes a fully
+ * initialized timer when it observes a valid timer::it_signal.
+ *
+ * sighand::siglock is required to protect signal::posix_timers.
+ */
+ scoped_guard (spinlock_irq, &new_timer->it_lock) {
+ guard(spinlock)(&current->sighand->siglock);
+ /*
+ * new_timer::it_signal contains the signal pointer with
+ * bit 0 set, which makes it invalid for syscall operations.
+ * Store the unmodified signal pointer to make it valid.
+ */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
+ hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers);
+ }
+ /*
+ * After unlocking @new_timer is subject to concurrent removal and
+ * cannot be touched anymore
*/
+ return 0;
out:
- release_posix_timer(new_timer, it_id_set);
+ posix_timer_unhash_and_free(new_timer);
return error;
}
@@ -604,14 +600,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-/*
- * Locking issues: We need to protect the result of the id look up until
- * we get the timer locked down so it is not deleted under us. The
- * removal is done under the idr spinlock so we use that here to bridge
- * the find to the timer lock. To avoid a dead lock, the timer id MUST
- * be release with out holding the timer lock.
- */
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id)
{
struct k_itimer *timr;
@@ -622,18 +611,46 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
if ((unsigned long long)timer_id > INT_MAX)
return NULL;
- rcu_read_lock();
+ /*
+ * The hash lookup and the timers are RCU protected.
+ *
+ * Timers are added to the hash in invalid state where
+ * timr::it_signal is marked invalid. timer::it_signal is only set
+ * after the rest of the initialization succeeded.
+ *
+ * Timer destruction happens in steps:
+ * 1) Set timr::it_signal marked invalid with timr::it_lock held
+ * 2) Release timr::it_lock
+ * 3) Remove from the hash under hash_lock
+ * 4) Put the reference count.
+ *
+ * The reference count might not drop to zero if timr::sigq is
+ * queued. In that case the signal delivery or flush will put the
+ * last reference count.
+ *
+ * When the reference count reaches zero, the timer is scheduled
+ * for RCU removal after the grace period.
+ *
+ * Holding rcu_read_lock() across the lookup ensures that
+ * the timer cannot be freed.
+ *
+ * The lookup validates locklessly that timr::it_signal ==
+ * current::it_signal and timr::it_id == @timer_id. timr::it_id
+ * can't change, but timr::it_signal can become invalid during
+ * destruction, which makes the locked check fail.
+ */
+ guard(rcu)();
timr = posix_timer_by_id(timer_id);
if (timr) {
- spin_lock_irqsave(&timr->it_lock, *flags);
- if (timr->it_signal == current->signal) {
- rcu_read_unlock();
+ spin_lock_irq(&timr->it_lock);
+ /*
+ * Validate under timr::it_lock that timr::it_signal is
+ * still valid. Pairs with #1 above.
+ */
+ if (timr->it_signal == current->signal)
return timr;
- }
- spin_unlock_irqrestore(&timr->it_lock, *flags);
+ spin_unlock_irq(&timr->it_lock);
}
- rcu_read_unlock();
-
return NULL;
}
@@ -652,20 +669,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
}
/*
- * Get the time remaining on a POSIX.1b interval timer. This function
- * is ALWAYS called with spin_lock_irq on the timer, thus it must not
- * mess with irq.
+ * Get the time remaining on a POSIX.1b interval timer.
*
- * We have a couple of messes to clean up here. First there is the case
- * of a timer that has a requeue pending. These timers should appear to
- * be in the timer list with an expiry as if we were to requeue them
- * now.
+ * Two issues to handle here:
*
- * The second issue is the SIGEV_NONE timer which may be active but is
- * not really ever put in the timer list (to save system resources).
- * This timer may be expired, and if so, we will do it here. Otherwise
- * it is the same as a requeue pending timer WRT to what we should
- * report.
+ * 1) The timer has a requeue pending. The return value must appear as
+ * if the timer has been requeued right now.
+ *
+ * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued
+ * into the hrtimer queue and therefore never expired. Emulate expiry
+ * here taking #1 into account.
*/
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
@@ -679,10 +692,14 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
/* interval timer ? */
if (iv) {
cur_setting->it_interval = ktime_to_timespec64(iv);
- } else if (!timr->it_active) {
+ } else if (timr->it_status == POSIX_TIMER_DISARMED) {
/*
- * SIGEV_NONE oneshot timers are never queued. Check them
- * below.
+ * SIGEV_NONE oneshot timers are never queued and therefore
+ * timr->it_status is always DISARMED. The check below
+ * vs. remaining time will handle this case.
+ *
+ * For all other timers there is nothing to update here, so
+ * return.
*/
if (!sig_none)
return;
@@ -691,18 +708,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
now = kc->clock_get_ktime(timr->it_clock);
/*
- * When a requeue is pending or this is a SIGEV_NONE timer move the
- * expiry time forward by intervals, so expiry is > now.
+ * If this is an interval timer and either has requeue pending or
+ * is a SIGEV_NONE timer move the expiry time forward by intervals,
+ * so expiry is > now.
*/
- if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
+ if (iv && timr->it_status != POSIX_TIMER_ARMED)
timr->it_overrun += kc->timer_forward(timr, now);
remaining = kc->timer_remaining(timr, now);
- /* Return 0 only, when the timer is expired and not pending */
+ /*
+ * As @now is retrieved before a possible timer_forward() and
+ * cannot be reevaluated by the compiler @remaining is based on the
+ * same @now value. Therefore @remaining is consistent vs. @now.
+ *
+ * Consequently all interval timers, i.e. @iv > 0, cannot have a
+ * remaining time <= 0 because timer_forward() guarantees to move
+ * them forward so that the next timer expiry is > @now.
+ */
if (remaining <= 0) {
/*
- * A single shot SIGEV_NONE timer must return 0, when
- * it is expired !
+ * A single shot SIGEV_NONE timer must return 0, when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
if (!sig_none)
cur_setting->it_value.tv_nsec = 1;
@@ -711,27 +739,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
}
}
-/* Get the time remaining on a POSIX.1b interval timer. */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- struct k_itimer *timr;
- const struct k_clock *kc;
- unsigned long flags;
- int ret = 0;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
memset(setting, 0, sizeof(*setting));
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_get))
- ret = -EINVAL;
- else
- kc->timer_get(timr, setting);
-
- unlock_timer(timr, flags);
- return ret;
+ scoped_timer_get_or_fail(timer_id)
+ scoped_timer->kclock->timer_get(scoped_timer, setting);
+ return 0;
}
/* Get the time remaining on a POSIX.1b interval timer. */
@@ -765,29 +778,28 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
#endif
-/*
- * Get the number of overruns of a POSIX.1b interval timer. This is to
- * be the overrun of the timer last delivered. At the same time we are
- * accumulating overruns on the next timer. The overrun is frozen when
- * the signal is delivered, either at the notify time (if the info block
- * is not queued) or at the actual delivery time (as we are informed by
- * the call back to posixtimer_rearm(). So all we need to do is
- * to pick up the frozen overrun.
+/**
+ * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
+ * @timer_id: The timer ID which identifies the timer
+ *
+ * The "overrun count" of a timer is one plus the number of expiration
+ * intervals which have elapsed between the first expiry, which queues the
+ * signal and the actual signal delivery. On signal delivery the "overrun
+ * count" is calculated and cached, so it can be returned directly here.
+ *
+ * As this is relative to the last queued signal the returned overrun count
+ * is meaningless outside of the signal delivery path and even there it
+ * does not accurately reflect the current state when user space evaluates
+ * it.
+ *
+ * Returns:
+ * -EINVAL @timer_id is invalid
+ * 1..INT_MAX The number of overruns related to the last delivered signal
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
- struct k_itimer *timr;
- int overrun;
- unsigned long flags;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
- overrun = timer_overrun_to_int(timr, 0);
- unlock_timer(timr, flags);
-
- return overrun;
+ scoped_timer_get_or_fail(timer_id)
+ return timer_overrun_to_int(scoped_timer);
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
@@ -800,7 +812,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
/*
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
- * hood. See hrtimer_init(). Update timr->kclock, so the generic
+ * hood. See hrtimer_setup(). Update timr->kclock, so the generic
* functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
@@ -809,11 +821,10 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
if (timr->it_clock == CLOCK_REALTIME)
timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
- hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
- timr->it.real.timer.function = posix_timer_fn;
+ hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
- expires = ktime_add_safe(expires, timer->base->get_time());
+ expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
if (!sigev_none)
@@ -831,27 +842,41 @@ static void common_timer_wait_running(struct k_itimer *timer)
}
/*
- * On PREEMPT_RT this prevent priority inversion against softirq kthread in
- * case it gets preempted while executing a timer callback. See comments in
- * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
- * cpu_relax().
+ * On PREEMPT_RT this prevents priority inversion and a potential livelock
+ * against the ksoftirqd thread in case that ksoftirqd gets preempted while
+ * executing a hrtimer callback.
+ *
+ * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
+ * just results in a cpu_relax().
+ *
+ * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
+ * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
+ * prevents spinning on an eventually scheduled out task and a livelock
+ * when the task which tries to delete or disarm the timer has preempted
+ * the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer,
- unsigned long *flags)
+static void timer_wait_running(struct k_itimer *timer)
{
- const struct k_clock *kc = READ_ONCE(timer->kclock);
- timer_t timer_id = READ_ONCE(timer->it_id);
-
- /* Prevent kfree(timer) after dropping the lock */
- rcu_read_lock();
- unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
+ timer->kclock->timer_wait_running(timer);
+}
- if (!WARN_ON_ONCE(!kc->timer_wait_running))
- kc->timer_wait_running(timer);
+/*
+ * Set up the new interval and reset the signal delivery data
+ */
+void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting)
+{
+ if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec)
+ timer->it_interval = timespec64_to_ktime(new_setting->it_interval);
+ else
+ timer->it_interval = 0;
- rcu_read_unlock();
- /* Relock the timer. It might be not longer hashed. */
- return lock_timer(timer_id, flags);
+ /* Reset overrun accounting */
+ timer->it_overrun_last = 0;
+ timer->it_overrun = -1LL;
}
/* Set a POSIX.1b interval timer. */
@@ -866,8 +891,6 @@ int common_timer_set(struct k_itimer *timr, int flags,
if (old_setting)
common_timer_get(timr, old_setting);
- /* Prevent rearming by clearing the interval */
- timr->it_interval = 0;
/*
* Careful here. On SMP systems the timer expiry function could be
* active and spinning on timr->it_lock.
@@ -875,35 +898,27 @@ int common_timer_set(struct k_itimer *timr, int flags,
if (kc->timer_try_to_cancel(timr) < 0)
return TIMER_RETRY;
- timr->it_active = 0;
- timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
- ~REQUEUE_PENDING;
- timr->it_overrun_last = 0;
+ timr->it_status = POSIX_TIMER_DISARMED;
+ posix_timer_set_common(timr, new_setting);
- /* Switch off the timer when it_value is zero */
+ /* Keep timer disarmed when it_value is zero */
if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
return 0;
- timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
if (flags & TIMER_ABSTIME)
expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
- timr->it_active = !sigev_none;
+ if (!sigev_none)
+ timr->it_status = POSIX_TIMER_ARMED;
return 0;
}
-static int do_timer_settime(timer_t timer_id, int tmr_flags,
- struct itimerspec64 *new_spec64,
+static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int error = 0;
-
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
return -EINVAL;
@@ -911,27 +926,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags,
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
- timr = lock_timer(timer_id, &flags);
-retry:
- if (!timr)
- return -EINVAL;
+ for (; ; old_spec64 = NULL) {
+ struct k_itimer *timr;
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_set))
- error = -EINVAL;
- else
- error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
-
- if (error == TIMER_RETRY) {
- // We already got the old time...
- old_spec64 = NULL;
- /* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags);
- goto retry;
- }
- unlock_timer(timr, flags);
+ scoped_timer_get_or_fail(timer_id) {
+ timr = scoped_timer;
- return error;
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+
+ /* Prevent signal delivery and rearming. */
+ timr->it_signal_seq++;
+
+ int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64);
+ if (ret != TIMER_RETRY)
+ return ret;
+
+ /* Protect the timer from being freed when leaving the lock scope */
+ rcu_read_lock();
+ }
+ timer_wait_running(timr);
+ rcu_read_unlock();
+ }
}
/* Set a POSIX.1b interval timer */
@@ -939,8 +955,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct __kernel_itimerspec __user *, new_setting,
struct __kernel_itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec, old_spec;
- struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
+ struct itimerspec64 new_spec, old_spec, *rtn;
int error = 0;
if (!new_setting)
@@ -949,6 +964,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
+ rtn = old_setting ? &old_spec : NULL;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
if (put_itimerspec64(&old_spec, old_setting))
@@ -984,92 +1000,115 @@ int common_timer_del(struct k_itimer *timer)
{
const struct k_clock *kc = timer->kclock;
- timer->it_interval = 0;
if (kc->timer_try_to_cancel(timer) < 0)
return TIMER_RETRY;
- timer->it_active = 0;
+ timer->it_status = POSIX_TIMER_DISARMED;
return 0;
}
-static inline int timer_delete_hook(struct k_itimer *timer)
+/*
+ * If the deleted timer is on the ignored list, remove it and
+ * drop the associated reference.
+ */
+static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr)
{
- const struct k_clock *kc = timer->kclock;
-
- if (WARN_ON_ONCE(!kc || !kc->timer_del))
- return -EINVAL;
- return kc->timer_del(timer);
+ if (!hlist_unhashed(&tmr->ignored_list)) {
+ hlist_del_init(&tmr->ignored_list);
+ posixtimer_putref(tmr);
+ }
}
-/* Delete a POSIX.1b interval timer. */
-SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
+static void posix_timer_delete(struct k_itimer *timer)
{
- struct k_itimer *timer;
- unsigned long flags;
-
- timer = lock_timer(timer_id, &flags);
+ /*
+ * Invalidate the timer, remove it from the linked list and remove
+ * it from the ignored list if pending.
+ *
+ * The invalidation must be written with siglock held so that the
+ * signal code observes the invalidated timer::it_signal in
+ * do_sigaction(), which prevents it from moving a pending signal
+ * of a deleted timer to the ignore list.
+ *
+ * The invalidation also prevents signal queueing, signal delivery
+ * and therefore rearming from the signal delivery path.
+ *
+ * A concurrent lookup can still find the timer in the hash, but it
+ * will check timer::it_signal with timer::it_lock held and observe
+ * bit 0 set, which invalidates it. That also prevents the timer ID
+ * from being handed out before this timer is completely gone.
+ */
+ timer->it_signal_seq++;
-retry_delete:
- if (!timer)
- return -EINVAL;
+ scoped_guard (spinlock, &current->sighand->siglock) {
+ unsigned long sig = (unsigned long)timer->it_signal | 1UL;
- if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
- /* Unlocks and relocks the timer if it still exists */
- timer = timer_wait_running(timer, &flags);
- goto retry_delete;
+ WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
+ hlist_del_rcu(&timer->list);
+ posix_timer_cleanup_ignored(timer);
}
- spin_lock(&current->sighand->siglock);
- list_del(&timer->list);
- spin_unlock(&current->sighand->siglock);
- /*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
- */
- timer->it_signal = NULL;
-
- unlock_timer(timer, flags);
- release_posix_timer(timer, IT_ID_SET);
- return 0;
+ while (timer->kclock->timer_del(timer) == TIMER_RETRY) {
+ guard(rcu)();
+ spin_unlock_irq(&timer->it_lock);
+ timer_wait_running(timer);
+ spin_lock_irq(&timer->it_lock);
+ }
}
-/*
- * return timer owned by the process, used by exit_itimers
- */
-static void itimer_delete(struct k_itimer *timer)
+/* Delete a POSIX.1b interval timer. */
+SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
-retry_delete:
- spin_lock_irq(&timer->it_lock);
+ struct k_itimer *timer;
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- spin_unlock_irq(&timer->it_lock);
- goto retry_delete;
+ scoped_timer_get_or_fail(timer_id) {
+ timer = scoped_timer;
+ posix_timer_delete(timer);
}
- list_del(&timer->list);
-
- spin_unlock_irq(&timer->it_lock);
- release_posix_timer(timer, IT_ID_SET);
+ /* Remove it from the hash, which frees up the timer ID */
+ posix_timer_unhash_and_free(timer);
+ return 0;
}
/*
- * This is called by do_exit or de_thread, only when nobody else can
- * modify the signal->posix_timers list. Yet we need sighand->siglock
- * to prevent the race with /proc/pid/timers.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
*/
void exit_itimers(struct task_struct *tsk)
{
- struct list_head timers;
- struct k_itimer *tmr;
+ struct hlist_head timers;
+ struct hlist_node *next;
+ struct k_itimer *timer;
+
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
- if (list_empty(&tsk->signal->posix_timers))
+ if (hlist_empty(&tsk->signal->posix_timers))
return;
- spin_lock_irq(&tsk->sighand->siglock);
- list_replace_init(&tsk->signal->posix_timers, &timers);
- spin_unlock_irq(&tsk->sighand->siglock);
+ /* Protect against concurrent read via /proc/$PID/timers */
+ scoped_guard (spinlock_irq, &tsk->sighand->siglock)
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
+
+ /* The timers are not longer accessible via tsk::signal */
+ hlist_for_each_entry_safe(timer, next, &timers, list) {
+ scoped_guard (spinlock_irq, &timer->it_lock)
+ posix_timer_delete(timer);
+ posix_timer_unhash_and_free(timer);
+ cond_resched();
+ }
+
+ /*
+ * There should be no timers on the ignored list. itimer_delete() has
+ * mopped them up.
+ */
+ if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
+ return;
- while (!list_empty(&timers)) {
- tmr = list_first_entry(&timers, struct k_itimer, list);
- itimer_delete(tmr);
+ hlist_move_list(&tsk->signal->ignored_posix_timers, &timers);
+ while (!hlist_empty(&timers)) {
+ posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer,
+ ignored_list));
}
}
@@ -1085,6 +1124,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (get_timespec64(&new_tp, tp))
return -EFAULT;
+ /*
+ * Permission checks have to be done inside the clock specific
+ * setter callback.
+ */
return kc->clock_set(which_clock, &new_tp);
}
@@ -1135,6 +1178,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
return err;
}
+/**
+ * sys_clock_getres - Get the resolution of a clock
+ * @which_clock: The clock to get the resolution for
+ * @tp: Pointer to a a user space timespec64 for storage
+ *
+ * POSIX defines:
+ *
+ * "The clock_getres() function shall return the resolution of any
+ * clock. Clock resolutions are implementation-defined and cannot be set by
+ * a process. If the argument res is not NULL, the resolution of the
+ * specified clock shall be stored in the location pointed to by res. If
+ * res is NULL, the clock resolution is not returned. If the time argument
+ * of clock_settime() is not a multiple of res, then the value is truncated
+ * to a multiple of res."
+ *
+ * Due to the various hardware constraints the real resolution can vary
+ * wildly and even change during runtime when the underlying devices are
+ * replaced. The kernel also can use hardware devices with different
+ * resolutions for reading the time and for arming timers.
+ *
+ * The kernel therefore deviates from the POSIX spec in various aspects:
+ *
+ * 1) The resolution returned to user space
+ *
+ * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
+ * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW
+ * the kernel differentiates only two cases:
+ *
+ * I) Low resolution mode:
+ *
+ * When high resolution timers are disabled at compile or runtime
+ * the resolution returned is nanoseconds per tick, which represents
+ * the precision at which timers expire.
+ *
+ * II) High resolution mode:
+ *
+ * When high resolution timers are enabled the resolution returned
+ * is always one nanosecond independent of the actual resolution of
+ * the underlying hardware devices.
+ *
+ * For CLOCK_*_ALARM the actual resolution depends on system
+ * state. When system is running the resolution is the same as the
+ * resolution of the other clocks. During suspend the actual
+ * resolution is the resolution of the underlying RTC device which
+ * might be way less precise than the clockevent device used during
+ * running state.
+ *
+ * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
+ * returned is always nanoseconds per tick.
+ *
+ * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution
+ * returned is always one nanosecond under the assumption that the
+ * underlying scheduler clock has a better resolution than nanoseconds
+ * per tick.
+ *
+ * For dynamic POSIX clocks (PTP devices) the resolution returned is
+ * always one nanosecond.
+ *
+ * 2) Affect on sys_clock_settime()
+ *
+ * The kernel does not truncate the time which is handed in to
+ * sys_clock_settime(). The kernel internal timekeeping is always using
+ * nanoseconds precision independent of the clocksource device which is
+ * used to read the time from. The resolution of that device only
+ * affects the precision of the time returned by sys_clock_gettime().
+ *
+ * Returns:
+ * 0 Success. @tp contains the resolution
+ * -EINVAL @which_clock is not a valid clock ID
+ * -EFAULT Copying the resolution to @tp faulted
+ * -ENODEV Dynamic POSIX clock is not backed by a device
+ * -EOPNOTSUPP Dynamic POSIX clock does not support getres()
+ */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
@@ -1226,7 +1342,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
#endif
/*
- * nanosleep for monotonic and realtime clocks
+ * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
*/
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
@@ -1238,8 +1354,13 @@ static int common_nsleep(const clockid_t which_clock, int flags,
which_clock);
}
+/*
+ * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ *
+ * Absolute nanosleeps for these clocks are time-namespace adjusted.
+ */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
- const struct timespec64 *rqtp)
+ const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
@@ -1270,6 +1391,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1297,6 +1419,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
@@ -1402,6 +1525,9 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+ [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux,
+#endif
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -1418,3 +1544,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id)
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
+
+static int __init posixtimer_init(void)
+{
+ unsigned long i, size;
+ unsigned int shift;
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_ACCOUNT, NULL);
+
+ if (IS_ENABLED(CONFIG_BASE_SMALL))
+ size = 512;
+ else
+ size = roundup_pow_of_two(512 * num_possible_cpus());
+
+ timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
+ size, 0, 0, &shift, NULL, size, size);
+ size = 1UL << shift;
+ timer_hashmask = size - 1;
+
+ for (i = 0; i < size; i++) {
+ spin_lock_init(&timer_buckets[i].lock);
+ INIT_HLIST_HEAD(&timer_buckets[i].head);
+ }
+ return 0;
+}
+core_initcall(posixtimer_init);
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index f32a2ebba9b8..7f259e845d24 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -1,6 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#define TIMER_RETRY 1
+enum posix_timer_state {
+ POSIX_TIMER_DISARMED,
+ POSIX_TIMER_ARMED,
+ POSIX_TIMER_REQUEUE_PENDING,
+};
+
struct k_clock {
int (*clock_getres)(const clockid_t which_clock,
struct timespec64 *tp);
@@ -35,11 +41,13 @@ extern const struct k_clock clock_posix_dynamic;
extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern const struct k_clock clock_aux;
-int posix_timer_event(struct k_itimer *timr, int si_private);
+void posix_timer_queue_signal(struct k_itimer *timr);
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
struct itimerspec64 *old_setting);
+void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting);
int common_timer_del(struct k_itimer *timer);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 8464c5acc913..f39111830ca3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -64,14 +64,14 @@ static struct clock_data cd ____cacheline_aligned = {
.actual_read_sched_clock = jiffy_sched_clock_read,
};
-static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
{
- *seq = raw_read_seqcount_latch(&cd.seq);
+ *seq = read_seqcount_latch(&cd.seq);
return cd.read_data + (*seq & 1);
}
@@ -80,23 +80,45 @@ notrace int sched_clock_read_retry(unsigned int seq)
return read_seqcount_latch_retry(&cd.seq, seq);
}
-unsigned long long notrace sched_clock(void)
+static __always_inline unsigned long long __sched_clock(void)
{
- u64 cyc, res;
- unsigned int seq;
struct clock_read_data *rd;
+ unsigned int seq;
+ u64 cyc, res;
do {
- rd = sched_clock_read_begin(&seq);
+ seq = raw_read_seqcount_latch(&cd.seq);
+ rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
- } while (sched_clock_read_retry(seq));
+ } while (raw_read_seqcount_latch_retry(&cd.seq, seq));
return res;
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return __sched_clock();
+}
+
+unsigned long long notrace sched_clock(void)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ /*
+ * All of __sched_clock() is a seqcount_latch reader critical section,
+ * but relies on the raw helpers which are uninstrumented. For KCSAN,
+ * mark all accesses in __sched_clock() as atomic.
+ */
+ kcsan_nestable_atomic_begin();
+ ns = __sched_clock();
+ kcsan_nestable_atomic_end();
+ preempt_enable_notrace();
+ return ns;
+}
+
/*
* Updating the data required to read the clock.
*
@@ -109,17 +131,19 @@ unsigned long long notrace sched_clock(void)
*/
static void update_clock_read_data(struct clock_read_data *rd)
{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
-
/* steer readers towards the odd copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch_begin(&cd.seq);
/* now its safe for us to update the normal (even) copy */
cd.read_data[0] = *rd;
/* switch readers back to the even copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch(&cd.seq);
+
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ write_seqcount_latch_end(&cd.seq);
}
/*
@@ -150,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
-void __init
-sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
@@ -223,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pS as sched_clock source\n", read);
}
+EXPORT_SYMBOL_GPL(sched_clock_register);
void __init generic_sched_clock_init(void)
{
@@ -239,8 +263,7 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- sched_clock_timer.function = sched_clock_poll;
+ hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
@@ -257,7 +280,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned int seq = raw_read_seqcount_latch(&cd.seq);
+ unsigned int seq = read_seqcount_latch(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
@@ -273,6 +296,11 @@ int sched_clock_suspend(void)
return 0;
}
+static int sched_clock_syscore_suspend(void *data)
+{
+ return sched_clock_suspend();
+}
+
void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -282,14 +310,23 @@ void sched_clock_resume(void)
rd->read_sched_clock = cd.actual_read_sched_clock;
}
-static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+ sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+ .suspend = sched_clock_syscore_suspend,
+ .resume = sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+ .ops = &sched_clock_syscore_ops,
};
static int __init sched_clock_syscore_init(void)
{
- register_syscore_ops(&sched_clock_ops);
+ register_syscore(&sched_clock_syscore);
return 0;
}
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
new file mode 100644
index 000000000000..3c90574bd904
--- /dev/null
+++ b/kernel/time/sleep_timeout.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Kernel internal schedule timeout and sleeping functions
+ */
+
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+
+#include "tick-internal.h"
+
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
+{
+ struct process_timer *timeout = timer_container_of(timeout, t, timer);
+
+ wake_up_process(timeout->task);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have elapsed.
+ * The function behavior depends on the current task state
+ * (see also set_current_state() description):
+ *
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
+ * at all. That happens because sched_submit_work() does nothing for
+ * tasks in %TASK_RUNNING state.
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns unless the current task is explicitly
+ * woken up, (e.g. by wake_up_process()).
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task or the current task is explicitly woken
+ * up.
+ *
+ * The current task state is guaranteed to be %TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * Returns: 0 when the timer has expired otherwise the remaining time in
+ * jiffies will be returned. In all cases the return value is guaranteed
+ * to be non-negative.
+ */
+signed long __sched schedule_timeout(signed long timeout)
+{
+ struct process_timer timer;
+ unsigned long expire;
+
+ switch (timeout) {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable
+ * in the caller. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative value
+ * but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0) {
+ pr_err("%s: wrong timeout value %lx\n", __func__, timeout);
+ dump_stack();
+ __set_current_state(TASK_RUNNING);
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ timer.task = current;
+ timer_setup_on_stack(&timer.timer, process_timeout, 0);
+ timer.timer.expires = expire;
+ add_timer(&timer.timer);
+ schedule();
+ timer_delete_sync(&timer.timer);
+
+ /* Remove the timer from the object tracker */
+ timer_destroy_on_stack(&timer.timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+EXPORT_SYMBOL(schedule_timeout);
+
+/*
+ * __set_current_state() can be used in schedule_timeout_*() functions, because
+ * schedule_timeout() calls schedule() unconditionally.
+ */
+
+/**
+ * schedule_timeout_interruptible - sleep until timeout (interruptible)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_INTERRUPTIBLE before starting the timeout.
+ */
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+/**
+ * schedule_timeout_killable - sleep until timeout (killable)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_KILLABLE before starting the timeout.
+ */
+signed long __sched schedule_timeout_killable(signed long timeout)
+{
+ __set_current_state(TASK_KILLABLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_killable);
+
+/**
+ * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout.
+ */
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
+/**
+ * schedule_timeout_idle - sleep until timeout (idle)
+ * @timeout: timeout value in jiffies
+ *
+ * See schedule_timeout() for details.
+ *
+ * Task state is set to TASK_IDLE before starting the timeout. It is similar to
+ * schedule_timeout_uninterruptible(), except this task will not contribute to
+ * load average.
+ */
+signed long __sched schedule_timeout_idle(signed long timeout)
+{
+ __set_current_state(TASK_IDLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_idle);
+
+/**
+ * schedule_hrtimeout_range_clock - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode
+ * @clock_id: timer clock to be used
+ *
+ * Details are explained in schedule_hrtimeout_range() function description as
+ * this function is commonly used.
+ */
+int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode, clockid_t clock_id)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && *expires == 0) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_setup_sleeper_on_stack(&t, clock_id, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+ hrtimer_sleeper_start_expires(&t, mode);
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
+
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly
+ * for regular (non RT/DL) tasks.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns unless the current task is explicitly
+ * woken up, (e.g. by wake_up_process()).
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task or the current task is explicitly woken
+ * up.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns: 0 when the timer has expired. If the task was woken before the
+ * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
+ * by an explicit wakeup, it returns -EINTR.
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range_clock(expires, delta, mode,
+ CLOCK_MONOTONIC);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode
+ *
+ * See schedule_hrtimeout_range() for details. @delta argument of
+ * schedule_hrtimeout_range() is set to 0 and has therefore no impact.
+ */
+int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
+
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Requested sleep duration in milliseconds
+ *
+ * msleep() uses jiffy based timeouts for the sleep duration. Because of the
+ * design of the timer wheel, the maximum additional percentage delay (slack) is
+ * 12.5%. This is only valid for timers which will end up in level 1 or a higher
+ * level of the timer wheel. For explanation of those 12.5% please check the
+ * detailed description about the basics of the timer wheel.
+ *
+ * The slack of timers which will end up in level 0 depends on sleep duration
+ * (msecs) and HZ configuration and can be calculated in the following way (with
+ * the timer wheel design restriction that the slack is not less than 12.5%):
+ *
+ * ``slack = MSECS_PER_TICK / msecs``
+ *
+ * When the allowed slack of the callsite is known, the calculation could be
+ * turned around to find the minimal allowed sleep duration to meet the
+ * constraints. For example:
+ *
+ * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
+ * all sleep durations greater or equal 4ms will meet the constraints.
+ * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
+ * all sleep durations greater or equal 8ms will meet the constraints.
+ * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
+ * all sleep durations greater or equal 16ms will meet the constraints.
+ * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
+ * all sleep durations greater or equal 32ms will meet the constraints.
+ *
+ * See also the signal aware variant msleep_interruptible().
+ */
+void msleep(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs);
+
+ while (timeout)
+ timeout = schedule_timeout_uninterruptible(timeout);
+}
+EXPORT_SYMBOL(msleep);
+
+/**
+ * msleep_interruptible - sleep waiting for signals
+ * @msecs: Requested sleep duration in milliseconds
+ *
+ * See msleep() for some basic information.
+ *
+ * The difference between msleep() and msleep_interruptible() is that the sleep
+ * could be interrupted by a signal delivery and then returns early.
+ *
+ * Returns: The remaining time of the sleep duration transformed to msecs (see
+ * schedule_timeout() for details).
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs);
+
+ while (timeout && !signal_pending(current))
+ timeout = schedule_timeout_interruptible(timeout);
+ return jiffies_to_msecs(timeout);
+}
+EXPORT_SYMBOL(msleep_interruptible);
+
+/**
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ * @state: State of the current task that will be while sleeping
+ *
+ * usleep_range_state() sleeps at least for the minimum specified time but not
+ * longer than the maximum specified amount of time. The range might reduce
+ * power usage by allowing hrtimers to coalesce an already scheduled interrupt
+ * with this hrtimer. In the worst case, an interrupt is scheduled for the upper
+ * bound.
+ *
+ * The sleeping task is set to the specified state before starting the sleep.
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range() or its variants instead of udelay(). The sleep improves
+ * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
+ */
+void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state)
+{
+ ktime_t exp = ktime_add_us(ktime_get(), min);
+ u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+ if (WARN_ON_ONCE(max < min))
+ delta = 0;
+
+ for (;;) {
+ __set_current_state(state);
+ /* Do not return before the requested sleep time has elapsed */
+ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
+ break;
+ }
+}
+EXPORT_SYMBOL(usleep_range_state);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 13b11eb62685..783f2297111b 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -149,11 +149,12 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
+ debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);
mutex_unlock(&udelay_test_lock);
}
module_exit(udelay_test_exit);
+MODULE_DESCRIPTION("udelay test module");
MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 797eb93103ad..a88b72b0f35e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
+ */
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
*
- * As this can be called from idle code, the hrtimer_start()
- * invocation has to be wrapped with RCU_NONIDLE() as
- * hrtimer_start() can call into tracing.
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
*/
- RCU_NONIDLE( {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
- /*
- * The core tick broadcast mode expects bc->bound_on to be set
- * correctly to prevent a CPU which has the broadcast hrtimer
- * armed from going deep idle.
- *
- * As tick_broadcast_lock is held, nothing can change the cpu
- * base which was just established in hrtimer_start() above. So
- * the below access is safe even without holding the hrtimer
- * base lock.
- */
- bc->bound_on = bctimer.base->cpu_base->cpu;
- } );
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+
return 0;
}
@@ -105,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- bctimer.function = bc_handler;
+ hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f7fe6fe36173..0207868c8b4d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -35,14 +35,15 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
#ifdef CONFIG_TICK_ONESHOT
static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# ifdef CONFIG_HOTPLUG_CPU
static void tick_broadcast_oneshot_offline(unsigned int cpu);
# endif
#else
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
+static inline void
+tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
# ifdef CONFIG_HOTPLUG_CPU
@@ -264,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
ret = 1;
} else {
/*
@@ -500,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, false);
}
}
out:
@@ -622,9 +623,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
* to avoid a deep idle transition as we are about to get the
* broadcast IPI right away.
*/
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
{
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+ return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
}
/*
@@ -1015,49 +1020,104 @@ static inline ktime_t tick_get_next_period(void)
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
+ * @bc: the broadcast device
+ * @from_periodic: true if called from periodic mode
*/
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
+ bool from_periodic)
{
int cpu = smp_processor_id();
+ ktime_t nexttick = 0;
if (!bc)
return;
- /* Set it up only once ! */
- if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = clockevent_state_periodic(bc);
-
- bc->event_handler = tick_handle_oneshot_broadcast;
-
+ /*
+ * When the broadcast device was switched to oneshot by the first
+ * CPU handling the NOHZ change, the other CPUs will reach this
+ * code via hrtimer_run_queues() -> tick_check_oneshot_change()
+ * too. Set up the broadcast device only once!
+ */
+ if (bc->event_handler == tick_handle_oneshot_broadcast) {
/*
- * We must be careful here. There might be other CPUs
- * waiting for periodic broadcast. We need to set the
- * oneshot_mask bits for those and program the
- * broadcast device to fire.
+ * The CPU which switched from periodic to oneshot mode
+ * set the broadcast oneshot bit for all other CPUs which
+ * are in the general (periodic) broadcast mask to ensure
+ * that CPUs which wait for the periodic broadcast are
+ * woken up.
+ *
+ * Clear the bit for the local CPU as the set bit would
+ * prevent the first tick_broadcast_enter() after this CPU
+ * switched to oneshot state to program the broadcast
+ * device.
+ *
+ * This code can also be reached via tick_broadcast_control(),
+ * but this cannot avoid the tick_broadcast_clear_oneshot()
+ * as that would break the periodic to oneshot transition of
+ * secondary CPUs. But that's harmless as the below only
+ * clears already cleared bits.
*/
+ tick_broadcast_clear_oneshot(cpu);
+ return;
+ }
+
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event = KTIME_MAX;
+
+ /*
+ * When the tick mode is switched from periodic to oneshot it must
+ * be ensured that CPUs which are waiting for periodic broadcast
+ * get their wake-up at the next tick. This is achieved by ORing
+ * tick_broadcast_mask into tick_broadcast_oneshot_mask.
+ *
+ * For other callers, e.g. broadcast device replacement,
+ * tick_broadcast_oneshot_mask must not be touched as this would
+ * set bits for CPUs which are already NOHZ, but not idle. Their
+ * next tick_broadcast_enter() would observe the bit set and fail
+ * to update the expiry time and the broadcast event device.
+ */
+ if (from_periodic) {
cpumask_copy(tmpmask, tick_broadcast_mask);
+ /* Remove the local CPU as it is obviously not idle */
cpumask_clear_cpu(cpu, tmpmask);
- cpumask_or(tick_broadcast_oneshot_mask,
- tick_broadcast_oneshot_mask, tmpmask);
+ cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);
- if (was_periodic && !cpumask_empty(tmpmask)) {
- ktime_t nextevt = tick_get_next_period();
+ /*
+ * Ensure that the oneshot broadcast handler will wake the
+ * CPUs which are still waiting for periodic broadcast.
+ */
+ nexttick = tick_get_next_period();
+ tick_broadcast_init_next_event(tmpmask, nexttick);
- clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- tick_broadcast_init_next_event(tmpmask, nextevt);
- tick_broadcast_set_event(bc, cpu, nextevt);
- } else
- bc->next_event = KTIME_MAX;
- } else {
/*
- * The first cpu which switches to oneshot mode sets
- * the bit for all other cpus which are in the general
- * (periodic) broadcast mask. So the bit is set and
- * would prevent the first broadcast enter after this
- * to program the bc device.
+ * If the underlying broadcast clock event device is
+ * already in oneshot state, then there is nothing to do.
+ * The device was already armed for the next tick
+ * in tick_handle_broadcast_periodic()
*/
- tick_broadcast_clear_oneshot(cpu);
+ if (clockevent_state_oneshot(bc))
+ return;
}
+
+ /*
+ * When switching from periodic to oneshot mode arm the broadcast
+ * device for the next tick.
+ *
+ * If the broadcast device has been replaced in oneshot mode and
+ * the oneshot broadcast mask is not empty, then arm it to expire
+ * immediately in order to reevaluate the next expiring timer.
+ * @nexttick is 0 and therefore in the past which will cause the
+ * clockevent code to force an event.
+ *
+ * For both cases the programming can be avoided when the oneshot
+ * broadcast mask is empty.
+ *
+ * tick_broadcast_set_event() implicitly switches the broadcast
+ * device to oneshot state.
+ */
+ if (!cpumask_empty(tick_broadcast_oneshot_mask))
+ tick_broadcast_set_event(bc, cpu, nexttick);
}
/*
@@ -1066,14 +1126,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
void tick_broadcast_switch_to_oneshot(void)
{
struct clock_event_device *bc;
+ enum tick_device_mode oldmode;
unsigned long flags;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ oldmode = tick_broadcast_device.mode;
tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
bc = tick_broadcast_device.evtdev;
if (bc)
- tick_broadcast_setup_oneshot(bc);
+ tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -1088,6 +1150,30 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
bc = tick_broadcast_device.evtdev;
if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /*
+ * If the broadcast force bit of the current CPU is set,
+ * then the current CPU has not yet reprogrammed the local
+ * timer device to avoid a ping-pong race. See
+ * ___tick_broadcast_oneshot_control().
+ *
+ * If the broadcast device is hrtimer based then
+ * programming the broadcast event below does not have any
+ * effect because the local clockevent device is not
+ * running and not programmed because the broadcast event
+ * is not earlier than the pending event of the local clock
+ * event device. As a consequence all CPUs waiting for a
+ * broadcast event are stuck forever.
+ *
+ * Detect this condition and reprogram the cpu local timer
+ * device to avoid the starvation.
+ */
+ if (tick_check_broadcast_expired()) {
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask);
+ tick_program_event(td->evtdev->next_event, 1);
+ }
+
/* This moves the broadcast assignment to this CPU: */
clockevents_program_event(bc, bc->next_event, 1);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..7e33d3f2e889 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -7,6 +7,7 @@
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
+#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
@@ -84,7 +85,7 @@ int tick_is_oneshot_available(void)
*/
static void tick_periodic(int cpu)
{
- if (tick_do_timer_cpu == cpu) {
+ if (READ_ONCE(tick_do_timer_cpu) == cpu) {
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
@@ -111,15 +112,13 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
-#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
/*
* The cpu might have transitioned to HIGHRES or NOHZ mode via
* update_process_times() -> run_local_timers() ->
* hrtimer_run_queues().
*/
- if (dev->event_handler != tick_handle_periodic)
+ if (IS_ENABLED(CONFIG_TICK_ONESHOT) && dev->event_handler != tick_handle_periodic)
return;
-#endif
if (!clockevent_state_oneshot(dev))
return;
@@ -179,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
-#ifdef CONFIG_NO_HZ_FULL
-static void giveup_do_timer(void *info)
-{
- int cpu = *(unsigned int *)info;
-
- WARN_ON(tick_do_timer_cpu != smp_processor_id());
-
- tick_do_timer_cpu = cpu;
-}
-
-static void tick_take_do_timer_from_boot(void)
-{
- int cpu = smp_processor_id();
- int from = tick_do_timer_boot_cpu;
-
- if (from >= 0 && from != cpu)
- smp_call_function_single(from, giveup_do_timer, &cpu, 1);
-}
-#endif
-
/*
* Setup the tick device
*/
@@ -217,25 +196,30 @@ static void tick_setup_device(struct tick_device *td,
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
- if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- tick_do_timer_cpu = cpu;
-
+ if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
- * The boot CPU may be nohz_full, in which case set
- * tick_do_timer_boot_cpu so the first housekeeping
- * secondary that comes up will take do_timer from
- * us.
+ * The boot CPU may be nohz_full, in which case the
+ * first housekeeping secondary will take do_timer()
+ * from it.
*/
if (tick_nohz_full_cpu(cpu))
tick_do_timer_boot_cpu = cpu;
- } else if (tick_do_timer_boot_cpu != -1 &&
- !tick_nohz_full_cpu(cpu)) {
- tick_take_do_timer_from_boot();
+ } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) {
tick_do_timer_boot_cpu = -1;
- WARN_ON(tick_do_timer_cpu != cpu);
+ /*
+ * The boot CPU will stay in periodic (NOHZ disabled)
+ * mode until clocksource_done_booting() called after
+ * smp_init() selects a high resolution clocksource and
+ * timekeeping_notify() kicks the NOHZ stuff alive.
+ *
+ * So this WRITE_ONCE can only race with the READ_ONCE
+ * check in tick_periodic() but this race is harmless.
+ */
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
#endif
}
@@ -399,37 +383,46 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
#ifdef CONFIG_HOTPLUG_CPU
+void tick_assert_timekeeping_handover(void)
+{
+ WARN_ON_ONCE(tick_do_timer_cpu == smp_processor_id());
+}
/*
- * Transfer the do_timer job away from a dying cpu.
- *
- * Called with interrupts disabled. No locking required. If
- * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ * Stop the tick and transfer the timekeeping job away from a dying cpu.
*/
-void tick_handover_do_timer(void)
+int tick_cpu_dying(unsigned int dying_cpu)
{
- if (tick_do_timer_cpu == smp_processor_id())
+ /*
+ * If the current CPU is the timekeeper, it's the only one that can
+ * safely hand over its duty. Also all online CPUs are in stop
+ * machine, guaranteed not to be idle, therefore there is no
+ * concurrency and it's safe to pick any online successor.
+ */
+ if (tick_do_timer_cpu == dying_cpu)
tick_do_timer_cpu = cpumask_first(cpu_online_mask);
+
+ /* Make sure the CPU won't try to retake the timekeeping duty */
+ tick_sched_timer_dying(dying_cpu);
+
+ /* Remove CPU from timer broadcasting */
+ tick_offline_cpu(dying_cpu);
+
+ return 0;
}
/*
- * Shutdown an event device on a given cpu:
+ * Shutdown an event device on the outgoing CPU:
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * Called by the dying CPU during teardown, with clockevents_lock held
+ * and interrupts disabled.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
@@ -510,6 +503,7 @@ void tick_resume(void)
#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP);
static unsigned int tick_freeze_depth;
/**
@@ -529,9 +523,22 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ /*
+ * All other CPUs have their interrupts disabled and are
+ * suspended to idle. Other tasks have been frozen so there
+ * is no scheduling happening. This means that there is no
+ * concurrency in the system at this point. Therefore it is
+ * okay to acquire a sleeping lock on PREEMPT_RT, such as a
+ * spinlock, because the lock cannot be held by other CPUs
+ * or threads and acquiring it cannot block.
+ *
+ * Inform lockdep about the situation.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
system_state = SYSTEM_SUSPEND;
sched_clock_suspend();
timekeeping_suspend();
+ lock_map_release(&tick_freeze_map);
} else {
tick_suspend_local();
}
@@ -553,8 +560,16 @@ void tick_unfreeze(void)
raw_spin_lock(&tick_freeze_lock);
if (tick_freeze_depth == num_online_cpus()) {
+ /*
+ * Similar to tick_freeze(). On resumption the first CPU may
+ * acquire uncontended sleeping locks while other CPUs block on
+ * tick_freeze_lock.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
timekeeping_resume();
sched_clock_resume();
+ lock_map_release(&tick_freeze_map);
+
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 649f2b48e8f0..4e4f7bbe2a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -8,6 +8,11 @@
#include "timekeeping.h"
#include "tick-sched.h"
+struct timer_events {
+ u64 local;
+ u64 global;
+};
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS
# define TICK_DO_TIMER_NONE -1
@@ -20,7 +25,8 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_shutdown(unsigned int cpu);
+extern void tick_offline_cpu(unsigned int cpu);
+extern void tick_shutdown(void);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -56,7 +62,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -153,8 +158,16 @@ static inline void tick_nohz_init(void) { }
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
extern void timers_update_nohz(void);
+extern u64 get_jiffies_update(unsigned long *basej);
# ifdef CONFIG_SMP
extern struct static_key_false timers_migration_enabled;
+extern void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
+ struct timer_events *tevt,
+ unsigned int cpu);
+extern void timer_lock_remote_bases(unsigned int cpu);
+extern void timer_unlock_remote_bases(unsigned int cpu);
+extern bool timer_base_is_idle(void);
+extern void timer_expire_remote(unsigned int cpu);
# endif
#else /* CONFIG_NO_HZ_COMMON */
static inline void timers_update_nohz(void) { }
@@ -164,6 +177,7 @@ static inline void timers_update_nohz(void) { }
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle);
void timer_clear_idle(void);
#define CLOCK_SET_WALL \
@@ -197,3 +211,5 @@ void hrtimers_resume_local(void);
#else
#define JIFFIES_SHIFT 8
#endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5e2c2c26b3cc..ffee943d796d 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -19,6 +19,10 @@
/**
* tick_program_event - program the CPU local timer device for the next event
+ * @expires: the time at which the next timer event should occur
+ * @force: flag to force reprograming even if the event time hasn't changed
+ *
+ * Return: 0 on success, negative error code on failure
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -57,6 +61,13 @@ void tick_resume_oneshot(void)
/**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ * @newdev: Pointer to the clock event device to configure
+ * @handler: Function to be called when the event device triggers an interrupt
+ * @next_event: Initial expiry time for the next event (in ktime)
+ *
+ * Configures the specified clock event device for onshot mode,
+ * assigns the given handler as its event callback, and programs
+ * the device to trigger at the specified next event time.
*/
void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
/**
* tick_switch_to_oneshot - switch to oneshot mode
+ * @handler: function to call when an event occurs on the tick device
+ *
+ * Return: 0 on success, -EINVAL if the tick device is not present,
+ * not functional, or does not support oneshot mode.
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
@@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
/**
* tick_oneshot_mode_active - check whether the system is in oneshot mode
*
- * returns 1 when either nohz or highres are enabled. otherwise 0.
+ * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/
int tick_oneshot_mode_active(void)
{
@@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void)
* tick_init_highres - switch to high resolution mode
*
* Called with interrupts disabled.
+ *
+ * Return: 0 on success, -EINVAL if the tick device cannot switch
+ * to oneshot/high-resolution mode.
*/
int tick_init_highres(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..8ddf74e705d3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -4,10 +4,11 @@
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
- * No idle tick implementation for low and high resolution timers
+ * NOHZ implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
+#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
@@ -43,9 +44,8 @@ struct tick_sched *tick_get_tick_sched(int cpu)
return &per_cpu(tick_cpu_sched, cpu);
}
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
- * The time, when the last jiffy update happened. Write access must hold
+ * The time when the last jiffy update happened. Write access must hold
* jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
* consistent view of jiffies and last_jiffies_update.
*/
@@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now)
ktime_t delta, nextp;
/*
- * 64bit can do a quick check without holding jiffies lock and
+ * 64-bit can do a quick check without holding the jiffies lock and
* without looking at the sequence count. The smp_load_acquire()
* pairs with the update done later in this function.
*
- * 32bit cannot do that because the store of tick_next_period
- * consists of two 32bit stores and the first store could move it
- * to a random point in the future.
+ * 32-bit cannot do that because the store of 'tick_next_period'
+ * consists of two 32-bit stores, and the first store could be
+ * moved by the CPU to a random point in the future.
*/
if (IS_ENABLED(CONFIG_64BIT)) {
if (ktime_before(now, smp_load_acquire(&tick_next_period)))
@@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now)
unsigned int seq;
/*
- * Avoid contention on jiffies_lock and protect the quick
+ * Avoid contention on 'jiffies_lock' and protect the quick
* check with the sequence count.
*/
do {
@@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Quick check failed, i.e. update is required. */
raw_spin_lock(&jiffies_lock);
/*
- * Reevaluate with the lock held. Another CPU might have done the
+ * Re-evaluate with the lock held. Another CPU might have done the
* update already.
*/
if (ktime_before(now, tick_next_period)) {
@@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now)
TICK_NSEC);
}
- /* Advance jiffies to complete the jiffies_seq protected job */
+ /* Advance jiffies to complete the 'jiffies_seq' protected job */
jiffies_64 += ticks;
- /*
- * Keep the tick_next_period variable up to date.
- */
+ /* Keep the tick_next_period variable up to date */
nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
if (IS_ENABLED(CONFIG_64BIT)) {
/*
* Pairs with smp_load_acquire() in the lockless quick
- * check above and ensures that the update to jiffies_64 is
- * not reordered vs. the store to tick_next_period, neither
+ * check above, and ensures that the update to 'jiffies_64' is
+ * not reordered vs. the store to 'tick_next_period', neither
* by the compiler nor by the CPU.
*/
smp_store_release(&tick_next_period, nextp);
} else {
/*
- * A plain store is good enough on 32bit as the quick check
+ * A plain store is good enough on 32-bit, as the quick check
* above is protected by the sequence count.
*/
tick_next_period = nextp;
@@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Release the sequence count. calc_global_load() below is not
- * protected by it, but jiffies_lock needs to be held to prevent
+ * protected by it, but 'jiffies_lock' needs to be held to prevent
* concurrent invocations.
*/
write_seqcount_end(&jiffies_seq);
@@ -160,75 +158,132 @@ static ktime_t tick_init_jiffy_update(void)
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
- /* Did we start the jiffies update yet ? */
- if (last_jiffies_update == 0)
+
+ /* Have we started the jiffies update yet ? */
+ if (last_jiffies_update == 0) {
+ u32 rem;
+
+ /*
+ * Ensure that the tick is aligned to a multiple of
+ * TICK_NSEC.
+ */
+ div_u64_rem(tick_next_period, TICK_NSEC, &rem);
+ if (rem)
+ tick_next_period += TICK_NSEC - rem;
+
last_jiffies_update = tick_next_period;
+ }
period = last_jiffies_update;
+
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
+
return period;
}
+static inline int tick_sched_flag_test(struct tick_sched *ts,
+ unsigned long flag)
+{
+ return !!(ts->flags & flag);
+}
+
+static inline void tick_sched_flag_set(struct tick_sched *ts,
+ unsigned long flag)
+{
+ lockdep_assert_irqs_disabled();
+ ts->flags |= flag;
+}
+
+static inline void tick_sched_flag_clear(struct tick_sched *ts,
+ unsigned long flag)
+{
+ lockdep_assert_irqs_disabled();
+ ts->flags &= ~flag;
+}
+
+/*
+ * Allow only one non-timekeeper CPU at a time update jiffies from
+ * the timer tick.
+ *
+ * Returns true if update was run.
+ */
+static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
+{
+ static atomic_t in_progress;
+ int inp;
+
+ inp = atomic_read(&in_progress);
+ if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
+ return false;
+
+ if (ts->last_tick_jiffies == jiffies)
+ tick_do_update_jiffies64(now);
+ atomic_set(&in_progress, 0);
+ return true;
+}
+
#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
- int cpu = smp_processor_id();
+ int tick_cpu, cpu = smp_processor_id();
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the CPU in charge went
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
- * jiffies_lock.
+ * 'jiffies_lock'.
*
* If nohz_full is enabled, this should not happen because the
- * tick_do_timer_cpu never relinquishes.
+ * 'tick_do_timer_cpu' CPU never relinquishes.
*/
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
WARN_ON_ONCE(tick_nohz_full_running);
#endif
- tick_do_timer_cpu = cpu;
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
+ tick_cpu = cpu;
}
-#endif
- /* Check, if the jiffies need an update */
- if (tick_do_timer_cpu == cpu)
+ /* Check if jiffies need an update */
+ if (tick_cpu == cpu)
tick_do_update_jiffies64(now);
/*
- * If jiffies update stalled for too long (timekeeper in stop_machine()
+ * If the jiffies update stalled for too long (timekeeper in stop_machine()
* or VMEXIT'ed for several msecs), force an update.
*/
if (ts->last_tick_jiffies != jiffies) {
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_limited_update_jiffies64(ts, now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
- if (ts->inidle)
+ if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
-#ifdef CONFIG_NO_HZ_COMMON
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
+ * time. This happens on completely idle SMP systems while
* waiting on the login prompt. We also increment the "start of
* idle" jiffy stamp so the idle accounting adjustment we do
- * when we go busy again does not account too much ticks.
+ * when we go busy again does not account too many ticks.
*/
- if (ts->tick_stopped) {
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
+ tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
@@ -239,11 +294,44 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
*/
ts->next_tick = 0;
}
-#endif
+
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
-#endif
+
+/*
+ * We rearm the timer until we get disabled by the idle code.
+ * Called with interrupts disabled.
+ */
+static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
+{
+ struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+
+ tick_sched_do_timer(ts, now);
+
+ /*
+ * Do not call when we are not in IRQ context and have
+ * no valid 'regs' pointer
+ */
+ if (regs)
+ tick_sched_handle(ts, regs);
+ else
+ ts->next_tick = 0;
+
+ /*
+ * In dynticks mode, tick reprogram is deferred:
+ * - to the idle task if in dynticks-idle
+ * - to IRQ exit if in full-dynticks.
+ */
+ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward(timer, now, TICK_NSEC);
+
+ return HRTIMER_RESTART;
+}
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
@@ -281,6 +369,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU_EXP) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+ return true;
+ }
+
return false;
}
@@ -346,7 +439,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
/*
* If the task is not running, run_posix_cpu_timers()
- * has nothing to elapse, IPI can then be spared.
+ * has nothing to elapse, and an IPI can then be optimized out.
*
* activate_task() STORE p->tick_dep_mask
* STORE p->on_rq
@@ -355,6 +448,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
* smp_mb__after_spin_lock()
* tick_nohz_task_switch()
* LOAD p->tick_dep_mask
+ *
+ * XXX given a task picks up the dependency on schedule(), should we
+ * only care about tasks that are currently on the CPU instead of all
+ * that are on the runqueue?
+ *
+ * That is, does this want to be: task_on_cpu() / task_curr()?
*/
if (!sched_task_on_rq(tsk))
return;
@@ -409,7 +508,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
/*
* Set a global tick dependency. Used by perf events that rely on freq and
- * by unstable clock.
+ * unstable clocks.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
@@ -423,7 +522,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
- * manage events throttling.
+ * manage event-throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -439,7 +538,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
- /* Remote irq work not NMI-safe */
+ /* Remote IRQ work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
@@ -457,7 +556,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
- * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
* in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
@@ -512,7 +611,7 @@ void __tick_nohz_task_switch(void)
ts = this_cpu_ptr(&tick_cpu_sched);
- if (ts->tick_stopped) {
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
if (atomic_read(&current->tick_dep_mask) ||
atomic_read(&current->signal->tick_dep_mask))
tick_nohz_full_kick();
@@ -527,16 +626,21 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
tick_nohz_full_running = true;
}
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
- * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+ * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
* timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
- if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -547,12 +651,12 @@ void __init tick_nohz_init(void)
return;
/*
- * Full dynticks uses irq work to drive the tick rescheduling on safe
- * locking contexts. But then we need irq work to raise its own
- * interrupts to avoid circular dependency on the tick
+ * Full dynticks uses IRQ work to drive the tick rescheduling on safe
+ * locking contexts. But then we need IRQ work to raise its own
+ * interrupts to avoid circular dependency on the tick.
*/
if (!arch_irq_work_has_interrupt()) {
- pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
+ pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
@@ -579,7 +683,7 @@ void __init tick_nohz_init(void)
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
}
-#endif
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
/*
* NOHZ - aka dynamic tick functionality
@@ -604,25 +708,26 @@ bool tick_nohz_tick_stopped(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- return ts->tick_stopped;
+ return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}
bool tick_nohz_tick_stopped_cpu(int cpu)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- return ts->tick_stopped;
+ return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ * @now: current ktime_t
*
* Called from interrupt entry when the CPU was idle
*
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
* value. We do this unconditionally on any CPU, as we don't know whether the
- * CPU, which has the update task assigned is in a long sleep.
+ * CPU, which has the update task assigned, is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
@@ -637,43 +742,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
- ts->idle_active = 0;
+ ts->idle_entrytime = now;
+ tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
- ts->idle_active = 1;
+ tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -681,37 +810,22 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note that this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -722,36 +836,22 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -763,7 +863,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
@@ -771,7 +871,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
}
/*
- * Reset to make sure next tick stop doesn't get fooled by past
+ * Reset to make sure the next tick stop doesn't get fooled by past
* cached clock deadline.
*/
ts->next_tick = 0;
@@ -779,32 +879,55 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
static inline bool local_timer_softirq_pending(void)
{
- return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+ return local_timers_pending() & BIT(TIMER_SOFTIRQ);
}
-static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
+/*
+ * Read jiffies and the time when jiffies were updated last
+ */
+u64 get_jiffies_update(unsigned long *basej)
{
- u64 basemono, next_tick, delta, expires;
unsigned long basejiff;
unsigned int seq;
+ u64 basemono;
- /* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
} while (read_seqcount_retry(&jiffies_seq, seq));
+ *basej = basejiff;
+ return basemono;
+}
+
+/**
+ * tick_nohz_next_event() - return the clock monotonic based next event
+ * @ts: pointer to tick_sched struct
+ * @cpu: CPU number
+ *
+ * Return:
+ * *%0 - When the next event is a maximum of TICK_NSEC in the future
+ * and the tick is not stopped yet
+ * *%next_event - Next event based on clock monotonic
+ */
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
+{
+ u64 basemono, next_tick, delta, expires;
+ unsigned long basejiff;
+ int tick_cpu;
+
+ basemono = get_jiffies_update(&basejiff);
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
/*
* Keep the periodic tick, when RCU, architecture or irq_work
* requests it.
- * Aside of that check whether the local timer softirq is
- * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * Aside of that, check whether the local timer softirq is
+ * pending. If so, its a bad idea to call get_next_timer_interrupt(),
* because there is an already expired timer, so it will request
* immediate expiry, which rearms the hardware timer with a
- * minimal delta which brings us back to this place
+ * minimal delta, which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
if (rcu_needs_cpu() || arch_needs_cpu() ||
@@ -822,6 +945,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
ts->next_timer = next_tick;
}
+ /* Make sure next_tick is never before basemono! */
+ if (WARN_ON_ONCE(basemono > next_tick))
+ next_tick = basemono;
+
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
@@ -829,15 +956,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
/*
- * Tell the timer code that the base is not idle, i.e. undo
- * the effect of get_next_timer_interrupt():
- */
- timer_clear_idle();
- /*
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
*/
- if (!ts->tick_stopped) {
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
ts->timer_expires = 0;
goto out;
}
@@ -845,12 +967,13 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/*
* If this CPU is the one which had the do_timer() duty last, we limit
- * the sleep time to the timekeeping max_deferment value.
+ * the sleep time to the timekeeping 'max_deferment' value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
- if (cpu != tick_do_timer_cpu &&
- (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+ if (tick_cpu != cpu &&
+ (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
delta = KTIME_MAX;
/* Calculate the next expiry time */
@@ -868,76 +991,103 @@ out:
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ unsigned long basejiff = ts->last_jiffies;
u64 basemono = ts->timer_expires_base;
- u64 expires = ts->timer_expires;
- ktime_t tick = expires;
+ bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
+ int tick_cpu;
+ u64 expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
/*
+ * Now the tick should be stopped definitely - so the timer base needs
+ * to be marked idle as well to not miss a newly queued timer.
+ */
+ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
+ if (expires > ts->timer_expires) {
+ /*
+ * This path could only happen when the first timer was removed
+ * between calculating the possible sleep length and now (when
+ * high resolution mode is not active, timer could also be a
+ * hrtimer).
+ *
+ * We have to stick to the original calculated expiry value to
+ * not stop the tick for too long with a shallow C-state (which
+ * was programmed by cpuidle because of an early next expiration
+ * value).
+ */
+ expires = ts->timer_expires;
+ }
+
+ /* If the timer base is not idle, retain the not yet stopped tick. */
+ if (!timer_idle)
+ return;
+
+ /*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
- * don't drop this here the jiffies might be stale and
- * do_timer() never invoked. Keep track of the fact that it
+ * don't drop this here, the jiffies might be stale and
+ * do_timer() never gets invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
- if (cpu == tick_do_timer_cpu) {
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- ts->do_timer_last = 1;
- } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- ts->do_timer_last = 0;
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+ if (tick_cpu == cpu) {
+ WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
+ tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
+ } else if (tick_cpu != TICK_DO_TIMER_NONE) {
+ tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
}
- /* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == ts->next_tick)) {
+ /* Skip reprogram of event if it's not changed */
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
- if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
- WARN_ON_ONCE(1);
- printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
- basemono, ts->next_tick, dev->next_event,
- hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
+ "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
+ dev->next_event, hrtimer_active(&ts->sched_timer),
+ hrtimer_get_expires(&ts->sched_timer));
}
/*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
+ * tick_nohz_stop_tick() can be called several times before
+ * tick_nohz_restart_sched_tick() is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the first
+ * call we save the current tick time, so we can restart the
+ * scheduler tick in tick_nohz_restart_sched_tick().
*/
- if (!ts->tick_stopped) {
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
calc_load_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
- ts->tick_stopped = 1;
+ tick_sched_flag_set(ts, TS_FLAG_STOPPED);
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
- ts->next_tick = tick;
+ ts->next_tick = expires;
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
*/
if (unlikely(expires == KTIME_MAX)) {
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
hrtimer_cancel(&ts->sched_timer);
else
tick_program_event(KTIME_MAX, 1);
return;
}
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, tick,
+ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
+ hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- hrtimer_set_expires(&ts->sched_timer, tick);
- tick_program_event(tick, 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
}
@@ -947,7 +1097,7 @@ static void tick_nohz_retain_tick(struct tick_sched *ts)
}
#ifdef CONFIG_NO_HZ_FULL
-static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
if (tick_nohz_next_event(ts, cpu))
tick_nohz_stop_tick(ts, cpu);
@@ -969,10 +1119,9 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
+
+ /* Cancel the scheduled timer and restore the tick: */
+ tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
tick_nohz_restart(ts, now);
}
@@ -983,8 +1132,8 @@ static void __tick_nohz_full_update_tick(struct tick_sched *ts,
int cpu = smp_processor_id();
if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, cpu);
- else if (ts->tick_stopped)
+ tick_nohz_full_stop_tick(ts, cpu);
+ else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
tick_nohz_restart_sched_tick(ts, now);
#endif
}
@@ -994,7 +1143,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
- if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+ if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
return;
__tick_nohz_full_update_tick(ts, ktime_get());
@@ -1003,11 +1152,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
/*
* A pending softirq outside an IRQ (or softirq disabled section) context
* should be waiting for ksoftirqd to handle it. Therefore we shouldn't
- * reach here due to the need_resched() early check in can_stop_idle_tick().
+ * reach this code due to the need_resched() early check in can_stop_idle_tick().
*
* However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
* cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
- * triggering the below since wakep_softirqd() is ignored.
+ * triggering the code below, since wakep_softirqd() is ignored.
*
*/
static bool report_idle_softirq(void)
@@ -1025,41 +1174,24 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit < 10)
- return false;
-
- /* On RT, softirqs handling may be waiting on some lock */
- if (!local_bh_blocked())
+ /* On RT, softirq handling may be waiting on some lock */
+ if (local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
- /*
- * If this CPU is offline and it is the one which updates
- * jiffies, then give up the assignment and let it be taken by
- * the CPU which runs the tick timer next. If we don't drop
- * this here the jiffies might be stale and do_timer() never
- * invoked.
- */
- if (unlikely(!cpu_online(cpu))) {
- if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- /*
- * Make sure the CPU doesn't get fooled by obsolete tick
- * deadline if it comes back online later.
- */
- ts->next_tick = 0;
- return false;
- }
+ WARN_ON_ONCE(cpu_is_offline(cpu));
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
return false;
if (need_resched())
@@ -1069,25 +1201,33 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
if (tick_nohz_full_enabled()) {
+ int tick_cpu = READ_ONCE(tick_do_timer_cpu);
+
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around
*/
- if (tick_do_timer_cpu == cpu)
+ if (tick_cpu == cpu)
return false;
/* Should not happen for nohz-full */
- if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
return false;
}
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1103,14 +1243,14 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
ts->idle_calls++;
if (expires > 0LL) {
- int was_stopped = ts->tick_stopped;
+ int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
tick_nohz_stop_tick(ts, cpu);
ts->idle_sleeps++;
ts->idle_expires = expires;
- if (!was_stopped && ts->tick_stopped) {
+ if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
@@ -1119,24 +1259,9 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
- /*
- * Undo the effect of get_next_timer_interrupt() called from
- * tick_nohz_next_event().
- */
- timer_clear_idle();
}
/**
@@ -1156,25 +1281,36 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(ts->timer_expires_base);
- ts->inidle = 1;
+ tick_sched_flag_set(ts, TS_FLAG_INIDLE);
tick_nohz_start_idle(ts);
local_irq_enable();
}
/**
- * tick_nohz_irq_exit - update next tick event from interrupt exit
+ * tick_nohz_irq_exit - Notify the tick about IRQ exit
+ *
+ * A timer may have been added/modified/deleted either by the current IRQ,
+ * or by another place using this IRQ as a notification. This IRQ may have
+ * also updated the RCU callback list. These events may require a
+ * re-evaluation of the next tick. Depending on the context:
*
- * When an interrupt fires while we are idle and it doesn't cause
- * a reschedule, it may still add, modify or delete a timer, enqueue
- * an RCU callback, etc...
- * So we need to re-calculate and reprogram the next tick event.
+ * 1) If the CPU is idle and no resched is pending, just proceed with idle
+ * time accounting. The next tick will be re-evaluated on the next idle
+ * loop iteration.
+ *
+ * 2) If the CPU is nohz_full:
+ *
+ * 2.1) If there is any tick dependency, restart the tick if stopped.
+ *
+ * 2.2) If there is no tick dependency, (re-)evaluate the next tick and
+ * stop/update it accordingly.
*/
void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (ts->inidle)
+ if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
tick_nohz_start_idle(ts);
else
tick_nohz_full_update_tick(ts);
@@ -1182,6 +1318,8 @@ void tick_nohz_irq_exit(void)
/**
* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ *
+ * Return: %true if the tick handler has run, otherwise %false
*/
bool tick_nohz_idle_got_tick(void)
{
@@ -1196,10 +1334,12 @@ bool tick_nohz_idle_got_tick(void)
/**
* tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
- * or the tick, whatever that expires first. Note that, if the tick has been
+ * or the tick, whichever expires first. Note that, if the tick has been
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
+ *
+ * Return: the next expiration time
*/
ktime_t tick_nohz_get_next_hrtimer(void)
{
@@ -1215,6 +1355,8 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* The return value of this function and/or the value returned by it through the
* @delta_next pointer can be negative which must be taken into account by its
* callers.
+ *
+ * Return: the expected length of the current sleep
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
@@ -1228,7 +1370,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
ktime_t now = ts->idle_entrytime;
ktime_t next_event;
- WARN_ON_ONCE(!ts->inidle);
+ WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
*delta_next = ktime_sub(dev->next_event, now);
@@ -1240,7 +1382,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
return *delta_next;
/*
- * If the next highres timer to expire is earlier than next_event, the
+ * If the next highres timer to expire is earlier than 'next_event', the
* idle governor needs to know that.
*/
next_event = min_t(u64, next_event,
@@ -1252,8 +1394,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
/**
* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
* for a particular CPU.
+ * @cpu: target CPU number
*
* Called from the schedutil frequency scaling governor in scheduler context.
+ *
+ * Return: the current idle calls counter value for @cpu
*/
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
@@ -1262,18 +1407,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}
-/**
- * tick_nohz_get_idle_calls - return the current idle calls counter value
- *
- * Called from the schedutil frequency scaling governor in scheduler context.
- */
-unsigned long tick_nohz_get_idle_calls(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- return ts->idle_calls;
-}
-
static void tick_nohz_account_idle_time(struct tick_sched *ts,
ktime_t now)
{
@@ -1284,9 +1417,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts,
if (vtime_accounting_enabled_this_cpu())
return;
/*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
+ * We stopped the tick in idle. update_process_times() would miss the
+ * time we slept, as it does only a 1 tick accounting.
+ * Enforce that this is accounted to idle !
*/
ticks = jiffies - ts->idle_jiffies;
/*
@@ -1300,7 +1433,7 @@ void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (ts->tick_stopped) {
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
ktime_t now = ktime_get();
tick_nohz_restart_sched_tick(ts, now);
tick_nohz_account_idle_time(ts, now);
@@ -1318,11 +1451,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
}
/**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - Update the tick upon idle task exit
+ *
+ * When the idle task exits, update the tick depending on the
+ * following situations:
+ *
+ * 1) If the CPU is not in nohz_full mode (most cases), then
+ * restart the tick.
+ *
+ * 2) If the CPU is in nohz_full mode (corner case):
+ * 2.1) If the tick can be kept stopped (no tick dependencies)
+ * then re-evaluate the next tick and try to keep it stopped
+ * as long as possible.
+ * 2.2) If the tick has dependencies, restart the tick.
*
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
*/
void tick_nohz_idle_exit(void)
{
@@ -1332,12 +1474,12 @@ void tick_nohz_idle_exit(void)
local_irq_disable();
- WARN_ON_ONCE(!ts->inidle);
+ WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
WARN_ON_ONCE(ts->timer_expires_base);
- ts->inidle = 0;
- idle_active = ts->idle_active;
- tick_stopped = ts->tick_stopped;
+ tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
+ idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
+ tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
if (idle_active || tick_stopped)
now = ktime_get();
@@ -1352,69 +1494,47 @@ void tick_nohz_idle_exit(void)
}
/*
- * The nohz low res interrupt handler
+ * In low-resolution mode, the tick handler must be implemented directly
+ * at the clockevent level. hrtimer can't be used instead, because its
+ * infrastructure actually relies on the tick itself as a backend in
+ * low-resolution mode (see hrtimer_run_queues()).
*/
-static void tick_nohz_handler(struct clock_event_device *dev)
+static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- struct pt_regs *regs = get_irq_regs();
- ktime_t now = ktime_get();
dev->next_event = KTIME_MAX;
- tick_sched_do_timer(ts, now);
- tick_sched_handle(ts, regs);
-
- if (unlikely(ts->tick_stopped)) {
- /*
- * The clockevent device is not reprogrammed, so change the
- * clock event device to ONESHOT_STOPPED to avoid spurious
- * interrupts on devices which might not be truly one shot.
- */
- tick_program_event(KTIME_MAX, 1);
- return;
- }
-
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
-static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+static inline void tick_nohz_activate(struct tick_sched *ts)
{
if (!tick_nohz_enabled)
return;
- ts->nohz_mode = mode;
+ tick_sched_flag_set(ts, TS_FLAG_NOHZ);
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
timers_update_nohz();
}
/**
- * tick_nohz_switch_to_nohz - switch to nohz mode
+ * tick_nohz_switch_to_nohz - switch to NOHZ mode
*/
static void tick_nohz_switch_to_nohz(void)
{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- ktime_t next;
-
if (!tick_nohz_enabled)
return;
- if (tick_switch_to_oneshot(tick_nohz_handler))
+ if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
return;
/*
- * Recycle the hrtimer in ts, so we can share the
- * hrtimer_forward with the highres code.
+ * Recycle the hrtimer in 'ts', so we can share the
+ * highres code.
*/
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- /* Get the next period */
- next = tick_init_jiffy_update();
-
- hrtimer_set_expires(&ts->sched_timer, next);
- hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
- tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
+ tick_setup_sched_timer(false);
}
static inline void tick_nohz_irq_enter(void)
@@ -1422,19 +1542,19 @@ static inline void tick_nohz_irq_enter(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
- if (!ts->idle_active && !ts->tick_stopped)
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
return;
now = ktime_get();
- if (ts->idle_active)
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
tick_nohz_stop_idle(ts, now);
/*
- * If all CPUs are idle. We may need to update a stale jiffies value.
+ * If all CPUs are idle we may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
* alive but it might be busy looping with interrupts disabled in some
* rare case (typically stop machine). So we must make sure we have a
* last resort.
*/
- if (ts->tick_stopped)
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
tick_nohz_update_jiffies(now);
}
@@ -1442,12 +1562,12 @@ static inline void tick_nohz_irq_enter(void)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
-static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
+static inline void tick_nohz_activate(struct tick_sched *ts) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * Called from irq_enter to notify about the possible interruption of idle()
+ * Called from irq_enter() to notify about the possible interruption of idle()
*/
void tick_irq_enter(void)
{
@@ -1455,41 +1575,6 @@ void tick_irq_enter(void)
tick_nohz_irq_enter();
}
-/*
- * High resolution timer specific code
- */
-#ifdef CONFIG_HIGH_RES_TIMERS
-/*
- * We rearm the timer until we get disabled by the idle code.
- * Called with interrupts disabled.
- */
-static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
-{
- struct tick_sched *ts =
- container_of(timer, struct tick_sched, sched_timer);
- struct pt_regs *regs = get_irq_regs();
- ktime_t now = ktime_get();
-
- tick_sched_do_timer(ts, now);
-
- /*
- * Do not call, when we are not in irq context and have
- * no valid regs pointer
- */
- if (regs)
- tick_sched_handle(ts, regs);
- else
- ts->next_tick = 0;
-
- /* No need to reprogram if we are in idle or full dynticks mode */
- if (unlikely(ts->tick_stopped))
- return HRTIMER_NORESTART;
-
- hrtimer_forward(timer, now, TICK_NSEC);
-
- return HRTIMER_RESTART;
-}
-
static int sched_skew_tick;
static int __init skew_tick(char *str)
@@ -1502,22 +1587,22 @@ early_param("skew_tick", skew_tick);
/**
* tick_setup_sched_timer - setup the tick emulation timer
+ * @hrtimer: whether to use the hrtimer or not
*/
-void tick_setup_sched_timer(void)
+void tick_setup_sched_timer(bool hrtimer)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- ktime_t now = ktime_get();
- /*
- * Emulate tick processing via per-CPU hrtimers:
- */
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- ts->sched_timer.function = tick_sched_timer;
+ /* Emulate tick processing via per-CPU hrtimers: */
+ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
+ tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
- /* Offset the tick to avert jiffies_lock contention. */
+ /* Offset the tick to avert 'jiffies_lock' contention. */
if (sched_skew_tick) {
u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
@@ -1525,25 +1610,38 @@ void tick_setup_sched_timer(void)
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
- tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
+ hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ else
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ tick_nohz_activate(ts);
}
-#endif /* HIGH_RES_TIMERS */
-#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
-void tick_cancel_sched_timer(int cpu)
+/*
+ * Shut down the tick and make sure the CPU won't try to retake the timekeeping
+ * duty before disabling IRQs in idle for the last time.
+ */
+void tick_sched_timer_dying(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t idle_sleeptime, iowait_sleeptime;
+ unsigned long idle_calls, idle_sleeps;
-# ifdef CONFIG_HIGH_RES_TIMERS
- if (ts->sched_timer.base)
+ /* This must happen before hrtimers are migrated! */
+ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
hrtimer_cancel(&ts->sched_timer);
-# endif
+ idle_sleeptime = ts->idle_sleeptime;
+ iowait_sleeptime = ts->iowait_sleeptime;
+ idle_calls = ts->idle_calls;
+ idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
+ ts->idle_sleeptime = idle_sleeptime;
+ ts->iowait_sleeptime = iowait_sleeptime;
+ ts->idle_calls = idle_calls;
+ ts->idle_sleeps = idle_sleeps;
}
-#endif
/*
* Async notification about clocksource changes
@@ -1567,10 +1665,10 @@ void tick_oneshot_notify(void)
}
/*
- * Check, if a change happened, which makes oneshot possible.
+ * Check if a change happened, which makes oneshot possible.
*
- * Called cyclic from the hrtimer softirq (driven by the timer
- * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
* mode, because high resolution timers are disabled (either compile
* or runtime). Called with interrupts disabled.
*/
@@ -1581,7 +1679,7 @@ int tick_check_oneshot_change(int allow_nohz)
if (!test_and_clear_bit(0, &ts->check_clocks))
return 0;
- if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
+ if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
return 0;
if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..b4a7822f495d 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -14,82 +14,101 @@ struct tick_device {
enum tick_device_mode mode;
};
-enum tick_nohz_mode {
- NOHZ_MODE_INACTIVE,
- NOHZ_MODE_LOWRES,
- NOHZ_MODE_HIGHRES,
-};
+/* The CPU is in the tick idle mode */
+#define TS_FLAG_INIDLE BIT(0)
+/* The idle tick has been stopped */
+#define TS_FLAG_STOPPED BIT(1)
+/*
+ * Indicator that the CPU is actively in the tick idle mode;
+ * it is reset during irq handling phases.
+ */
+#define TS_FLAG_IDLE_ACTIVE BIT(2)
+/* CPU was the last one doing do_timer before going idle */
+#define TS_FLAG_DO_TIMER_LAST BIT(3)
+/* NO_HZ is enabled */
+#define TS_FLAG_NOHZ BIT(4)
+/* High resolution tick mode */
+#define TS_FLAG_HIGHRES BIT(5)
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
+ *
+ * @flags: State flags gathering the TS_FLAG_* features
+ * @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
* @sched_timer: hrtimer to schedule the periodic tick in high
* resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
- * @inidle: Indicator that the CPU is in the tick idle mode
- * @tick_stopped: Indicator that the idle tick has been stopped
- * @idle_active: Indicator that the CPU is actively in the tick idle mode;
- * it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
- * @got_idle_tick: Tick timer function has run with @inidle set
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_sleeptime_seq: sequence counter for data consistency
+ * @idle_entrytime: Time when the idle call was entered
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
+ /* Common flags */
+ unsigned long flags;
- unsigned int inidle : 1;
- unsigned int tick_stopped : 1;
- unsigned int idle_active : 1;
- unsigned int do_timer_last : 1;
- unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+ unsigned int got_idle_tick;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
-extern void tick_setup_sched_timer(void);
-#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
-extern void tick_cancel_sched_timer(int cpu);
+extern void tick_setup_sched_timer(bool hrtimer);
+#if defined CONFIG_TICK_ONESHOT
+extern void tick_sched_timer_dying(int cpu);
#else
-static inline void tick_cancel_sched_timer(int cpu) { }
+static inline void tick_sched_timer_dying(int cpu) { }
#endif
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/kernel/time/time.c b/kernel/time/time.c
index f4198af60fee..0ba8e3c50d62 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
-/*
- * Convert jiffies to milliseconds and back.
+/**
+ * jiffies_to_msecs - Convert jiffies to milliseconds
+ * @j: jiffies value
*
* Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
+ * two most common HZ cases.
+ *
+ * Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
@@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
+/**
+ * jiffies_to_usecs - Convert jiffies to microseconds
+ * @j: jiffies value
+ *
+ * Return: microseconds value
+ */
unsigned int jiffies_to_usecs(const unsigned long j)
{
/*
@@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_usecs);
-/*
+/**
* mktime64 - Converts date to seconds.
+ * @year0: year to convert
+ * @mon0: month to convert
+ * @day: day to convert
+ * @hour: hour to convert
+ * @min: minute to convert
+ * @sec: second to convert
+ *
* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
@@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs);
*
* An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
* tomorrow - (allowable under ISO 8601) is supported.
+ *
+ * Return: seconds since the epoch time for the given input date
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
@@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval);
* Set seconds and nanoseconds field of a timespec variable and
* normalize to the timespec storage format
*
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
+ * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
@@ -501,7 +518,7 @@ EXPORT_SYMBOL(set_normalized_timespec64);
* ns_to_timespec64 - Convert nanoseconds to timespec64
* @nsec: the nanoseconds value to be converted
*
- * Returns the timespec64 representation of the nsec parameter.
+ * Return: the timespec64 representation of the nsec parameter.
*/
struct timespec64 ns_to_timespec64(s64 nsec)
{
@@ -539,15 +556,17 @@ EXPORT_SYMBOL(ns_to_timespec64);
* - all other values are converted to jiffies by either multiplying
* the input value by a factor or dividing it with a factor and
* handling any 32-bit overflows.
- * for the details see __msecs_to_jiffies()
+ * for the details see _msecs_to_jiffies()
*
- * __msecs_to_jiffies() checks for the passed in value being a constant
+ * msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
* The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
+ *
+ * Return: jiffies value
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
@@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
}
EXPORT_SYMBOL(__msecs_to_jiffies);
+/**
+ * __usecs_to_jiffies: - convert microseconds to jiffies
+ * @u: time in milliseconds
+ *
+ * Return: jiffies value
+ */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
@@ -568,7 +593,10 @@ unsigned long __usecs_to_jiffies(const unsigned int u)
}
EXPORT_SYMBOL(__usecs_to_jiffies);
-/*
+/**
+ * timespec64_to_jiffies - convert a timespec64 value to jiffies
+ * @value: pointer to &struct timespec64
+ *
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundaries. I.e. the line:
@@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
+ *
+ * Return: jiffies value
*/
-
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
@@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value)
}
EXPORT_SYMBOL(timespec64_to_jiffies);
+/**
+ * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
+ * @jiffies: jiffies value
+ * @value: pointer to &struct timespec64
+ */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
@@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
+
+/**
+ * jiffies_to_clock_t - Convert jiffies to clock_t
+ * @x: jiffies value
+ *
+ * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
+ */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x)
}
EXPORT_SYMBOL(jiffies_to_clock_t);
+/**
+ * clock_t_to_jiffies - Convert clock_t to jiffies
+ * @x: clock_t value
+ *
+ * Return: clock_t value converted to jiffies
+ */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
@@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
}
EXPORT_SYMBOL(clock_t_to_jiffies);
+/**
+ * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
+ * @x: jiffies_64 value
+ *
+ * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x)
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);
+/**
+ * nsec_to_clock_t - Convert nsec value to clock_t
+ * @x: nsec value
+ *
+ * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
@@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+/**
+ * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
+ * @j: jiffies64 value
+ *
+ * Return: nanoseconds value
+ */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
@@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j)
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
+/**
+ * jiffies64_to_msecs - Convert jiffies64 to milliseconds
+ * @j: jiffies64 value
+ *
+ * Return: milliseconds value
+ */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
@@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies64 value
*/
u64 nsecs_to_jiffies64(u64 n)
{
@@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies value
*/
unsigned long nsecs_to_jiffies(u64 n)
{
@@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n)
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
-/*
- * Add two timespec64 values and do a safety check for overflow.
+/**
+ * timespec64_add_safe - Add two timespec64 values and do a safety check
+ * for overflow.
+ * @lhs: first (left) timespec64 to add
+ * @rhs: second (right) timespec64 to add
+ *
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
+ *
+ * Return: sum of @lhs + @rhs
*/
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs)
@@ -777,7 +858,17 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+EXPORT_SYMBOL_GPL(timespec64_add_safe);
+/**
+ * get_timespec64 - get user's time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's time value as &struct __kernel_timespec
+ *
+ * Handles compat or 32-bit modes.
+ *
+ * Return: 0 on success or negative errno on error
+ */
int get_timespec64(struct timespec64 *ts,
const struct __kernel_timespec __user *uts)
{
@@ -801,6 +892,14 @@ int get_timespec64(struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(get_timespec64);
+/**
+ * put_timespec64 - convert timespec64 value to __kernel_timespec format and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct __kernel_timespec
+ *
+ * Return: 0 on success or negative errno on error
+ */
int put_timespec64(const struct timespec64 *ts,
struct __kernel_timespec __user *uts)
{
@@ -839,6 +938,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64,
return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
}
+/**
+ * get_old_timespec32 - get user's old-format time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's old-format time value (&struct old_timespec32)
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: 0 on success or negative errno on error
+ */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -848,6 +956,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
}
EXPORT_SYMBOL_GPL(get_old_timespec32);
+/**
+ * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct old_timespec32
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: 0 on success or negative errno on error
+ */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -857,6 +975,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(put_old_timespec32);
+/**
+ * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
+ * @it: destination &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: 0 on success or negative errno on error
+ */
int get_itimerspec64(struct itimerspec64 *it,
const struct __kernel_itimerspec __user *uit)
{
@@ -872,6 +997,14 @@ int get_itimerspec64(struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(get_itimerspec64);
+/**
+ * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
+ * and copy the latter to userspace
+ * @it: input &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: 0 on success or negative errno on error
+ */
int put_itimerspec64(const struct itimerspec64 *it,
struct __kernel_itimerspec __user *uit)
{
@@ -887,6 +1020,13 @@ int put_itimerspec64(const struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(put_itimerspec64);
+/**
+ * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
+ * @its: destination &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: 0 on success or negative errno on error
+ */
int get_old_itimerspec32(struct itimerspec64 *its,
const struct old_itimerspec32 __user *uits)
{
@@ -898,6 +1038,14 @@ int get_old_itimerspec32(struct itimerspec64 *its,
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);
+/**
+ * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
+ * old_itimerspec32 and copy the latter to userspace
+ * @its: input &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: 0 on success or negative errno on error
+ */
int put_old_itimerspec32(const struct itimerspec64 *its,
struct old_itimerspec32 __user *uits)
{
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 831e8e779ace..2889763165e5 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
days = div_s64(secs, 86400);
- #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \
year, month, mdday, yday, days
KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
@@ -86,7 +86,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
}
static struct kunit_case time_test_cases[] = {
- KUNIT_CASE(time64_to_tm_test_date_range),
+ KUNIT_CASE_SLOW(time64_to_tm_test_date_range),
{}
};
@@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = {
};
kunit_test_suite(time_test_suite);
+MODULE_DESCRIPTION("time unit test suite");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
#include <linux/timecounter.h>
void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
+ struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5579ead449f2..3ec3daa4acab 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -6,6 +6,7 @@
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -25,13 +26,16 @@
#include <linux/audit.h>
#include <linux/random.h>
+#include <vdso/auxclock.h>
+
#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"
#define TK_CLEAR_NTP (1 << 0)
-#define TK_MIRROR (1 << 1)
-#define TK_CLOCK_WAS_SET (1 << 2)
+#define TK_CLOCK_WAS_SET (1 << 1)
+
+#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)
enum timekeeping_adv_mode {
/* Update timekeeper when a tick has passed */
@@ -41,20 +45,49 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
-DEFINE_RAW_SPINLOCK(timekeeper_lock);
-
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
-static struct {
+struct tk_data {
seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
-} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
-};
+ struct timekeeper shadow_timekeeper;
+ raw_spinlock_t lock;
+} ____cacheline_aligned;
+
+static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];
+
+/* The core timekeeper */
+#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
+#else
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return false;
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return false;
+}
+#endif
-static struct timekeeper shadow_timekeeper;
+static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
+{
+ tk->offs_aux = offs;
+ tk->monotonic_to_aux = ktime_to_timespec64(offs);
+}
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -114,6 +147,46 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static __init void tk_aux_setup(void);
+static void tk_aux_update_clocksource(void);
+static void tk_aux_advance(void);
+#else
+static inline void tk_aux_setup(void) { }
+static inline void tk_aux_update_clocksource(void) { }
+static inline void tk_aux_advance(void) { }
+#endif
+
+unsigned long timekeeper_lock_irqsave(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
+ return flags;
+}
+
+void timekeeper_unlock_irqrestore(unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
+}
+
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -135,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec could be adjusted by a small negative
+ * amount when the multiplication factor of the clock is adjusted, which
+ * could cause the coarse clocks to go slightly backwards. See
+ * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
+ * clockids which only is updated when the clock has been set or we have
+ * accumulated time.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_update_coarse_nsecs(tk);
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -146,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk_update_coarse_nsecs(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -161,13 +259,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
tk->wall_to_monotonic = wtm;
set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
- tk->offs_real = timespec64_to_ktime(tmp);
- tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
+ /* Paired with READ_ONCE() in ktime_mono_to_any() */
+ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
+ WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
- tk->offs_boot = ktime_add(tk->offs_boot, delta);
+ /* Paired with READ_ONCE() in ktime_mono_to_any() */
+ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta));
/*
* Timespec representation for VDSO update to avoid 64bit division
* on every update.
@@ -184,7 +284,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
* the wrong clocksource is passed to the wrong read function.
- * This isn't necessary to use when holding the timekeeper_lock or doing
+ * This isn't necessary to use when holding the tk_core.lock or doing
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
@@ -195,106 +295,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr)
return clock->read(clock);
}
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-
-static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-
- u64 max_cycles = tk->tkr_mono.clock->max_cycles;
- const char *name = tk->tkr_mono.clock->name;
-
- if (offset > max_cycles) {
- printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
- offset, name, max_cycles);
- printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
- } else {
- if (offset > (max_cycles >> 1)) {
- printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
- offset, name, max_cycles >> 1);
- printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
- }
- }
-
- if (tk->underflow_seen) {
- if (jiffies - tk->last_warning > WARNING_FREQ) {
- printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
- printk_deferred(" Your kernel is probably still fine.\n");
- tk->last_warning = jiffies;
- }
- tk->underflow_seen = 0;
- }
-
- if (tk->overflow_seen) {
- if (jiffies - tk->last_warning > WARNING_FREQ) {
- printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
- printk_deferred(" Your kernel is probably still fine.\n");
- tk->last_warning = jiffies;
- }
- tk->overflow_seen = 0;
- }
-}
-
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- u64 now, last, mask, max, delta;
- unsigned int seq;
-
- /*
- * Since we're called holding a seqcount, the data may shift
- * under us while we're doing the calculation. This can cause
- * false positives, since we'd note a problem but throw the
- * results away. So nest another seqcount here to atomically
- * grab the points we are checking with.
- */
- do {
- seq = read_seqcount_begin(&tk_core.seq);
- now = tk_clock_read(tkr);
- last = tkr->cycle_last;
- mask = tkr->mask;
- max = tkr->clock->max_cycles;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- delta = clocksource_delta(now, last, mask);
-
- /*
- * Try to catch underflows by checking if we are seeing small
- * mask-relative negative values.
- */
- if (unlikely((~delta & mask) < (mask >> 3))) {
- tk->underflow_seen = 1;
- delta = 0;
- }
-
- /* Cap delta value to the max_cycles values to avoid mult overflows */
- if (unlikely(delta > max)) {
- tk->overflow_seen = 1;
- delta = tkr->clock->max_cycles;
- }
-
- return delta;
-}
-#else
-static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-}
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
-{
- u64 cycle_now, delta;
-
- /* read clocksource */
- cycle_now = tk_clock_read(tkr);
-
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
-
- return delta;
-}
-#endif
-
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
@@ -370,32 +370,38 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
}
/* Timekeeper helper functions. */
-
-static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
+static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
- u64 nsec;
-
- nsec = delta * tkr->mult + tkr->xtime_nsec;
- nsec >>= tkr->shift;
-
- return nsec;
+ return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
- u64 delta;
+ /* Calculate the delta since the last update_wall_time() */
+ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
+
+ /*
+ * This detects both negative motion and the case where the delta
+ * overflows the multiplication with tkr->mult.
+ */
+ if (unlikely(delta > tkr->clock->max_cycles)) {
+ /*
+ * Handle clocksource inconsistency between CPUs to prevent
+ * time from going backwards by checking for the MSB of the
+ * mask being set in the delta.
+ */
+ if (delta & ~(mask >> 1))
+ return tkr->xtime_nsec >> tkr->shift;
+
+ return delta_to_ns_safe(tkr, delta);
+ }
- delta = timekeeping_get_delta(tkr);
- return timekeeping_delta_to_ns(tkr, delta);
+ return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
- u64 delta;
-
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
+ return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}
/**
@@ -406,7 +412,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * Employ the latch technique; see @raw_write_seqcount_latch.
+ * Employ the latch technique; see @write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
@@ -419,24 +425,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch_begin(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
-}
-
-static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
-{
- u64 delta, cycles = tk_clock_read(tkr);
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
+ write_seqcount_latch_end(&tkf->seq);
}
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -446,10 +446,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount_latch(&tkf->seq);
+ seq = read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += fast_tk_get_delta_ns(tkr);
+ now += timekeeping_get_ns(tkr);
} while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
@@ -520,13 +520,13 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
* timekeeping_inject_sleeptime64()
* __timekeeping_inject_sleeptime(tk, delta);
* timestamp();
- * timekeeping_update(tk, TK_CLEAR_NTP...);
+ * timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
*
* (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
* partially updated. Since the tk->offs_boot update is a rare event, this
* should be a rare occurrence which postprocessing should be able to handle.
*
- * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns()
+ * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
* apply as well.
*/
u64 notrace ktime_get_boot_fast_ns(void)
@@ -554,91 +554,30 @@ u64 notrace ktime_get_tai_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
-static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ *
+ * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
+ */
+u64 ktime_get_real_fast_ns(void)
{
+ struct tk_fast *tkf = &tk_fast_mono;
struct tk_read_base *tkr;
- u64 basem, baser, delta;
+ u64 baser, delta;
unsigned int seq;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
- delta = fast_tk_get_delta_ns(tkr);
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
+ delta = timekeeping_get_ns(tkr);
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
- if (mono)
- *mono = basem + delta;
return baser + delta;
}
-
-/**
- * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
- *
- * See ktime_get_fast_ns() for documentation of the time stamp ordering.
- */
-u64 ktime_get_real_fast_ns(void)
-{
- return __ktime_get_real_fast(&tk_fast_mono, NULL);
-}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
- * ktime_get_fast_timestamps: - NMI safe timestamps
- * @snapshot: Pointer to timestamp storage
- *
- * Stores clock monotonic, boottime and realtime timestamps.
- *
- * Boot time is a racy access on 32bit systems if the sleep time injection
- * happens late during resume and not in timekeeping_resume(). That could
- * be avoided by expanding struct tk_read_base with boot offset for 32bit
- * and adding more overhead to the update. As this is a hard to observe
- * once per resume event which can be filtered with reasonable effort using
- * the accurate mono/real timestamps, it's probably not worth the trouble.
- *
- * Aside of that it might be possible on 32 and 64 bit to observe the
- * following when the sleep time injection happens late:
- *
- * CPU 0 CPU 1
- * timekeeping_resume()
- * ktime_get_fast_timestamps()
- * mono, real = __ktime_get_real_fast()
- * inject_sleep_time()
- * update boot offset
- * boot = mono + bootoffset;
- *
- * That means that boot time already has the sleep time adjustment, but
- * real time does not. On the next readout both are in sync again.
- *
- * Preventing this for 64bit is not really feasible without destroying the
- * careful cache layout of the timekeeper because the sequence count and
- * struct tk_read_base would then need two cache lines instead of one.
- *
- * Access to the time keeper clock source is disabled across the innermost
- * steps of suspend/resume. The accessors still work, but the timestamps
- * are frozen until time keeping is resumed which happens very early.
- *
- * For regular suspend/resume there is no observable difference vs. sched
- * clock, but it might affect some of the nasty low level debug printks.
- *
- * OTOH, access to sched clock is not guaranteed across suspend/resume on
- * all systems either so it depends on the hardware in use.
- *
- * If that turns out to be a real problem then this could be mitigated by
- * using sched clock in a similar way as during early boot. But it's not as
- * trivial as on early boot because it needs some careful protection
- * against the clock monotonic timestamp jumping backwards on resume.
- */
-void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
-
- snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
- snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
-}
-
-/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
*
@@ -679,13 +618,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
int ret;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
update_pvclock_gtod(tk, true);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
@@ -698,14 +635,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-
- return ret;
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
@@ -714,13 +645,25 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
- tk->next_leap_ktime = ntp_get_next_leap();
+ tk->next_leap_ktime = ntp_get_next_leap(tk->id);
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
/*
+ * Leap state update for both shadow and the real timekeeper
+ * Separate to spare a full memcpy() of the timekeeper.
+ */
+static void tk_update_leap_state_all(struct tk_data *tkd)
+{
+ write_seqcount_begin(&tkd->seq);
+ tk_update_leap_state(&tkd->shadow_timekeeper);
+ tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
+ write_seqcount_end(&tkd->seq);
+}
+
+/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -753,34 +696,62 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
-/* must hold timekeeper_lock */
-static void timekeeping_update(struct timekeeper *tk, unsigned int action)
+/*
+ * Restore the shadow timekeeper from the real timekeeper.
+ */
+static void timekeeping_restore_shadow(struct tk_data *tkd)
+{
+ lockdep_assert_held(&tkd->lock);
+ memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
+}
+
+static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+
+ lockdep_assert_held(&tkd->lock);
+
+ /*
+ * Block out readers before running the updates below because that
+ * updates VDSO and other time related infrastructure. Not blocking
+ * the readers might let a reader see time going backwards when
+ * reading from the VDSO after the VDSO update and then reading in
+ * the kernel from the timekeeper before that got updated.
+ */
+ write_seqcount_begin(&tkd->seq);
+
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
- ntp_clear();
+ ntp_clear(tk->id);
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ if (tk->id == TIMEKEEPER_CORE) {
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
- tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ } else if (tk_is_aux(tk)) {
+ vdso_time_update_aux(tk);
+ }
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
+
/*
- * The mirroring of the data to the shadow-timekeeper needs
- * to happen last here to ensure we don't over-write the
- * timekeeper structure on the next update with stale data
+ * Update the real timekeeper.
+ *
+ * We could avoid this memcpy() by switching pointers, but that has
+ * the downside that the reader side does not longer benefit from
+ * the cacheline optimized data layout of the timekeeper and requires
+ * another indirection.
*/
- if (action & TK_MIRROR)
- memcpy(&shadow_timekeeper, &tk_core.timekeeper,
- sizeof(tk_core.timekeeper));
+ memcpy(&tkd->timekeeper, tk, sizeof(*tk));
+ write_seqcount_end(&tkd->seq);
}
/**
@@ -796,14 +767,21 @@ static void timekeeping_forward_now(struct timekeeper *tk)
u64 cycle_now, delta;
cycle_now = tk_clock_read(&tk->tkr_mono);
- delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
- tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+ while (delta > 0) {
+ u64 max = tk->tkr_mono.clock->max_cycles;
+ u64 incr = delta < max ? delta : max;
- tk_normalize_xtime(tk);
+ tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
+ tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
+ tk_normalize_xtime(tk);
+ delta -= incr;
+ }
+ tk_update_coarse_nsecs(tk);
}
/**
@@ -900,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -909,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -928,6 +906,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
unsigned int seq;
ktime_t tconv;
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ /*
+ * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
+ * tk_update_sleep_time().
+ */
+ return ktime_add(tmono, READ_ONCE(*offset));
+ }
+
do {
seq = read_seqcount_begin(&tk_core.seq);
tconv = ktime_add(tmono, *offset);
@@ -1037,9 +1023,14 @@ time64_t ktime_get_real_seconds(void)
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
- * __ktime_get_real_seconds - The same as ktime_get_real_seconds
- * but without the sequence counter protect. This internal function
- * is called just when timekeeping lock is already held.
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
+ *
+ * The same as ktime_get_real_seconds() but without the sequence counter
+ * protection. This function is used in restricted contexts like the x86 MCE
+ * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
+ * completed modification and only to be used for such critical contexts.
+ *
+ * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
@@ -1058,6 +1049,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
unsigned int seq;
ktime_t base_raw;
ktime_t base_real;
+ ktime_t base_boot;
u64 nsec_raw;
u64 nsec_real;
u64 now;
@@ -1072,6 +1064,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
+ base_boot = ktime_add(tk->tkr_mono.base,
+ tk_core.timekeeper.offs_boot);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
@@ -1079,6 +1073,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+ systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);
@@ -1180,17 +1175,121 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
}
/*
- * cycle_between - true if test occurs chronologically between before and after
+ * timestamp_in_interval - true if ts is chronologically in [start, end]
+ *
+ * True if ts occurs chronologically at or after start, and before or at end.
*/
-static bool cycle_between(u64 before, u64 test, u64 after)
+static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
- if (test > before && test < after)
+ if (ts >= start && ts <= end)
return true;
- if (test < before && before > after)
+ if (start > end && (ts >= start || ts <= end))
return true;
return false;
}
+static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
+{
+ u64 rem, res;
+
+ if (!numerator || !denominator)
+ return false;
+
+ res = div64_u64_rem(*val, denominator, &rem) * numerator;
+ *val = res + div_u64(rem * numerator, denominator);
+ return true;
+}
+
+static bool convert_base_to_cs(struct system_counterval_t *scv)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+ u32 num, den;
+
+ /* The timestamp was taken from the time keeper clock source */
+ if (cs->id == scv->cs_id)
+ return true;
+
+ /*
+ * Check whether cs_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != scv->cs_id)
+ return false;
+
+ num = scv->use_nsecs ? cs->freq_khz : base->numerator;
+ den = scv->use_nsecs ? USEC_PER_SEC : base->denominator;
+
+ if (!convert_clock(&scv->cycles, num, den))
+ return false;
+
+ scv->cycles += base->offset;
+ return true;
+}
+
+static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+
+ /*
+ * Check whether base_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != base_id)
+ return false;
+
+ *cycles -= base->offset;
+ if (!convert_clock(cycles, base->denominator, base->numerator))
+ return false;
+ return true;
+}
+
+static bool convert_ns_to_cs(u64 *delta)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+
+ if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
+ return false;
+
+ *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);
+ return true;
+}
+
+/**
+ * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
+ * @treal: CLOCK_REALTIME timestamp to convert
+ * @base_id: base clocksource id
+ * @cycles: pointer to store the converted base clock timestamp
+ *
+ * Converts a supplied, future realtime clock value to the corresponding base clock value.
+ *
+ * Return: true if the conversion is successful, false otherwise.
+ */
+bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u64 delta;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ if ((u64)treal < tk->tkr_mono.base_real)
+ return false;
+ delta = (u64)treal - tk->tkr_mono.base_real;
+ if (!convert_ns_to_cs(&delta))
+ return false;
+ *cycles = tk->tkr_mono.cycle_last + delta;
+ if (!convert_cs_to_base(cycles, base_id))
+ return false;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
+
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
* @get_time_fn: Callback to get simultaneous device time and
@@ -1210,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
@@ -1232,11 +1331,12 @@ int get_device_system_crosststamp(int (*get_time_fn)
return ret;
/*
- * Verify that the clocksource associated with the captured
- * system counter value is the same as the currently installed
- * timekeeper clocksource
+ * Verify that the clocksource ID associated with the captured
+ * system counter value is the same as for the currently
+ * installed timekeeper clocksource
*/
- if (tk->tkr_mono.clock != system_counterval.cs)
+ if (system_counterval.cs_id == CSID_GENERIC ||
+ !convert_base_to_cs(&system_counterval))
return -ENODEV;
cycles = system_counterval.cycles;
@@ -1246,7 +1346,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
*/
now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
- if (!cycle_between(interval_start, cycles, now)) {
+ if (!timestamp_in_interval(interval_start, now, cycles)) {
clock_was_set_seq = tk->clock_was_set_seq;
cs_was_changed_seq = tk->cs_was_changed_seq;
cycles = interval_start;
@@ -1259,10 +1359,8 @@ int get_device_system_crosststamp(int (*get_time_fn)
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
- nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
- system_counterval.cycles);
- nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
- system_counterval.cycles);
+ nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
+ nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
} while (read_seqcount_retry(&tk_core.seq, seq));
xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
@@ -1277,13 +1375,13 @@ int get_device_system_crosststamp(int (*get_time_fn)
bool discontinuity;
/*
- * Check that the counter value occurs after the provided
+ * Check that the counter value is not before the provided
* history reference and that the history doesn't cross a
* clocksource change
*/
if (!history_begin ||
- !cycle_between(history_begin->cycles,
- system_counterval.cycles, cycles) ||
+ !timestamp_in_interval(history_begin->cycles,
+ cycles, system_counterval.cycles) ||
history_begin->cs_was_changed_seq != cs_was_changed_seq)
return -EINVAL;
partial_history_cycles = cycles - system_counterval.cycles;
@@ -1304,6 +1402,30 @@ int get_device_system_crosststamp(int (*get_time_fn)
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
+ * timekeeping_clocksource_has_base - Check whether the current clocksource
+ * is based on given a base clock
+ * @id: base clocksource ID
+ *
+ * Note: The return value is a snapshot which can become invalid right
+ * after the function returns.
+ *
+ * Return: true if the timekeeper clocksource has a base clock with @id,
+ * false otherwise
+ */
+bool timekeeping_clocksource_has_base(enum clocksource_ids id)
+{
+ /*
+ * This is a snapshot, so no point in using the sequence
+ * count. Just prevent the compiler from re-evaluating @base as the
+ * clocksource might change concurrently.
+ */
+ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);
+
+ return base ? base->id == id : false;
+}
+EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
+
+/**
* do_settimeofday64 - Sets the time of day.
* @ts: pointer to the timespec64 variable containing the new time
*
@@ -1311,89 +1433,102 @@ EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
*/
int do_settimeofday64(const struct timespec64 *ts)
{
- struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
- unsigned long flags;
- int ret = 0;
if (!timespec64_valid_settod(ts))
return -EINVAL;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
-
- xt = tk_xtime(tk);
- ts_delta = timespec64_sub(*ts, xt);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
- ret = -EINVAL;
- goto out;
- }
+ timekeeping_forward_now(tks);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
+ xt = tk_xtime(tks);
+ ts_delta = timespec64_sub(*ts, xt);
- tk_set_xtime(tk, ts);
-out:
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
+ timekeeping_restore_shadow(&tk_core);
+ return -EINVAL;
+ }
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
+ tk_set_xtime(tks, ts);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
- if (!ret) {
- audit_tk_injoffset(ts_delta);
- add_device_randomness(ts, sizeof(*ts));
- }
-
- return ret;
+ audit_tk_injoffset(ts_delta);
+ add_device_randomness(ts, sizeof(*ts));
+ return 0;
}
EXPORT_SYMBOL(do_settimeofday64);
+static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
+{
+ return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
+}
+
/**
- * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * __timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tkd: Pointer to the timekeeper to modify
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
-static int timekeeping_inject_offset(const struct timespec64 *ts)
+static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
struct timespec64 tmp;
- int ret = 0;
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ timekeeping_forward_now(tks);
- timekeeping_forward_now(tk);
+ if (timekeeper_is_core_tk(tks)) {
+ /* Make sure the proposed value is valid */
+ tmp = timespec64_add(tk_xtime(tks), *ts);
+ if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
+ !timespec64_valid_settod(&tmp)) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+
+ tk_xtime_add(tks, ts);
+ tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
+ } else {
+ struct tk_read_base *tkr_mono = &tks->tkr_mono;
+ ktime_t now, offs;
- /* Make sure the proposed value is valid */
- tmp = timespec64_add(tk_xtime(tk), *ts);
- if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
- !timespec64_valid_settod(&tmp)) {
- ret = -EINVAL;
- goto error;
+ /* Get the current time */
+ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
+ /* Add the relative offset change */
+ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
+
+ /* Prevent that the resulting time becomes negative */
+ if (ktime_add(now, offs) < 0) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+ tk_update_aux_offs(tks, offs);
}
- tk_xtime_add(tk, ts);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ return 0;
+}
-error: /* even if we error out, we forwarded the time, so call update */
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+static int timekeeping_inject_offset(const struct timespec64 *ts)
+{
+ int ret;
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
+ ret = __timekeeping_inject_offset(&tk_core, ts);
/* Signal hrtimers about time change */
- clock_was_set(CLOCK_SET_WALL);
-
+ if (!ret)
+ clock_was_set(CLOCK_SET_WALL);
return ret;
}
@@ -1447,43 +1582,36 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
*/
static int change_clocksource(void *data)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *new, *old = NULL;
- unsigned long flags;
- bool change = false;
-
- new = (struct clocksource *) data;
+ struct clocksource *new = data, *old = NULL;
/*
- * If the cs is in module, get a module reference. Succeeds
- * for built-in code (owner == NULL) as well.
+ * If the clocksource is in a module, get a module reference.
+ * Succeeds for built-in code (owner == NULL) as well. Abort if the
+ * reference can't be acquired.
*/
- if (try_module_get(new->owner)) {
- if (!new->enable || new->enable(new) == 0)
- change = true;
- else
- module_put(new->owner);
- }
+ if (!try_module_get(new->owner))
+ return 0;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ /* Abort if the device can't be enabled */
+ if (new->enable && new->enable(new) != 0) {
+ module_put(new->owner);
+ return 0;
+ }
- timekeeping_forward_now(tk);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- if (change) {
- old = tk->tkr_mono.clock;
- tk_setup_internals(tk, new);
+ timekeeping_forward_now(tks);
+ old = tks->tkr_mono.clock;
+ tk_setup_internals(tks, new);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
}
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ tk_aux_update_clocksource();
if (old) {
if (old->disable)
old->disable(old);
-
module_put(old->owner);
}
@@ -1532,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id: POSIX clock ID of the clock to read
+ * @ts: Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+ /* Invalidate time stamp */
+ ts->tv_sec = -1;
+ ts->tv_nsec = 0;
+
+ switch (id) {
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC_RAW:
+ ktime_get_raw_ts64(ts);
+ return;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ ktime_get_aux_ts64(id, ts);
+ return;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1608,6 +1769,14 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
+{
+ raw_spin_lock_init(&tkd->lock);
+ seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+ tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+ tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
+}
+
/*
* Flag reflecting whether timekeeping_resume() has injected sleeptime.
*
@@ -1632,9 +1801,11 @@ static bool persistent_clock_exists;
void __init timekeeping_init(void)
{
struct timespec64 wall_time, boot_offset, wall_to_mono;
- struct timekeeper *tk = &tk_core.timekeeper;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- unsigned long flags;
+
+ tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
+ tk_aux_setup();
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -1654,24 +1825,21 @@ void __init timekeeping_init(void)
*/
wall_to_mono = timespec64_sub(boot_offset, wall_time);
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+
ntp_init();
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
- tk_setup_internals(tk, clock);
+ tk_setup_internals(tks, clock);
- tk_set_xtime(tk, &wall_time);
- tk->raw_sec = 0;
+ tk_set_xtime(tks, &wall_time);
+ tks->raw_sec = 0;
- tk_set_wall_to_mono(tk, wall_to_mono);
+ tk_set_wall_to_mono(tks, wall_to_mono);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
}
/* time in seconds when suspend began for persistent clock */
@@ -1749,22 +1917,14 @@ bool timekeeping_rtc_skipsuspend(void)
*/
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- suspend_timing_needed = false;
+ scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- timekeeping_forward_now(tk);
-
- __timekeeping_inject_sleeptime(tk, delta);
-
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ suspend_timing_needed = false;
+ timekeeping_forward_now(tks);
+ __timekeeping_inject_sleeptime(tks, delta);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
@@ -1776,20 +1936,19 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
*/
void timekeeping_resume(void)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->tkr_mono.clock;
- unsigned long flags;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ struct clocksource *clock = tks->tkr_mono.clock;
struct timespec64 ts_new, ts_delta;
- u64 cycle_now, nsec;
bool inject_sleeptime = false;
+ u64 cycle_now, nsec;
+ unsigned long flags;
read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
/*
* After system resumes, we need to calculate the suspended time and
@@ -1803,7 +1962,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk_clock_read(&tk->tkr_mono);
+ cycle_now = tk_clock_read(&tks->tkr_mono);
nsec = clocksource_stop_suspend_timing(clock, cycle_now);
if (nsec > 0) {
ts_delta = ns_to_timespec64(nsec);
@@ -1815,18 +1974,17 @@ void timekeeping_resume(void)
if (inject_sleeptime) {
suspend_timing_needed = false;
- __timekeeping_inject_sleeptime(tk, &ts_delta);
+ __timekeeping_inject_sleeptime(tks, &ts_delta);
}
/* Re-base the last cycle value */
- tk->tkr_mono.cycle_last = cycle_now;
- tk->tkr_raw.cycle_last = cycle_now;
+ tks->tkr_mono.cycle_last = cycle_now;
+ tks->tkr_raw.cycle_last = cycle_now;
- tk->ntp_error = 0;
+ tks->ntp_error = 0;
timekeeping_suspended = 0;
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
touch_softlockup_watchdog();
@@ -1836,13 +1994,18 @@ void timekeeping_resume(void)
timerfd_resume();
}
+static void timekeeping_syscore_resume(void *data)
+{
+ timekeeping_resume();
+}
+
int timekeeping_suspend(void)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
- struct timespec64 delta, delta_delta;
- static struct timespec64 old_delta;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ struct timespec64 delta, delta_delta;
+ static struct timespec64 old_delta;
struct clocksource *curr_clock;
+ unsigned long flags;
u64 cycle_now;
read_persistent_clock64(&timekeeping_suspend_time);
@@ -1857,9 +2020,8 @@ int timekeeping_suspend(void)
suspend_timing_needed = true;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
- timekeeping_forward_now(tk);
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
+ timekeeping_forward_now(tks);
timekeeping_suspended = 1;
/*
@@ -1867,8 +2029,8 @@ int timekeeping_suspend(void)
* just read from the current clocksource. Save this to potentially
* use in suspend timing.
*/
- curr_clock = tk->tkr_mono.clock;
- cycle_now = tk->tkr_mono.cycle_last;
+ curr_clock = tks->tkr_mono.clock;
+ cycle_now = tks->tkr_mono.cycle_last;
clocksource_start_suspend_timing(curr_clock, cycle_now);
if (persistent_clock_exists) {
@@ -1878,7 +2040,7 @@ int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
delta_delta = timespec64_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
@@ -1893,10 +2055,9 @@ int timekeeping_suspend(void)
}
}
- timekeeping_update(tk, TK_MIRROR);
- halt_fast_timekeeper(tk);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, 0);
+ halt_fast_timekeeper(tks);
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
tick_suspend();
clocksource_suspend();
@@ -1905,15 +2066,24 @@ int timekeeping_suspend(void)
return 0;
}
+static int timekeeping_syscore_suspend(void *data)
+{
+ return timekeeping_suspend();
+}
+
/* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_syscore_resume,
+ .suspend = timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+ .ops = &timekeeping_syscore_ops,
};
static int __init timekeeping_init_ops(void)
{
- register_syscore_ops(&timekeeping_syscore_ops);
+ register_syscore(&timekeeping_syscore);
return 0;
}
device_initcall(timekeeping_init_ops);
@@ -2001,16 +2171,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
+ u64 ntp_tl = ntp_tick_length(tk->id);
u32 mult;
/*
* Determine the multiplier from the current NTP tick length.
* Avoid expensive division when the tick length doesn't change.
*/
- if (likely(tk->ntp_tick == ntp_tick_length())) {
+ if (likely(tk->ntp_tick == ntp_tl)) {
mult = tk->tkr_mono.mult - tk->ntp_err_mult;
} else {
- tk->ntp_tick = ntp_tick_length();
+ tk->ntp_tick = ntp_tl;
mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
tk->xtime_remainder, tk->cycle_interval);
}
@@ -2081,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
}
/* Figure out if its a leap sec and apply if needed */
- leap = second_overflow(tk->xtime_sec);
+ leap = second_overflow(tk->id, tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
@@ -2147,30 +2318,25 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
- struct timekeeper *real_tk = &tk_core.timekeeper;
- struct timekeeper *tk = &shadow_timekeeper;
- u64 offset;
- int shift = 0, maxshift;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+ struct timekeeper *real_tk = &tkd->timekeeper;
unsigned int clock_set = 0;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ int shift = 0, maxshift;
+ u64 offset, orig_offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
- goto out;
+ return false;
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
- tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
-
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
+ orig_offset = offset;
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
- goto out;
-
- /* Do some additional sanity checking */
- timekeeping_check_update(tk, offset);
+ return false;
/*
* With NO_HZ we may have to accumulate many cycle_intervals
@@ -2183,11 +2349,10 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift,
- &clock_set);
+ offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
if (offset < tk->cycle_interval<<shift)
shift--;
}
@@ -2201,35 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- write_seqcount_begin(&tk_core.seq);
/*
- * Update the real timekeeper.
- *
- * We could avoid this memcpy by switching pointers, but that
- * requires changes to all other timekeeper usage sites as
- * well, i.e. move the timekeeper pointer getter into the
- * spinlocked/seqcount protected sections. And we trade this
- * memcpy under the tk_core.seq against one before we start
- * updating.
+ * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls
+ * making small negative adjustments to the base xtime_nsec
+ * value, only update the coarse clocks if we accumulated time
*/
- timekeeping_update(tk, clock_set);
- memcpy(real_tk, tk, sizeof(*tk));
- /* The memcpy must come last. Do not put anything here! */
- write_seqcount_end(&tk_core.seq);
-out:
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (orig_offset != offset)
+ tk_update_coarse_nsecs(tk);
+
+ timekeeping_update_from_shadow(tkd, clock_set);
return !!clock_set;
}
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+{
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return __timekeeping_advance(&tk_core, mode);
+}
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
+ * It also updates the enabled auxiliary clock timekeepers
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
+ tk_aux_advance();
}
/**
@@ -2260,11 +2425,99 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 floor = atomic64_read(&mg_floor);
+ ktime_t f_real, offset, coarse;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ *ts = tk_xtime_coarse(tk);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ coarse = timespec64_to_ktime(*ts);
+ f_real = ktime_add(floor, offset);
+ if (ktime_after(f_real, coarse))
+ *ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts: pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t old = atomic64_read(&mg_floor);
+ ktime_t offset, mono;
+ unsigned int seq;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ mono = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ mono = ktime_add_ns(mono, nsecs);
+
+ /*
+ * Attempt to update the floor with the new time value. As any
+ * update must be later then the existing floor, and would effect
+ * a change to ctime from the perspective of the current task,
+ * accept the resulting floor value regardless of the outcome of
+ * the swap.
+ */
+ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+ timekeeping_inc_mg_floor_swaps();
+ } else {
+ /*
+ * Another task changed mg_floor since "old" was fetched.
+ * "old" has been updated with the latest value of "mg_floor".
+ * That value is newer than the previous floor value, which
+ * is enough to effect a change to ctime. Accept it.
+ */
+ *ts = ktime_to_timespec64(ktime_add(old, offset));
+ }
+}
+
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
@@ -2274,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
@@ -2339,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct __kernel_timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2398,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return -EINVAL;
}
+ if (aux_clock) {
+ /* Auxiliary clocks are similar to TAI and do not have leap seconds */
+ if (txc->status & (STA_INS | STA_DEL))
+ return -EINVAL;
+
+ /* No TAI offset setting */
+ if (txc->modes & ADJ_TAI)
+ return -EINVAL;
+
+ /* No PPS support either */
+ if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -2416,88 +2683,431 @@ unsigned long random_get_entropy_fallback(void)
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
-/**
- * do_adjtimex() - Accessor function to NTP __do_adjtimex function
- */
-int do_adjtimex(struct __kernel_timex *txc)
+struct adjtimex_result {
+ struct audit_ntp_data ad;
+ struct timespec64 delta;
+ bool clock_set;
+};
+
+static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
+ struct adjtimex_result *result)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct audit_ntp_data ad;
- bool clock_set = false;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ bool aux_clock = !timekeeper_is_core_tk(tks);
struct timespec64 ts;
- unsigned long flags;
s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
- ret = timekeeping_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc, aux_clock);
if (ret)
return ret;
add_device_randomness(txc, sizeof(*txc));
+ if (!aux_clock)
+ ktime_get_real_ts64(&ts);
+ else
+ tk_get_aux_ts64(tkd->timekeeper.id, &ts);
+
+ add_device_randomness(&ts, sizeof(ts));
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+
+ if (!tks->clock_valid)
+ return -ENODEV;
+
if (txc->modes & ADJ_SETOFFSET) {
- struct timespec64 delta;
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
+ result->delta.tv_sec = txc->time.tv_sec;
+ result->delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- ret = timekeeping_inject_offset(&delta);
+ result->delta.tv_nsec *= 1000;
+ ret = __timekeeping_inject_offset(tkd, &result->delta);
if (ret)
return ret;
-
- audit_tk_injoffset(delta);
+ result->clock_set = true;
}
- audit_ntp_init(&ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);
- ktime_get_real_ts64(&ts);
- add_device_randomness(&ts, sizeof(ts));
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
+ result->clock_set = true;
+ } else {
+ tk_update_leap_state_all(&tk_core);
+ }
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ /* Update the multiplier immediately if frequency was set directly */
+ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
+ result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);
- orig_tai = tai = tk->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ return ret;
+}
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tk, tai);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- clock_set = true;
- }
- tk_update_leap_state(tk);
+/**
+ * do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
+ */
+int do_adjtimex(struct __kernel_timex *txc)
+{
+ struct adjtimex_result result = { };
+ int ret;
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ ret = __do_adjtimex(&tk_core, txc, &result);
+ if (ret < 0)
+ return ret;
- audit_ntp_log(&ad);
+ if (txc->modes & ADJ_SETOFFSET)
+ audit_tk_injoffset(result.delta);
- /* Update the multiplier immediately if frequency was set directly */
- if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- clock_set |= timekeeping_advance(TK_ADV_FREQ);
+ audit_ntp_log(&result.ad);
- if (clock_set)
- clock_was_set(CLOCK_REALTIME);
+ if (result.clock_set)
+ clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer();
+ ntp_notify_cmos_timer(result.delta.tv_sec != 0);
return ret;
}
+/*
+ * Invoked from NTP with the time keeper lock held, so lockless access is
+ * fine.
+ */
+long ktime_get_ntp_seconds(unsigned int id)
+{
+ return timekeeper_data[id].timekeeper.xtime_sec;
+}
+
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
+ * @phase_ts: Pointer to timespec64 structure representing phase timestamp
+ * @raw_ts: Pointer to timespec64 structure representing raw timestamp
*/
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
__hardpps(phase_ts, raw_ts);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+#include "posix-timers.h"
+
+/*
+ * Bitmap for the activated auxiliary timekeepers to allow lockless quick
+ * checks in the hot paths without touching extra cache lines. If set, then
+ * the state of the corresponding timekeeper has to be re-checked under
+ * timekeeper::lock.
+ */
+static unsigned long aux_timekeepers;
+
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+ return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ if (!clockid_aux_valid(id))
+ return NULL;
+ return &timekeeper_data[clockid_to_tkid(id)];
+}
+
+/* Invoked from timekeeping after a clocksource change */
+static void tk_aux_update_clocksource(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+ if (!tks->clock_valid)
+ continue;
+
+ timekeeping_forward_now(tks);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ }
+}
+
+static void tk_aux_advance(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ /* Lockless quick check to avoid extra cache lines */
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+
+ guard(raw_spinlock)(&aux_tkd->lock);
+ if (aux_tkd->shadow_timekeeper.clock_valid)
+ __timekeeping_advance(aux_tkd, TK_ADV_TICK);
+ }
+}
+
+/**
+ * ktime_get_aux - Get time for a AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tk;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ if (!aux_tkd)
+ return false;
+
+ aux_tk = &aux_tkd->timekeeper;
+ do {
+ seq = read_seqcount_begin(&aux_tkd->seq);
+ if (!aux_tk->clock_valid)
+ return false;
+
+ base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+ nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+ } while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+ *kt = ktime_add_ns(base, nsecs);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for a AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+ ktime_t now;
+
+ if (!ktime_get_aux(id, &now))
+ return false;
+ *ts = ktime_to_timespec64(now);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+ if (!clockid_aux_valid(id))
+ return -ENODEV;
+
+ tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+ tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+ return 0;
+}
+
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+ return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks;
+ ktime_t tnow, nsecs;
+
+ if (!timespec64_valid_settod(tnew))
+ return -EINVAL;
+ if (!aux_tkd)
+ return -ENODEV;
+
+ aux_tks = &aux_tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ if (!aux_tks->clock_valid)
+ return -ENODEV;
+
+ /* Forward the timekeeper base time */
+ timekeeping_forward_now(aux_tks);
+ /*
+ * Get the updated base time. tkr_mono.base has not been
+ * updated yet, so do that first. That makes the update
+ * in timekeeping_update_from_shadow() redundant, but
+ * that's harmless. After that @tnow can be calculated
+ * by using tkr_mono::cycle_last, which has been set
+ * by timekeeping_forward_now().
+ */
+ tk_update_ktime_data(aux_tks);
+ nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+ tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+ /*
+ * Calculate the new AUX offset as delta to @tnow ("monotonic").
+ * That avoids all the tk::xtime back and forth conversions as
+ * xtime ("realtime") is not applicable for auxiliary clocks and
+ * kept in sync with "monotonic".
+ */
+ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
+
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+ return 0;
+}
+
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct adjtimex_result result = { };
+
+ if (!aux_tkd)
+ return -ENODEV;
+
+ /*
+ * @result is ignored for now as there are neither hrtimers nor a
+ * RTC related to auxiliary clocks for now.
+ */
+ return __do_adjtimex(aux_tkd, txc, &result);
+}
+
+const struct k_clock clock_aux = {
+ .clock_getres = aux_get_res,
+ .clock_get_timespec = aux_get_timespec,
+ .clock_set = aux_clock_set,
+ .clock_adj = aux_clock_adj,
+};
+
+static void aux_clock_enable(clockid_t id)
+{
+ struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+ /* Prevent the core timekeeper from changing. */
+ guard(raw_spinlock_irq)(&tk_core.lock);
+
+ /*
+ * Setup the auxiliary clock assuming that the raw core timekeeper
+ * clock frequency conversion is close enough. Userspace has to
+ * adjust for the deviation via clock_adjtime(2).
+ */
+ guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+ /* Remove leftovers of a previous registration */
+ memset(aux_tks, 0, sizeof(*aux_tks));
+ /* Restore the timekeeper id */
+ aux_tks->id = aux_tkd->timekeeper.id;
+ /* Setup the timekeeper based on the current system clocksource */
+ tk_setup_internals(aux_tks, tkr_raw->clock);
+
+ /* Mark it valid and set it live */
+ aux_tks->clock_valid = true;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static void aux_clock_disable(clockid_t id)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ aux_tkd->shadow_timekeeper.clock_valid = false;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static DEFINE_MUTEX(aux_clock_mutex);
+
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+ bool enable;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ guard(mutex)(&aux_clock_mutex);
+ if (enable == test_bit(id, &aux_timekeepers))
+ return count;
+
+ if (enable) {
+ aux_clock_enable(CLOCK_AUX + id);
+ set_bit(id, &aux_timekeepers);
+ } else {
+ aux_clock_disable(CLOCK_AUX + id);
+ clear_bit(id, &aux_timekeepers);
+ }
+ return count;
+}
+
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+
+ return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+ &aux_clock_enable_attr.attr,
+ NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+ .attrs = aux_clock_enable_attrs,
+};
+
+static int __init tk_aux_sysfs_init(void)
+{
+ struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+ int ret = -ENOMEM;
+
+ if (!tko)
+ return ret;
+
+ auxo = kobject_create_and_add("aux_clocks", tko);
+ if (!auxo)
+ goto err_clean;
+
+ for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+ char id[2] = { [0] = '0' + i, };
+ struct kobject *clk = kobject_create_and_add(id, auxo);
+
+ if (!clk) {
+ ret = -ENOMEM;
+ goto err_clean;
+ }
+
+ ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+ if (ret)
+ goto err_clean;
+ }
+ return 0;
+
+err_clean:
+ kobject_put(auxo);
+ kobject_put(tko);
+ return ret;
+}
+late_initcall(tk_aux_sysfs_init);
+
+static __init void tk_aux_setup(void)
+{
+ for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+ tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
#define NUM_BINS 32
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+ return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..973ede670a36 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,30 +10,42 @@
* timekeeping debug functions
*/
#ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+ this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
#else
+
#define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
#endif
-#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
{
u64 ret = (now - last) & mask;
/*
- * Prevent time going backwards by checking the MSB of mask in
- * the result. If set, return 0.
+ * Prevent time going backwards by checking the result against
+ * @max_delta. If greater, return 0.
*/
- return ret & ~(mask >> 1) ? 0 : ret;
-}
-#else
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
-{
- return (now - last) & mask;
+ return ret > max_delta ? 0 : ret;
}
-#endif
/* Semi public for serialization of non timekeeper VDSO updates. */
-extern raw_spinlock_t timekeeper_lock;
+unsigned long timekeeper_lock_irqsave(void);
+void timekeeper_unlock_irqrestore(unsigned long flags);
+
+/* NTP specific interface to access the current seconds value */
+long ktime_get_ntp_seconds(unsigned int id);
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd..1f2364126894 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -37,7 +37,6 @@
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
-#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
@@ -53,6 +52,7 @@
#include <asm/io.h>
#include "tick-internal.h"
+#include "timer_migration.h"
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -63,15 +63,15 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The timer wheel has LVL_DEPTH array levels. Each level provides an array of
- * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
* level has a different granularity.
*
- * The level granularity is: LVL_CLK_DIV ^ lvl
+ * The level granularity is: LVL_CLK_DIV ^ level
* The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
*
* The array level of a newly armed timer depends on the relative expiry
* time. The farther the expiry time is away the higher the array level and
- * therefor the granularity becomes.
+ * therefore the granularity becomes.
*
* Contrary to the original timer wheel implementation, which aims for 'exact'
* expiry of the timers, this implementation removes the need for recascading
@@ -187,15 +187,66 @@ EXPORT_SYMBOL(jiffies_64);
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
#ifdef CONFIG_NO_HZ_COMMON
-# define NR_BASES 2
-# define BASE_STD 0
-# define BASE_DEF 1
+/*
+ * If multiple bases need to be locked, use the base ordering for lock
+ * nesting, i.e. lowest number first.
+ */
+# define NR_BASES 3
+# define BASE_LOCAL 0
+# define BASE_GLOBAL 1
+# define BASE_DEF 2
#else
# define NR_BASES 1
-# define BASE_STD 0
+# define BASE_LOCAL 0
+# define BASE_GLOBAL 0
# define BASE_DEF 0
#endif
+/**
+ * struct timer_base - Per CPU timer base (number of base depends on config)
+ * @lock: Lock protecting the timer_base
+ * @running_timer: When expiring timers, the lock is dropped. To make
+ * sure not to race against deleting/modifying a
+ * currently running timer, the pointer is set to the
+ * timer, which expires at the moment. If no timer is
+ * running, the pointer is NULL.
+ * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around
+ * timer expiry callback execution and when trying to
+ * delete a running timer and it wasn't successful in
+ * the first glance. It prevents priority inversion
+ * when callback was preempted on a remote CPU and a
+ * caller tries to delete the running timer. It also
+ * prevents a life lock, when the task which tries to
+ * delete a timer preempted the softirq thread which
+ * is running the timer callback function.
+ * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter
+ * waiting for the end of the timer callback function
+ * execution.
+ * @clk: clock of the timer base; is updated before enqueue
+ * of a timer; during expiry, it is 1 offset ahead of
+ * jiffies to avoid endless requeuing to current
+ * jiffies
+ * @next_expiry: expiry value of the first timer; it is updated when
+ * finding the next timer and during enqueue; the
+ * value is not valid, when next_expiry_recalc is set
+ * @cpu: Number of CPU the timer base belongs to
+ * @next_expiry_recalc: States, whether a recalculation of next_expiry is
+ * required. Value is set true, when a timer was
+ * deleted.
+ * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ
+ * code. This state is only used in standard
+ * base. Deferrable timers, which are enqueued remotely
+ * never wake up an idle CPU. So no matter of supporting it
+ * for this base.
+ * @timers_pending: Is set, when a timer is pending in the base. It is only
+ * reliable when next_expiry_recalc is not set.
+ * @pending_map: bitmap of the timer wheel; each bit reflects a
+ * bucket of the wheel. When a bit is set, at least a
+ * single timer is enqueued in the related bucket.
+ * @vectors: Array of lists; Each array member reflects a bucket
+ * of the timer wheel. The list contains all timers
+ * which are enqueued into a specific bucket.
+ */
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
@@ -237,7 +288,7 @@ static void timers_update_migration(void)
}
#ifdef CONFIG_SYSCTL
-static int timer_migration_handler(struct ctl_table *table, int write,
+static int timer_migration_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -250,7 +301,7 @@ static int timer_migration_handler(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table timer_sysctl[] = {
+static const struct ctl_table timer_sysctl[] = {
{
.procname = "timer_migration",
.data = &sysctl_timer_migration,
@@ -260,7 +311,6 @@ static struct ctl_table timer_sysctl[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static int __init timer_sysctl_init(void)
@@ -314,7 +364,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
rem = j % HZ;
/*
- * If the target jiffie is just after a whole second (which can happen
+ * If the target jiffy is just after a whole second (which can happen
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
@@ -336,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
}
/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, false);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies);
-
-/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -433,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
- * __round_jiffies_up - function to round jiffies up to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * This is the same as __round_jiffies() except that it will never
- * round down. This is useful for timeouts for which the exact time
- * of firing does not matter too much, as long as they don't fire too
- * early.
- */
-unsigned long __round_jiffies_up(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, true);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies_up);
-
-/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -571,26 +579,29 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!is_timers_nohz_active())
- return;
-
/*
- * TODO: This wants some optimizing similar to the code below, but we
- * will do that when we switch from push to pull for deferrable timers.
+ * Deferrable timers do not prevent the CPU from entering dynticks and
+ * are not taken into account on the idle/nohz_full path. An IPI when a
+ * new deferrable timer is enqueued will wake up the remote CPU but
+ * nothing will be done with the deferrable timer base. Therefore skip
+ * the remote IPI for deferrable timers completely.
*/
- if (timer->flags & TIMER_DEFERRABLE) {
- if (tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
return;
- }
/*
* We might have to IPI the remote CPU if the base is idle and the
- * timer is not deferrable. If the other CPU is on the way to idle
- * then it can't set base->is_idle as we hold the base lock:
+ * timer is pinned. If it is a non pinned timer, it is only queued
+ * on the remote CPU, when timer was running during queueing. Then
+ * everything is handled by remote CPU anyway. If the other CPU is
+ * on the way to idle then it can't set base->is_idle as we hold
+ * the base lock:
*/
- if (base->is_idle)
+ if (base->is_idle) {
+ WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
+ tick_nohz_full_cpu(base->cpu)));
wake_up_nohz_cpu(base->cpu);
+ }
}
/*
@@ -606,7 +617,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
- trace_timer_start(timer, timer->expires, timer->flags);
+ trace_timer_start(timer, bucket_expiry);
/*
* Check whether this is the new first expiring timer. The
@@ -618,7 +629,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* Set the next expiry time and kick the CPU so it
* can reevaluate the wheel:
*/
- base->next_expiry = bucket_expiry;
+ WRITE_ONCE(base->next_expiry, bucket_expiry);
base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
@@ -682,7 +693,7 @@ static bool timer_is_static_object(void *addr)
}
/*
- * fixup_init is called when:
+ * timer_fixup_init is called when:
* - an active object is initialized
*/
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
@@ -691,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
- del_timer_sync(timer);
+ timer_delete_sync(timer);
debug_object_init(timer, &timer_debug_descr);
return true;
default:
@@ -706,7 +717,7 @@ static void stub_timer(struct timer_list *unused)
}
/*
- * fixup_activate is called when:
+ * timer_fixup_activate is called when:
* - an active object is activated
* - an unknown non-static object is activated
*/
@@ -728,7 +739,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
}
/*
- * fixup_free is called when:
+ * timer_fixup_free is called when:
* - an active object is freed
*/
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
@@ -737,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
- del_timer_sync(timer);
+ timer_delete_sync(timer);
debug_object_free(timer, &timer_debug_descr);
return true;
default:
@@ -746,7 +757,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state)
}
/*
- * fixup_assert_init is called when:
+ * timer_fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
@@ -797,7 +808,7 @@ static void do_init_timer(struct timer_list *timer,
unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
+void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
@@ -805,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer,
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+EXPORT_SYMBOL_GPL(timer_init_key_on_stack);
-void destroy_timer_on_stack(struct timer_list *timer)
+void timer_destroy_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
-EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+EXPORT_SYMBOL_GPL(timer_destroy_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
@@ -851,7 +862,7 @@ static void do_init_timer(struct timer_list *timer,
}
/**
- * init_timer_key - initialize a timer
+ * timer_init_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
@@ -859,17 +870,17 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior calling *any* of the
+ * timer_init_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
+void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL(init_timer_key);
+EXPORT_SYMBOL(timer_init_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
@@ -902,28 +913,30 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
- struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+ int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
- base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
- return base;
+ index = BASE_DEF;
+
+ return per_cpu_ptr(&timer_bases[index], cpu);
}
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
- base = this_cpu_ptr(&timer_bases[BASE_DEF]);
- return base;
+ index = BASE_DEF;
+
+ return this_cpu_ptr(&timer_bases[index]);
}
static inline struct timer_base *get_timer_base(u32 tflags)
@@ -931,42 +944,34 @@ static inline struct timer_base *get_timer_base(u32 tflags)
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- if (static_branch_likely(&timers_migration_enabled) &&
- !(tflags & TIMER_PINNED))
- return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#endif
- return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+ unsigned long basej)
{
- unsigned long jnow = READ_ONCE(jiffies);
-
/*
- * No need to forward if we are close enough below jiffies.
- * Also while executing timers, base->clk is 1 offset ahead
- * of jiffies to avoid endless requeuing to current jiffies.
+ * Check whether we can forward the base. We can only do that when
+ * @basej is past base->clk otherwise we might rewind base->clk.
*/
- if ((long)(jnow - base->clk) < 1)
+ if (time_before_eq(basej, base->clk))
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow)) {
- base->clk = jnow;
+ if (time_after(base->next_expiry, basej)) {
+ base->clk = basej;
} else {
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
return;
base->clk = base->next_expiry;
}
+
}
+static inline void forward_timer_base(struct timer_base *base)
+{
+ __forward_timer_base(base, READ_ONCE(jiffies));
+}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1093,7 +1098,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
- new_base = get_target_base(base, timer->flags);
+ new_base = get_timer_this_cpu_base(timer->flags);
if (base != new_base) {
/*
@@ -1165,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending);
*
* mod_timer(timer, expires) is equivalent to:
*
- * del_timer(timer); timer->expires = expires; add_timer(timer);
+ * timer_delete(timer); timer->expires = expires; add_timer(timer);
*
* mod_timer() is more efficient than the above open coded sequence. In
- * case that the timer is inactive, the del_timer() part is a NOP. The
+ * case that the timer is inactive, the timer_delete() part is a NOP. The
* timer is in any case activated with the new expiry time @expires.
*
* Note that if there are multiple unserialized concurrent users of the
@@ -1246,11 +1251,48 @@ void add_timer(struct timer_list *timer)
EXPORT_SYMBOL(add_timer);
/**
+ * add_timer_local() - Start a timer on the local CPU
+ * @timer: The timer to be started
+ *
+ * Same as add_timer() except that the timer flag TIMER_PINNED is set.
+ *
+ * See add_timer() for further details.
+ */
+void add_timer_local(struct timer_list *timer)
+{
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
+ timer->flags |= TIMER_PINNED;
+ __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
+}
+EXPORT_SYMBOL(add_timer_local);
+
+/**
+ * add_timer_global() - Start a timer without TIMER_PINNED flag set
+ * @timer: The timer to be started
+ *
+ * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
+ *
+ * See add_timer() for further details.
+ */
+void add_timer_global(struct timer_list *timer)
+{
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
+ timer->flags &= ~TIMER_PINNED;
+ __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
+}
+EXPORT_SYMBOL(add_timer_global);
+
+/**
* add_timer_on - Start a timer on a particular CPU
* @timer: The timer to be started
* @cpu: The CPU to start it on
*
- * Same as add_timer() except that it starts the timer on the given CPU.
+ * Same as add_timer() except that it starts the timer on the given CPU and
+ * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
+ * the next round, add_timer_global() should be used instead as it unsets
+ * the TIMER_PINNED flag.
*
* See add_timer() for further details.
*/
@@ -1264,6 +1306,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
if (WARN_ON_ONCE(timer_pending(timer)))
return;
+ /* Make sure timer flags have TIMER_PINNED flag set */
+ timer->flags |= TIMER_PINNED;
+
new_base = get_timer_cpu_base(timer->flags, cpu);
/*
@@ -1324,7 +1369,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown)
* If @shutdown is set then the lock has to be taken whether the
* timer is pending or not to protect against a concurrent rearm
* which might hit between the lockless pending check and the lock
- * aquisition. By taking the lock it is ensured that such a newly
+ * acquisition. By taking the lock it is ensured that such a newly
* enqueued timer is dequeued and cannot end up with
* timer->function == NULL in the expiry code.
*
@@ -1413,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer)
+ if (base->running_timer != timer) {
ret = detach_if_pending(timer, base, true);
- if (shutdown)
- timer->function = NULL;
+ if (shutdown)
+ timer->function = NULL;
+ }
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1424,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
}
/**
- * try_to_del_timer_sync - Try to deactivate a timer
+ * timer_delete_sync_try - Try to deactivate a timer
* @timer: Timer to deactivate
*
* This function tries to deactivate a timer. On success the timer is not
@@ -1439,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
* * %1 - The timer was pending and deactivated
* * %-1 - The timer callback function is running on a different CPU
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_delete_sync_try(struct timer_list *timer)
{
return __try_to_del_timer_sync(timer, false);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
+EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
@@ -1469,6 +1515,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
+ __releases(&base->lock) __releases(&base->expiry_lock)
+ __acquires(&base->expiry_lock) __acquires(&base->lock)
{
if (atomic_read(&base->timer_waiters)) {
raw_spin_unlock_irq(&base->lock);
@@ -1803,13 +1851,15 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
/*
* Search the first expiring timer in the various clock levels. Caller must
* hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
*/
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void timer_recalc_next_expiry(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
- next = base->clk + NEXT_TIMER_MAX_DELTA;
+ next = base->clk + TIMER_NEXT_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
@@ -1834,7 +1884,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* bits are zero, we look at the next level as is. If not we
* need to advance it by one because that's going to be the
* next expiring bucket in that level. base->clk is the next
- * expiring jiffie. So in case of:
+ * expiring jiffy. So in case of:
*
* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
* 0 0 0 0 0 0
@@ -1870,10 +1920,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk += adj;
}
+ WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
- base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
- return next;
+ base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -1900,7 +1949,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
return basem;
/*
- * Round up to the next jiffie. High resolution timers are
+ * Round up to the next jiffy. High resolution timers are
* off, so the hrtimers are expired in the tick and we need to
* make sure that this tick really expires the timer to avoid
* a ping pong of the nohz stop code.
@@ -1910,63 +1959,357 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
+static unsigned long next_timer_interrupt(struct timer_base *base,
+ unsigned long basej)
+{
+ if (base->next_expiry_recalc)
+ timer_recalc_next_expiry(base);
+
+ /*
+ * Move next_expiry for the empty base into the future to prevent an
+ * unnecessary raise of the timer softirq when the next_expiry value
+ * will be reached even if there is no timer pending.
+ *
+ * This update is also required to make timer_base::next_expiry values
+ * easy comparable to find out which base holds the first pending timer.
+ */
+ if (!base->timers_pending)
+ WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);
+
+ return base->next_expiry;
+}
+
+static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
+ struct timer_base *base_local,
+ struct timer_base *base_global,
+ struct timer_events *tevt)
+{
+ unsigned long nextevt, nextevt_local, nextevt_global;
+ bool local_first;
+
+ nextevt_local = next_timer_interrupt(base_local, basej);
+ nextevt_global = next_timer_interrupt(base_global, basej);
+
+ local_first = time_before_eq(nextevt_local, nextevt_global);
+
+ nextevt = local_first ? nextevt_local : nextevt_global;
+
+ /*
+ * If the @nextevt is at max. one tick away, use @nextevt and store
+ * it in the local expiry value. The next global event is irrelevant in
+ * this case and can be left as KTIME_MAX.
+ */
+ if (time_before_eq(nextevt, basej + 1)) {
+ /* If we missed a tick already, force 0 delta */
+ if (time_before(nextevt, basej))
+ nextevt = basej;
+ tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;
+
+ /*
+ * This is required for the remote check only but it doesn't
+ * hurt, when it is done for both call sites:
+ *
+ * * The remote callers will only take care of the global timers
+ * as local timers will be handled by CPU itself. When not
+ * updating tevt->global with the already missed first global
+ * timer, it is possible that it will be missed completely.
+ *
+ * * The local callers will ignore the tevt->global anyway, when
+ * nextevt is max. one tick away.
+ */
+ if (!local_first)
+ tevt->global = tevt->local;
+ return nextevt;
+ }
+
+ /*
+ * Update tevt.* values:
+ *
+ * If the local queue expires first, then the global event can be
+ * ignored. If the global queue is empty, nothing to do either.
+ */
+ if (!local_first && base_global->timers_pending)
+ tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC;
+
+ if (base_local->timers_pending)
+ tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC;
+
+ return nextevt;
+}
+
+# ifdef CONFIG_SMP
/**
- * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
* @basej: base time jiffies
* @basem: base time clock monotonic
+ * @tevt: Pointer to the storage for the expiry values
+ * @cpu: Remote CPU
*
- * Returns the tick aligned clock monotonic time of the next pending
- * timer or KTIME_MAX if no timer is pending.
+ * Stores the next pending local and global timer expiry values in the
+ * struct pointed to by @tevt. If a queue is empty the corresponding
+ * field is set to KTIME_MAX. If local event expires before global
+ * event, global event is set to KTIME_MAX as well.
+ *
+ * Caller needs to make sure timer base locks are held (use
+ * timer_lock_remote_bases() for this purpose).
*/
-u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
+void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
+ struct timer_events *tevt,
+ unsigned int cpu)
+{
+ struct timer_base *base_local, *base_global;
+
+ /* Preset local / global events */
+ tevt->local = tevt->global = KTIME_MAX;
+
+ base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
+ base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+ lockdep_assert_held(&base_local->lock);
+ lockdep_assert_held(&base_global->lock);
+
+ fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
+}
+
+/**
+ * timer_unlock_remote_bases - unlock timer bases of cpu
+ * @cpu: Remote CPU
+ *
+ * Unlocks the remote timer bases.
+ */
+void timer_unlock_remote_bases(unsigned int cpu)
+ __releases(timer_bases[BASE_LOCAL]->lock)
+ __releases(timer_bases[BASE_GLOBAL]->lock)
+{
+ struct timer_base *base_local, *base_global;
+
+ base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
+ base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+ raw_spin_unlock(&base_global->lock);
+ raw_spin_unlock(&base_local->lock);
+}
+
+/**
+ * timer_lock_remote_bases - lock timer bases of cpu
+ * @cpu: Remote CPU
+ *
+ * Locks the remote timer bases.
+ */
+void timer_lock_remote_bases(unsigned int cpu)
+ __acquires(timer_bases[BASE_LOCAL]->lock)
+ __acquires(timer_bases[BASE_GLOBAL]->lock)
+{
+ struct timer_base *base_local, *base_global;
+
+ base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
+ base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+ lockdep_assert_irqs_disabled();
+
+ raw_spin_lock(&base_local->lock);
+ raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * timer_base_is_idle() - Return whether timer base is set idle
+ *
+ * Returns value of local timer base is_idle value.
+ */
+bool timer_base_is_idle(void)
+{
+ return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
+}
+
+static void __run_timer_base(struct timer_base *base);
+
+/**
+ * timer_expire_remote() - expire global timers of cpu
+ * @cpu: Remote CPU
+ *
+ * Expire timers of global base of remote CPU.
+ */
+void timer_expire_remote(unsigned int cpu)
+{
+ struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+ __run_timer_base(base);
+}
+
+static void timer_use_tmigr(unsigned long basej, u64 basem,
+ unsigned long *nextevt, bool *tick_stop_path,
+ bool timer_base_idle, struct timer_events *tevt)
+{
+ u64 next_tmigr;
+
+ if (timer_base_idle)
+ next_tmigr = tmigr_cpu_new_timer(tevt->global);
+ else if (tick_stop_path)
+ next_tmigr = tmigr_cpu_deactivate(tevt->global);
+ else
+ next_tmigr = tmigr_quick_check(tevt->global);
+
+ /*
+ * If the CPU is the last going idle in timer migration hierarchy, make
+ * sure the CPU will wake up in time to handle remote timers.
+ * next_tmigr == KTIME_MAX if other CPUs are still active.
+ */
+ if (next_tmigr < tevt->local) {
+ u64 tmp;
+
+ /* If we missed a tick already, force 0 delta */
+ if (next_tmigr < basem)
+ next_tmigr = basem;
+
+ tmp = div_u64(next_tmigr - basem, TICK_NSEC);
+
+ *nextevt = basej + (unsigned long)tmp;
+ tevt->local = next_tmigr;
+ }
+}
+# else
+static void timer_use_tmigr(unsigned long basej, u64 basem,
+ unsigned long *nextevt, bool *tick_stop_path,
+ bool timer_base_idle, struct timer_events *tevt)
+{
+ /*
+ * Make sure first event is written into tevt->local to not miss a
+ * timer on !SMP systems.
+ */
+ tevt->local = min_t(u64, tevt->local, tevt->global);
+}
+# endif /* CONFIG_SMP */
+
+static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
+ bool *idle)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
- u64 expires = KTIME_MAX;
+ struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
+ struct timer_base *base_local, *base_global;
unsigned long nextevt;
+ bool idle_is_possible;
/*
- * Pretend that there is no timer pending if the cpu is offline.
- * Possible pending timers will be migrated later to an active cpu.
+ * When the CPU is offline, the tick is cancelled and nothing is supposed
+ * to try to stop it.
*/
- if (cpu_is_offline(smp_processor_id()))
- return expires;
+ if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
+ if (idle)
+ *idle = true;
+ return tevt.local;
+ }
- raw_spin_lock(&base->lock);
- if (base->next_expiry_recalc)
- base->next_expiry = __next_timer_interrupt(base);
- nextevt = base->next_expiry;
+ base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
+ base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
+
+ raw_spin_lock(&base_local->lock);
+ raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
+
+ nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
+ base_global, &tevt);
+
+ /*
+ * If the next event is only one jiffy ahead there is no need to call
+ * timer migration hierarchy related functions. The value for the next
+ * global timer in @tevt struct equals then KTIME_MAX. This is also
+ * true, when the timer base is idle.
+ *
+ * The proper timer migration hierarchy function depends on the callsite
+ * and whether timer base is idle or not. @nextevt will be updated when
+ * this CPU needs to handle the first timer migration hierarchy
+ * event. See timer_use_tmigr() for detailed information.
+ */
+ idle_is_possible = time_after(nextevt, basej + 1);
+ if (idle_is_possible)
+ timer_use_tmigr(basej, basem, &nextevt, idle,
+ base_local->is_idle, &tevt);
/*
* We have a fresh next event. Check whether we can forward the
- * base. We can only do that when @basej is past base->clk
- * otherwise we might rewind base->clk.
+ * base.
*/
- if (time_after(basej, base->clk)) {
- if (time_after(nextevt, basej))
- base->clk = basej;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
- }
+ __forward_timer_base(base_local, basej);
+ __forward_timer_base(base_global, basej);
+
+ /*
+ * Set base->is_idle only when caller is timer_base_try_to_set_idle()
+ */
+ if (idle) {
+ /*
+ * Bases are idle if the next event is more than a tick
+ * away. Caution: @nextevt could have changed by enqueueing a
+ * global timer into timer migration hierarchy. Therefore a new
+ * check is required here.
+ *
+ * If the base is marked idle then any timer add operation must
+ * forward the base clk itself to keep granularity small. This
+ * idle logic is only maintained for the BASE_LOCAL and
+ * BASE_GLOBAL base, deferrable timers may still see large
+ * granularity skew (by design).
+ */
+ if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
+ base_local->is_idle = true;
+ /*
+ * Global timers queued locally while running in a task
+ * in nohz_full mode need a self-IPI to kick reprogramming
+ * in IRQ tail.
+ */
+ if (tick_nohz_full_cpu(base_local->cpu))
+ base_global->is_idle = true;
+ trace_timer_base_idle(true, base_local->cpu);
+ }
+ *idle = base_local->is_idle;
- if (time_before_eq(nextevt, basej)) {
- expires = basem;
- base->is_idle = false;
- } else {
- if (base->timers_pending)
- expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle.
- * Also the tick is stopped so any added timer must forward
- * the base clk itself to keep granularity small. This idle
- * logic is only maintained for the BASE_STD base, deferrable
- * timers may still see large granularity skew (by design).
+ * When timer base is not set idle, undo the effect of
+ * tmigr_cpu_deactivate() to prevent inconsistent states - active
+ * timer base but inactive timer migration hierarchy.
+ *
+ * When timer base was already marked idle, nothing will be
+ * changed here.
*/
- if ((expires - basem) > TICK_NSEC)
- base->is_idle = true;
+ if (!base_local->is_idle && idle_is_possible)
+ tmigr_cpu_activate();
}
- raw_spin_unlock(&base->lock);
- return cmp_next_hrtimer_event(basem, expires);
+ raw_spin_unlock(&base_global->lock);
+ raw_spin_unlock(&base_local->lock);
+
+ return cmp_next_hrtimer_event(basem, tevt.local);
+}
+
+/**
+ * get_next_timer_interrupt() - return the time (clock mono) of the next timer
+ * @basej: base time jiffies
+ * @basem: base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending timer or
+ * KTIME_MAX if no timer is pending. If timer of global base was queued into
+ * timer migration hierarchy, first global timer is not taken into account. If
+ * it was the last CPU of timer migration hierarchy going idle, first global
+ * event is taken into account.
+ */
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
+{
+ return __get_next_timer_interrupt(basej, basem, NULL);
+}
+
+/**
+ * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
+ * @basej: base time jiffies
+ * @basem: base time clock monotonic
+ * @idle: pointer to store the value of timer_base->is_idle on return;
+ * *idle contains the information whether tick was already stopped
+ *
+ * Returns the tick aligned clock monotonic time of the next pending timer or
+ * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
+ * returned as well.
+ */
+u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
+{
+ if (*idle)
+ return KTIME_MAX;
+
+ return __get_next_timer_interrupt(basej, basem, idle);
}
/**
@@ -1976,15 +2319,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
*/
void timer_clear_idle(void)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
-
/*
- * We do this unlocked. The worst outcome is a remote enqueue sending
- * a pointless IPI, but taking the lock would just make the window for
- * sending the IPI a few instructions smaller for the cost of taking
- * the lock in the exit from idle path.
+ * We do this unlocked. The worst outcome is a remote pinned timer
+ * enqueue sending a pointless IPI, but taking the lock would just
+ * make the window for sending the IPI a few instructions smaller
+ * for the cost of taking the lock in the exit from idle
+ * path. Required for BASE_LOCAL only.
*/
- base->is_idle = false;
+ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
+ trace_timer_base_idle(false, smp_processor_id());
+
+ /* Activate without holding the timer_base->lock */
+ tmigr_cpu_activate();
}
#endif
@@ -1997,11 +2345,10 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (time_before(jiffies, base->next_expiry))
- return;
+ lockdep_assert_held(&base->lock);
- timer_base_lock_expiry(base);
- raw_spin_lock_irq(&base->lock);
+ if (base->running_timer)
+ return;
while (time_after_eq(jiffies, base->clk) &&
time_after_eq(jiffies, base->next_expiry)) {
@@ -2011,30 +2358,55 @@ static inline void __run_timers(struct timer_base *base)
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
- * NEXT_TIMER_MAX_DELTA.
+ * TIMER_NEXT_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
+ /*
+ * While executing timers, base->clk is set 1 offset ahead of
+ * jiffies to avoid endless requeuing to current jiffies.
+ */
base->clk++;
- base->next_expiry = __next_timer_interrupt(base);
+ timer_recalc_next_expiry(base);
while (levels--)
expire_timers(base, heads + levels);
}
+}
+
+static void __run_timer_base(struct timer_base *base)
+{
+ /* Can race against a remote CPU updating next_expiry under the lock */
+ if (time_before(jiffies, READ_ONCE(base->next_expiry)))
+ return;
+
+ timer_base_lock_expiry(base);
+ raw_spin_lock_irq(&base->lock);
+ __run_timers(base);
raw_spin_unlock_irq(&base->lock);
timer_base_unlock_expiry(base);
}
+static void run_timer_base(int index)
+{
+ struct timer_base *base = this_cpu_ptr(&timer_bases[index]);
+
+ __run_timer_base(base);
+}
+
/*
* This function runs timers and the timer-tq in bottom half context.
*/
-static __latent_entropy void run_timer_softirq(struct softirq_action *h)
+static __latent_entropy void run_timer_softirq(void)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ run_timer_base(BASE_LOCAL);
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
+ run_timer_base(BASE_GLOBAL);
+ run_timer_base(BASE_DEF);
- __run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
- __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
+ if (is_timers_nohz_active())
+ tmigr_handle_remote();
+ }
}
/*
@@ -2042,19 +2414,50 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
*/
static void run_local_timers(void)
{
- struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
hrtimer_run_queues();
- /* Raise the softirq only if required. */
- if (time_before(jiffies, base->next_expiry)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
- return;
- /* CPU is awake, so check the deferrable base. */
- base++;
- if (time_before(jiffies, base->next_expiry))
+
+ for (int i = 0; i < NR_BASES; i++, base++) {
+ /*
+ * Raise the softirq only if required.
+ *
+ * timer_base::next_expiry can be written by a remote CPU while
+ * holding the lock. If this write happens at the same time than
+ * the lockless local read, sanity checker could complain about
+ * data corruption.
+ *
+ * There are two possible situations where
+ * timer_base::next_expiry is written by a remote CPU:
+ *
+ * 1. Remote CPU expires global timers of this CPU and updates
+ * timer_base::next_expiry of BASE_GLOBAL afterwards in
+ * next_timer_interrupt() or timer_recalc_next_expiry(). The
+ * worst outcome is a superfluous raise of the timer softirq
+ * when the not yet updated value is read.
+ *
+ * 2. A new first pinned timer is enqueued by a remote CPU
+ * and therefore timer_base::next_expiry of BASE_LOCAL is
+ * updated. When this update is missed, this isn't a
+ * problem, as an IPI is executed nevertheless when the CPU
+ * was idle before. When the CPU wasn't idle but the update
+ * is missed, then the timer would expire one jiffy late -
+ * bad luck.
+ *
+ * Those unlikely corner cases where the worst outcome is only a
+ * one jiffy delay or a superfluous raise of the softirq are
+ * not that expensive as doing the check always while holding
+ * the lock.
+ *
+ * Possible remote writers are using WRITE_ONCE(). Local reader
+ * uses therefore READ_ONCE().
+ */
+ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
+ (i == BASE_DEF && tmigr_requires_handle_remote())) {
+ raise_timer_softirq(TIMER_SOFTIRQ);
return;
+ }
}
- raise_softirq(TIMER_SOFTIRQ);
}
/*
@@ -2070,149 +2473,14 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
- if (in_irq())
+ if (in_hardirq())
irq_work_tick();
#endif
- scheduler_tick();
+ sched_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}
-/*
- * Since schedule_timeout()'s timer is defined on the stack, it must store
- * the target task on the stack as well.
- */
-struct process_timer {
- struct timer_list timer;
- struct task_struct *task;
-};
-
-static void process_timeout(struct timer_list *t)
-{
- struct process_timer *timeout = from_timer(timeout, t, timer);
-
- wake_up_process(timeout->task);
-}
-
-/**
- * schedule_timeout - sleep until timeout
- * @timeout: timeout value in jiffies
- *
- * Make the current task sleep until @timeout jiffies have elapsed.
- * The function behavior depends on the current task state
- * (see also set_current_state() description):
- *
- * %TASK_RUNNING - the scheduler is called, but the task does not sleep
- * at all. That happens because sched_submit_work() does nothing for
- * tasks in %TASK_RUNNING state.
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be %TASK_RUNNING when this
- * routine returns.
- *
- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
- * the CPU away without a bound on the timeout. In this case the return
- * value will be %MAX_SCHEDULE_TIMEOUT.
- *
- * Returns 0 when the timer has expired otherwise the remaining time in
- * jiffies will be returned. In all cases the return value is guaranteed
- * to be non-negative.
- */
-signed long __sched schedule_timeout(signed long timeout)
-{
- struct process_timer timer;
- unsigned long expire;
-
- switch (timeout)
- {
- case MAX_SCHEDULE_TIMEOUT:
- /*
- * These two special cases are useful to be comfortable
- * in the caller. Nothing more. We could take
- * MAX_SCHEDULE_TIMEOUT from one of the negative value
- * but I' d like to return a valid offset (>=0) to allow
- * the caller to do everything it want with the retval.
- */
- schedule();
- goto out;
- default:
- /*
- * Another bit of PARANOID. Note that the retval will be
- * 0 since no piece of kernel is supposed to do a check
- * for a negative retval of schedule_timeout() (since it
- * should never happens anyway). You just have the printk()
- * that will tell you if something is gone wrong and where.
- */
- if (timeout < 0) {
- printk(KERN_ERR "schedule_timeout: wrong timeout "
- "value %lx\n", timeout);
- dump_stack();
- __set_current_state(TASK_RUNNING);
- goto out;
- }
- }
-
- expire = timeout + jiffies;
-
- timer.task = current;
- timer_setup_on_stack(&timer.timer, process_timeout, 0);
- __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
- schedule();
- del_timer_sync(&timer.timer);
-
- /* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer.timer);
-
- timeout = expire - jiffies;
-
- out:
- return timeout < 0 ? 0 : timeout;
-}
-EXPORT_SYMBOL(schedule_timeout);
-
-/*
- * We can use __set_current_state() here because schedule_timeout() calls
- * schedule() unconditionally.
- */
-signed long __sched schedule_timeout_interruptible(signed long timeout)
-{
- __set_current_state(TASK_INTERRUPTIBLE);
- return schedule_timeout(timeout);
-}
-EXPORT_SYMBOL(schedule_timeout_interruptible);
-
-signed long __sched schedule_timeout_killable(signed long timeout)
-{
- __set_current_state(TASK_KILLABLE);
- return schedule_timeout(timeout);
-}
-EXPORT_SYMBOL(schedule_timeout_killable);
-
-signed long __sched schedule_timeout_uninterruptible(signed long timeout)
-{
- __set_current_state(TASK_UNINTERRUPTIBLE);
- return schedule_timeout(timeout);
-}
-EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-
-/*
- * Like schedule_timeout_uninterruptible(), except this task will not contribute
- * to load average.
- */
-signed long __sched schedule_timeout_idle(signed long timeout)
-{
- __set_current_state(TASK_IDLE);
- return schedule_timeout(timeout);
-}
-EXPORT_SYMBOL(schedule_timeout_idle);
-
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
@@ -2235,7 +2503,7 @@ int timers_prepare_cpu(unsigned int cpu)
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
@@ -2290,7 +2558,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2303,65 +2571,9 @@ static void __init init_timer_cpus(void)
init_timer_cpu(cpu);
}
-void __init init_timers(void)
+void __init timers_init(void)
{
init_timer_cpus();
posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
-
-/**
- * msleep - sleep safely even with waitqueue interruptions
- * @msecs: Time in milliseconds to sleep for
- */
-void msleep(unsigned int msecs)
-{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
-
- while (timeout)
- timeout = schedule_timeout_uninterruptible(timeout);
-}
-
-EXPORT_SYMBOL(msleep);
-
-/**
- * msleep_interruptible - sleep waiting for signals
- * @msecs: Time in milliseconds to sleep for
- */
-unsigned long msleep_interruptible(unsigned int msecs)
-{
- unsigned long timeout = msecs_to_jiffies(msecs) + 1;
-
- while (timeout && !signal_pending(current))
- timeout = schedule_timeout_interruptible(timeout);
- return jiffies_to_msecs(timeout);
-}
-
-EXPORT_SYMBOL(msleep_interruptible);
-
-/**
- * usleep_range_state - Sleep for an approximate time in a given state
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
- * @state: State of the current task that will be while sleeping
- *
- * In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range_state() instead of udelay(). The sleep improves responsiveness
- * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
- * power usage by allowing hrtimers to take advantage of an already-
- * scheduled interrupt instead of scheduling a new one just for this sleep.
- */
-void __sched usleep_range_state(unsigned long min, unsigned long max,
- unsigned int state)
-{
- ktime_t exp = ktime_add_us(ktime_get(), min);
- u64 delta = (u64)(max - min) * NSEC_PER_USEC;
-
- for (;;) {
- __set_current_state(state);
- /* Do not return before the requested sleep time has elapsed */
- if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
- break;
- }
-}
-EXPORT_SYMBOL(usleep_range_state);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ed7d6ad694fb..488e47e96e93 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,7 +46,7 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function));
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -98,12 +98,10 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
-
- SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
@@ -147,11 +145,15 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
# define P_ns(x) \
SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
(unsigned long long)(ktime_to_ns(ts->x)))
+# define P_flag(x, f) \
+ SEQ_printf(m, " .%-15s: %d\n", #x, !!(ts->flags & (f)))
+
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
- P(nohz_mode);
+ P_flag(nohz, TS_FLAG_NOHZ);
+ P_flag(highres, TS_FLAG_HIGHRES);
P_ns(last_tick);
- P(tick_stopped);
+ P_flag(tick_stopped, TS_FLAG_STOPPED);
P(idle_jiffies);
P(idle_calls);
P(idle_sleeps);
@@ -256,7 +258,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.9\n");
+ SEQ_printf(m, "Timer List Version: v0.10\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
new file mode 100644
index 000000000000..18dda1aa782d
--- /dev/null
+++ b/kernel/time/timer_migration.c
@@ -0,0 +1,2023 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Infrastructure for migratable timers
+ *
+ * Copyright(C) 2022 linutronix GmbH
+ */
+#include <linux/cpuhotplug.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/timerqueue.h>
+#include <trace/events/ipi.h>
+#include <linux/sched/isolation.h>
+
+#include "timer_migration.h"
+#include "tick-internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/timer_migration.h>
+
+/*
+ * The timer migration mechanism is built on a hierarchy of groups. The
+ * lowest level group contains CPUs, the next level groups of CPU groups
+ * and so forth. The CPU groups are kept per node so for the normal case
+ * lock contention won't happen across nodes. Depending on the number of
+ * CPUs per node even the next level might be kept as groups of CPU groups
+ * per node and only the levels above cross the node topology.
+ *
+ * Example topology for a two node system with 24 CPUs each.
+ *
+ * LVL 2 [GRP2:0]
+ * GRP1:0 = GRP1:M
+ *
+ * LVL 1 [GRP1:0] [GRP1:1]
+ * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5
+ *
+ * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5]
+ * CPUS 0-7 8-15 16-23 24-31 32-39 40-47
+ *
+ * The groups hold a timer queue of events sorted by expiry time. These
+ * queues are updated when CPUs go in idle. When they come out of idle
+ * ignore flag of events is set.
+ *
+ * Each group has a designated migrator CPU/group as long as a CPU/group is
+ * active in the group. This designated role is necessary to avoid that all
+ * active CPUs in a group try to migrate expired timers from other CPUs,
+ * which would result in massive lock bouncing.
+ *
+ * When a CPU is awake, it checks in it's own timer tick the group
+ * hierarchy up to the point where it is assigned the migrator role or if
+ * no CPU is active, it also checks the groups where no migrator is set
+ * (TMIGR_NONE).
+ *
+ * If it finds expired timers in one of the group queues it pulls them over
+ * from the idle CPU and runs the timer function. After that it updates the
+ * group and the parent groups if required.
+ *
+ * CPUs which go idle arm their CPU local timer hardware for the next local
+ * (pinned) timer event. If the next migratable timer expires after the
+ * next local timer or the CPU has no migratable timer pending then the
+ * CPU does not queue an event in the LVL0 group. If the next migratable
+ * timer expires before the next local timer then the CPU queues that timer
+ * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0
+ * group.
+ *
+ * When CPU comes out of idle and when a group has at least a single active
+ * child, the ignore flag of the tmigr_event is set. This indicates, that
+ * the event is ignored even if it is still enqueued in the parent groups
+ * timer queue. It will be removed when touching the timer queue the next
+ * time. This spares locking in active path as the lock protects (after
+ * setup) only event information. For more information about locking,
+ * please read the section "Locking rules".
+ *
+ * If the CPU is the migrator of the group then it delegates that role to
+ * the next active CPU in the group or sets migrator to TMIGR_NONE when
+ * there is no active CPU in the group. This delegation needs to be
+ * propagated up the hierarchy so hand over from other leaves can happen at
+ * all hierarchy levels w/o doing a search.
+ *
+ * When the last CPU in the system goes idle, then it drops all migrator
+ * duties up to the top level of the hierarchy (LVL2 in the example). It
+ * then has to make sure, that it arms it's own local hardware timer for
+ * the earliest event in the system.
+ *
+ *
+ * Lifetime rules:
+ * ---------------
+ *
+ * The groups are built up at init time or when CPUs come online. They are
+ * not destroyed when a group becomes empty due to offlining. The group
+ * just won't participate in the hierarchy management anymore. Destroying
+ * groups would result in interesting race conditions which would just make
+ * the whole mechanism slow and complex.
+ *
+ *
+ * Locking rules:
+ * --------------
+ *
+ * For setting up new groups and handling events it's required to lock both
+ * child and parent group. The lock ordering is always bottom up. This also
+ * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and
+ * active CPU/group information atomic_try_cmpxchg() is used instead and only
+ * the per CPU tmigr_cpu->lock is held.
+ *
+ * During the setup of groups tmigr_level_list is required. It is protected by
+ * @tmigr_mutex.
+ *
+ * When @timer_base->lock as well as tmigr related locks are required, the lock
+ * ordering is: first @timer_base->lock, afterwards tmigr related locks.
+ *
+ *
+ * Protection of the tmigr group state information:
+ * ------------------------------------------------
+ *
+ * The state information with the list of active children and migrator needs to
+ * be protected by a sequence counter. It prevents a race when updates in child
+ * groups are propagated in changed order. The state update is performed
+ * lockless and group wise. The following scenario describes what happens
+ * without updating the sequence counter:
+ *
+ * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well
+ * as GRP0:1 will not change during the scenario):
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:0, GRP0:1
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = CPU0 migrator = CPU2
+ * active = CPU0 active = CPU2
+ * / \ / \
+ * CPUs 0 1 2 3
+ * active idle active idle
+ *
+ *
+ * 1. CPU0 goes idle. As the update is performed group wise, in the first step
+ * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to
+ * walk the hierarchy.
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:0, GRP0:1
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * --> migrator = TMIGR_NONE migrator = CPU2
+ * --> active = active = CPU2
+ * / \ / \
+ * CPUs 0 1 2 3
+ * --> idle idle active idle
+ *
+ * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of
+ * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also
+ * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the
+ * hierarchy to perform the needed update from their point of view. The
+ * currently visible state looks the following:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:0, GRP0:1
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * --> migrator = CPU1 migrator = CPU2
+ * --> active = CPU1 active = CPU2
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle --> active active idle
+ *
+ * 3. Here is the race condition: CPU1 managed to propagate its changes (from
+ * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The
+ * active members of GRP1:0 remain unchanged after the update since it is
+ * still valid from CPU1 current point of view:
+ *
+ * LVL 1 [GRP1:0]
+ * --> migrator = GRP0:1
+ * --> active = GRP0:0, GRP0:1
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = CPU1 migrator = CPU2
+ * active = CPU1 active = CPU2
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle active active idle
+ *
+ * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0.
+ *
+ * LVL 1 [GRP1:0]
+ * --> migrator = GRP0:1
+ * --> active = GRP0:1
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = CPU1 migrator = CPU2
+ * active = CPU1 active = CPU2
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle active active idle
+ *
+ *
+ * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is
+ * active and is correctly listed as active in GRP0:0. However GRP1:0 does not
+ * have GRP0:0 listed as active, which is wrong. The sequence counter has been
+ * added to avoid inconsistent states during updates. The state is updated
+ * atomically only if all members, including the sequence counter, match the
+ * expected value (compare-and-exchange).
+ *
+ * Looking back at the previous example with the addition of the sequence
+ * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed
+ * the sequence number during the update in step 3 so the expected old value (as
+ * seen by CPU0 before starting the walk) does not match.
+ *
+ * Prevent race between new event and last CPU going inactive
+ * ----------------------------------------------------------
+ *
+ * When the last CPU is going idle and there is a concurrent update of a new
+ * first global timer of an idle CPU, the group and child states have to be read
+ * while holding the lock in tmigr_update_events(). The following scenario shows
+ * what happens, when this is not done.
+ *
+ * 1. Only CPU2 is active:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:1
+ * next_expiry = KTIME_MAX
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = CPU2
+ * active = active = CPU2
+ * next_expiry = KTIME_MAX next_expiry = KTIME_MAX
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle active idle
+ *
+ * 2. Now CPU 2 goes idle (and has no global timer, that has to be handled) and
+ * propagates that to GRP0:1:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:1
+ * next_expiry = KTIME_MAX
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE --> migrator = TMIGR_NONE
+ * active = --> active =
+ * next_expiry = KTIME_MAX next_expiry = KTIME_MAX
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle --> idle idle
+ *
+ * 3. Now the idle state is propagated up to GRP1:0. As this is now the last
+ * child going idle in top level group, the expiry of the next group event
+ * has to be handed back to make sure no event is lost. As there is no event
+ * enqueued, KTIME_MAX is handed back to CPU2.
+ *
+ * LVL 1 [GRP1:0]
+ * --> migrator = TMIGR_NONE
+ * --> active =
+ * next_expiry = KTIME_MAX
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = TMIGR_NONE
+ * active = active =
+ * next_expiry = KTIME_MAX next_expiry = KTIME_MAX
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle --> idle idle
+ *
+ * 4. CPU 0 has a new timer queued from idle and it expires at TIMER0. CPU0
+ * propagates that to GRP0:0:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = TMIGR_NONE
+ * active =
+ * next_expiry = KTIME_MAX
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = TMIGR_NONE
+ * active = active =
+ * --> next_expiry = TIMER0 next_expiry = KTIME_MAX
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle idle idle
+ *
+ * 5. GRP0:0 is not active, so the new timer has to be propagated to
+ * GRP1:0. Therefore the GRP1:0 state has to be read. When the stalled value
+ * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is
+ * handed back to CPU0, as it seems that there is still an active child in
+ * top level group.
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = TMIGR_NONE
+ * active =
+ * --> next_expiry = TIMER0
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = TMIGR_NONE
+ * active = active =
+ * next_expiry = TIMER0 next_expiry = KTIME_MAX
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle idle idle
+ *
+ * This is prevented by reading the state when holding the lock (when a new
+ * timer has to be propagated from idle path)::
+ *
+ * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up())
+ * -------------------------- ---------------------------
+ * // step 3:
+ * cmpxchg(&GRP1:0->state);
+ * tmigr_update_events() {
+ * spin_lock(&GRP1:0->lock);
+ * // ... update events ...
+ * // hand back first expiry when GRP1:0 is idle
+ * spin_unlock(&GRP1:0->lock);
+ * // ^^^ release state modification
+ * }
+ * tmigr_update_events() {
+ * spin_lock(&GRP1:0->lock)
+ * // ^^^ acquire state modification
+ * group_state = atomic_read(&GRP1:0->state)
+ * // .... update events ...
+ * // hand back first expiry when GRP1:0 is idle
+ * spin_unlock(&GRP1:0->lock) <3>
+ * // ^^^ makes state visible for other
+ * // callers of tmigr_new_timer_up()
+ * }
+ *
+ * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported
+ * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent
+ * update of the group state from active path is no problem, as the upcoming CPU
+ * will take care of the group events.
+ *
+ * Required event and timerqueue update after a remote expiry:
+ * -----------------------------------------------------------
+ *
+ * After expiring timers of a remote CPU, a walk through the hierarchy and
+ * update of events and timerqueues is required. It is obviously needed if there
+ * is a 'new' global timer but also if there is no new global timer but the
+ * remote CPU is still idle.
+ *
+ * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same
+ * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is
+ * also idle and has no global timer pending. CPU2 is the only active CPU and
+ * thus also the migrator:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:1
+ * --> timerqueue = evt-GRP0:0
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = CPU2
+ * active = active = CPU2
+ * groupevt.ignore = false groupevt.ignore = true
+ * groupevt.cpu = CPU0 groupevt.cpu =
+ * timerqueue = evt-CPU0, timerqueue =
+ * evt-CPU1
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle active idle
+ *
+ * 2. CPU2 starts to expire remote timers. It starts with LVL0 group
+ * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with
+ * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It
+ * looks at tmigr_event::cpu struct member and expires the pending timer(s)
+ * of CPU0.
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:1
+ * --> timerqueue =
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = CPU2
+ * active = active = CPU2
+ * groupevt.ignore = false groupevt.ignore = true
+ * --> groupevt.cpu = CPU0 groupevt.cpu =
+ * timerqueue = evt-CPU0, timerqueue =
+ * evt-CPU1
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle active idle
+ *
+ * 3. Some work has to be done after expiring the timers of CPU0. If we stop
+ * here, then CPU1's pending global timer(s) will not expire in time and the
+ * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just
+ * been processed. So it is required to walk the hierarchy from CPU0's point
+ * of view and update it accordingly. CPU0's event will be removed from the
+ * timerqueue because it has no pending timer. If CPU0 would have a timer
+ * pending then it has to expire after CPU1's first timer because all timers
+ * from this period were just expired. Either way CPU1's event will be first
+ * in GRP0:0's timerqueue and therefore set in the CPU field of the group
+ * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not
+ * active:
+ *
+ * LVL 1 [GRP1:0]
+ * migrator = GRP0:1
+ * active = GRP0:1
+ * --> timerqueue = evt-GRP0:0
+ * / \
+ * LVL 0 [GRP0:0] [GRP0:1]
+ * migrator = TMIGR_NONE migrator = CPU2
+ * active = active = CPU2
+ * groupevt.ignore = false groupevt.ignore = true
+ * --> groupevt.cpu = CPU1 groupevt.cpu =
+ * --> timerqueue = evt-CPU1 timerqueue =
+ * / \ / \
+ * CPUs 0 1 2 3
+ * idle idle active idle
+ *
+ * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the
+ * timer(s) of CPU1.
+ *
+ * The hierarchy walk in step 3 can be skipped if the migrator notices that a
+ * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care
+ * of the group as migrator and any needed updates within the hierarchy.
+ */
+
+static DEFINE_MUTEX(tmigr_mutex);
+static struct list_head *tmigr_level_list __read_mostly;
+
+static unsigned int tmigr_hierarchy_levels __read_mostly;
+static unsigned int tmigr_crossnode_level __read_mostly;
+
+static struct tmigr_group *tmigr_root;
+
+static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
+
+/*
+ * CPUs available for timer migration.
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ * Additionally tmigr_available_mutex serializes set/clear operations with each other.
+ */
+static cpumask_var_t tmigr_available_cpumask;
+static DEFINE_MUTEX(tmigr_available_mutex);
+
+/* Enabled during late initcall */
+static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
+
+#define TMIGR_NONE 0xFF
+#define BIT_CNT 8
+
+static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
+{
+ return !(tmc->tmgroup && tmc->available);
+}
+
+/*
+ * Returns true if @cpu should be excluded from the hierarchy as isolated.
+ * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs
+ * are still part of the hierarchy but become idle (from a tick and timer
+ * migration perspective) when they stop their tick. This lets the timekeeping
+ * CPU handle their global timers. Marking also isolated CPUs as idle would be
+ * too costly, hence they are completely excluded from the hierarchy.
+ * This check is necessary, for instance, to prevent offline isolated CPUs from
+ * being incorrectly marked as available once getting back online.
+ *
+ * This function returns false during early boot and the isolation logic is
+ * enabled only after isolated CPUs are marked as unavailable at late boot.
+ * The tick CPU can be isolated at boot, however we cannot mark it as
+ * unavailable to avoid having no global migrator for the nohz_full CPUs. This
+ * should be ensured by the callers of this function: implicitly from hotplug
+ * callbacks and explicitly in tmigr_init_isolation() and
+ * tmigr_isolated_exclude_cpumask().
+ */
+static inline bool tmigr_is_isolated(int cpu)
+{
+ if (!static_branch_unlikely(&tmigr_exclude_isolated))
+ return false;
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
+ cpuset_cpu_is_isolated(cpu)) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+}
+
+/*
+ * Returns true, when @childmask corresponds to the group migrator or when the
+ * group is not active - so no migrator is set.
+ */
+static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask)
+{
+ union tmigr_state s;
+
+ s.state = atomic_read(&group->migr_state);
+
+ if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE))
+ return true;
+
+ return false;
+}
+
+static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask)
+{
+ bool lonely, migrator = false;
+ unsigned long active;
+ union tmigr_state s;
+
+ s.state = atomic_read(&group->migr_state);
+
+ if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE))
+ migrator = true;
+
+ active = s.active;
+ lonely = bitmap_weight(&active, BIT_CNT) <= 1;
+
+ return (migrator && lonely);
+}
+
+static bool tmigr_check_lonely(struct tmigr_group *group)
+{
+ unsigned long active;
+ union tmigr_state s;
+
+ s.state = atomic_read(&group->migr_state);
+
+ active = s.active;
+
+ return bitmap_weight(&active, BIT_CNT) <= 1;
+}
+
+/**
+ * struct tmigr_walk - data required for walking the hierarchy
+ * @nextexp: Next CPU event expiry information which is handed into
+ * the timer migration code by the timer code
+ * (get_next_timer_interrupt())
+ * @firstexp: Contains the first event expiry information when
+ * hierarchy is completely idle. When CPU itself was the
+ * last going idle, information makes sure, that CPU will
+ * be back in time. When using this value in the remote
+ * expiry case, firstexp is stored in the per CPU tmigr_cpu
+ * struct of CPU which expires remote timers. It is updated
+ * in top level group only. Be aware, there could occur a
+ * new top level of the hierarchy between the 'top level
+ * call' in tmigr_update_events() and the check for the
+ * parent group in walk_groups(). Then @firstexp might
+ * contain a value != KTIME_MAX even if it was not the
+ * final top level. This is not a problem, as the worst
+ * outcome is a CPU which might wake up a little early.
+ * @evt: Pointer to tmigr_event which needs to be queued (of idle
+ * child group)
+ * @childmask: groupmask of child group
+ * @remote: Is set, when the new timer path is executed in
+ * tmigr_handle_remote_cpu()
+ * @basej: timer base in jiffies
+ * @now: timer base monotonic
+ * @check: is set if there is the need to handle remote timers;
+ * required in tmigr_requires_handle_remote() only
+ */
+struct tmigr_walk {
+ u64 nextexp;
+ u64 firstexp;
+ struct tmigr_event *evt;
+ u8 childmask;
+ bool remote;
+ unsigned long basej;
+ u64 now;
+ bool check;
+};
+
+typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
+
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+ struct tmigr_group *child, struct tmigr_group *group)
+{
+ do {
+ WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
+
+ if (up(group, child, data))
+ break;
+
+ child = group;
+ /*
+ * Pairs with the store release on group connection
+ * to make sure group initialization is visible.
+ */
+ group = READ_ONCE(group->parent);
+ data->childmask = child->groupmask;
+ WARN_ON_ONCE(!data->childmask);
+ } while (group);
+}
+
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+ struct tmigr_cpu *tmc)
+{
+ __walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
+static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
+{
+ lockdep_assert_held(&tmc->lock);
+
+ __walk_groups(up, data, tmc);
+}
+
+/*
+ * Returns the next event of the timerqueue @group->events
+ *
+ * Removes timers with ignore flag and update next_expiry of the group. Values
+ * of the group event are updated in tmigr_update_events() only.
+ */
+static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
+{
+ struct timerqueue_node *node = NULL;
+ struct tmigr_event *evt = NULL;
+
+ lockdep_assert_held(&group->lock);
+
+ WRITE_ONCE(group->next_expiry, KTIME_MAX);
+
+ while ((node = timerqueue_getnext(&group->events))) {
+ evt = container_of(node, struct tmigr_event, nextevt);
+
+ if (!READ_ONCE(evt->ignore)) {
+ WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
+ return evt;
+ }
+
+ /*
+ * Remove next timers with ignore flag, because the group lock
+ * is held anyway
+ */
+ if (!timerqueue_del(&group->events, node))
+ break;
+ }
+
+ return NULL;
+}
+
+/*
+ * Return the next event (with the expiry equal or before @now)
+ *
+ * Event, which is returned, is also removed from the queue.
+ */
+static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group,
+ u64 now)
+{
+ struct tmigr_event *evt = tmigr_next_groupevt(group);
+
+ if (!evt || now < evt->nextevt.expires)
+ return NULL;
+
+ /*
+ * The event is ready to expire. Remove it and update next group event.
+ */
+ timerqueue_del(&group->events, &evt->nextevt);
+ tmigr_next_groupevt(group);
+
+ return evt;
+}
+
+static u64 tmigr_next_groupevt_expires(struct tmigr_group *group)
+{
+ struct tmigr_event *evt;
+
+ evt = tmigr_next_groupevt(group);
+
+ if (!evt)
+ return KTIME_MAX;
+ else
+ return evt->nextevt.expires;
+}
+
+static bool tmigr_active_up(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ union tmigr_state curstate, newstate;
+ bool walk_done;
+ u8 childmask;
+
+ childmask = data->childmask;
+ /*
+ * No memory barrier is required here in contrast to
+ * tmigr_inactive_up(), as the group state change does not depend on the
+ * child state.
+ */
+ curstate.state = atomic_read(&group->migr_state);
+
+ do {
+ newstate = curstate;
+ walk_done = true;
+
+ if (newstate.migrator == TMIGR_NONE) {
+ newstate.migrator = childmask;
+
+ /* Changes need to be propagated */
+ walk_done = false;
+ }
+
+ newstate.active |= childmask;
+ newstate.seq++;
+
+ } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
+
+ trace_tmigr_group_set_cpu_active(group, newstate, childmask);
+
+ /*
+ * The group is active (again). The group event might be still queued
+ * into the parent group's timerqueue but can now be handled by the
+ * migrator of this group. Therefore the ignore flag for the group event
+ * is updated to reflect this.
+ *
+ * The update of the ignore flag in the active path is done lockless. In
+ * worst case the migrator of the parent group observes the change too
+ * late and expires remotely all events belonging to this group. The
+ * lock is held while updating the ignore flag in idle path. So this
+ * state change will not be lost.
+ */
+ WRITE_ONCE(group->groupevt.ignore, true);
+
+ return walk_done;
+}
+
+static void __tmigr_cpu_activate(struct tmigr_cpu *tmc)
+{
+ struct tmigr_walk data;
+
+ data.childmask = tmc->groupmask;
+
+ trace_tmigr_cpu_active(tmc);
+
+ tmc->cpuevt.ignore = true;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ walk_groups(&tmigr_active_up, &data, tmc);
+}
+
+/**
+ * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy
+ *
+ * Call site timer_clear_idle() is called with interrupts disabled.
+ */
+void tmigr_cpu_activate(void)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+
+ if (tmigr_is_not_available(tmc))
+ return;
+
+ if (WARN_ON_ONCE(!tmc->idle))
+ return;
+
+ raw_spin_lock(&tmc->lock);
+ tmc->idle = false;
+ __tmigr_cpu_activate(tmc);
+ raw_spin_unlock(&tmc->lock);
+}
+
+/*
+ * Returns true, if there is nothing to be propagated to the next level
+ *
+ * @data->firstexp is set to expiry of first global event of the (top level of
+ * the) hierarchy, but only when hierarchy is completely idle.
+ *
+ * The child and group states need to be read under the lock, to prevent a race
+ * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See
+ * also section "Prevent race between new event and last CPU going inactive" in
+ * the documentation at the top.
+ *
+ * This is the only place where the group event expiry value is set.
+ */
+static
+bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ struct tmigr_event *evt, *first_childevt;
+ union tmigr_state childstate, groupstate;
+ bool remote = data->remote;
+ bool walk_done = false;
+ bool ignore;
+ u64 nextexp;
+
+ if (child) {
+ raw_spin_lock(&child->lock);
+ raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
+
+ childstate.state = atomic_read(&child->migr_state);
+ groupstate.state = atomic_read(&group->migr_state);
+
+ if (childstate.active) {
+ walk_done = true;
+ goto unlock;
+ }
+
+ first_childevt = tmigr_next_groupevt(child);
+ nextexp = child->next_expiry;
+ evt = &child->groupevt;
+
+ /*
+ * This can race with concurrent idle exit (activate).
+ * If the current writer wins, a useless remote expiration may
+ * be scheduled. If the activate wins, the event is properly
+ * ignored.
+ */
+ ignore = (nextexp == KTIME_MAX) ? true : false;
+ WRITE_ONCE(evt->ignore, ignore);
+ } else {
+ nextexp = data->nextexp;
+
+ first_childevt = evt = data->evt;
+ ignore = evt->ignore;
+
+ /*
+ * Walking the hierarchy is required in any case when a
+ * remote expiry was done before. This ensures to not lose
+ * already queued events in non active groups (see section
+ * "Required event and timerqueue update after a remote
+ * expiry" in the documentation at the top).
+ *
+ * The two call sites which are executed without a remote expiry
+ * before, are not prevented from propagating changes through
+ * the hierarchy by the return:
+ * - When entering this path by tmigr_new_timer(), @evt->ignore
+ * is never set.
+ * - tmigr_inactive_up() takes care of the propagation by
+ * itself and ignores the return value. But an immediate
+ * return is possible if there is a parent, sparing group
+ * locking at this level, because the upper walking call to
+ * the parent will take care about removing this event from
+ * within the group and update next_expiry accordingly.
+ *
+ * However if there is no parent, ie: the hierarchy has only a
+ * single level so @group is the top level group, make sure the
+ * first event information of the group is updated properly and
+ * also handled properly, so skip this fast return path.
+ */
+ if (ignore && !remote && group->parent)
+ return true;
+
+ raw_spin_lock(&group->lock);
+
+ childstate.state = 0;
+ groupstate.state = atomic_read(&group->migr_state);
+ }
+
+ /*
+ * If the child event is already queued in the group, remove it from the
+ * queue when the expiry time changed only or when it could be ignored.
+ */
+ if (timerqueue_node_queued(&evt->nextevt)) {
+ if ((evt->nextevt.expires == nextexp) && !ignore) {
+ /* Make sure not to miss a new CPU event with the same expiry */
+ evt->cpu = first_childevt->cpu;
+ goto check_toplvl;
+ }
+
+ if (!timerqueue_del(&group->events, &evt->nextevt))
+ WRITE_ONCE(group->next_expiry, KTIME_MAX);
+ }
+
+ if (ignore) {
+ /*
+ * When the next child event could be ignored (nextexp is
+ * KTIME_MAX) and there was no remote timer handling before or
+ * the group is already active, there is no need to walk the
+ * hierarchy even if there is a parent group.
+ *
+ * The other way round: even if the event could be ignored, but
+ * if a remote timer handling was executed before and the group
+ * is not active, walking the hierarchy is required to not miss
+ * an enqueued timer in the non active group. The enqueued timer
+ * of the group needs to be propagated to a higher level to
+ * ensure it is handled.
+ */
+ if (!remote || groupstate.active)
+ walk_done = true;
+ } else {
+ evt->nextevt.expires = nextexp;
+ evt->cpu = first_childevt->cpu;
+
+ if (timerqueue_add(&group->events, &evt->nextevt))
+ WRITE_ONCE(group->next_expiry, nextexp);
+ }
+
+check_toplvl:
+ if (!group->parent && (groupstate.migrator == TMIGR_NONE)) {
+ walk_done = true;
+
+ /*
+ * Nothing to do when update was done during remote timer
+ * handling. First timer in top level group which needs to be
+ * handled when top level group is not active, is calculated
+ * directly in tmigr_handle_remote_up().
+ */
+ if (remote)
+ goto unlock;
+
+ /*
+ * The top level group is idle and it has to be ensured the
+ * global timers are handled in time. (This could be optimized
+ * by keeping track of the last global scheduled event and only
+ * arming it on the CPU if the new event is earlier. Not sure if
+ * its worth the complexity.)
+ */
+ data->firstexp = tmigr_next_groupevt_expires(group);
+ }
+
+ trace_tmigr_update_events(child, group, childstate, groupstate,
+ nextexp);
+
+unlock:
+ raw_spin_unlock(&group->lock);
+
+ if (child)
+ raw_spin_unlock(&child->lock);
+
+ return walk_done;
+}
+
+static bool tmigr_new_timer_up(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ return tmigr_update_events(group, child, data);
+}
+
+/*
+ * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is
+ * returned, if an active CPU will handle all the timer migration hierarchy
+ * timers.
+ */
+static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp)
+{
+ struct tmigr_walk data = { .nextexp = nextexp,
+ .firstexp = KTIME_MAX,
+ .evt = &tmc->cpuevt };
+
+ lockdep_assert_held(&tmc->lock);
+
+ if (tmc->remote)
+ return KTIME_MAX;
+
+ trace_tmigr_cpu_new_timer(tmc);
+
+ tmc->cpuevt.ignore = false;
+ data.remote = false;
+
+ walk_groups(&tmigr_new_timer_up, &data, tmc);
+
+ /* If there is a new first global event, make sure it is handled */
+ return data.firstexp;
+}
+
+static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
+ unsigned long jif)
+{
+ struct timer_events tevt;
+ struct tmigr_walk data;
+ struct tmigr_cpu *tmc;
+
+ tmc = per_cpu_ptr(&tmigr_cpu, cpu);
+
+ raw_spin_lock_irq(&tmc->lock);
+
+ /*
+ * If the remote CPU is offline then the timers have been migrated to
+ * another CPU.
+ *
+ * If tmigr_cpu::remote is set, at the moment another CPU already
+ * expires the timers of the remote CPU.
+ *
+ * If tmigr_event::ignore is set, then the CPU returns from idle and
+ * takes care of its timers.
+ *
+ * If the next event expires in the future, then the event has been
+ * updated and there are no timers to expire right now. The CPU which
+ * updated the event takes care when hierarchy is completely
+ * idle. Otherwise the migrator does it as the event is enqueued.
+ */
+ if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
+ now < tmc->cpuevt.nextevt.expires) {
+ raw_spin_unlock_irq(&tmc->lock);
+ return;
+ }
+
+ trace_tmigr_handle_remote_cpu(tmc);
+
+ tmc->remote = true;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ /* Drop the lock to allow the remote CPU to exit idle */
+ raw_spin_unlock_irq(&tmc->lock);
+
+ if (cpu != smp_processor_id())
+ timer_expire_remote(cpu);
+
+ /*
+ * Lock ordering needs to be preserved - timer_base locks before tmigr
+ * related locks (see section "Locking rules" in the documentation at
+ * the top). During fetching the next timer interrupt, also tmc->lock
+ * needs to be held. Otherwise there is a possible race window against
+ * the CPU itself when it comes out of idle, updates the first timer in
+ * the hierarchy and goes back to idle.
+ *
+ * timer base locks are dropped as fast as possible: After checking
+ * whether the remote CPU went offline in the meantime and after
+ * fetching the next remote timer interrupt. Dropping the locks as fast
+ * as possible keeps the locking region small and prevents holding
+ * several (unnecessary) locks during walking the hierarchy for updating
+ * the timerqueue and group events.
+ */
+ local_irq_disable();
+ timer_lock_remote_bases(cpu);
+ raw_spin_lock(&tmc->lock);
+
+ /*
+ * When the CPU went offline in the meantime, no hierarchy walk has to
+ * be done for updating the queued events, because the walk was
+ * already done during marking the CPU offline in the hierarchy.
+ *
+ * When the CPU is no longer idle, the CPU takes care of the timers and
+ * also of the timers in the hierarchy.
+ *
+ * (See also section "Required event and timerqueue update after a
+ * remote expiry" in the documentation at the top)
+ */
+ if (!tmc->available || !tmc->idle) {
+ timer_unlock_remote_bases(cpu);
+ goto unlock;
+ }
+
+ /* next event of CPU */
+ fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu);
+ timer_unlock_remote_bases(cpu);
+
+ data.nextexp = tevt.global;
+ data.firstexp = KTIME_MAX;
+ data.evt = &tmc->cpuevt;
+ data.remote = true;
+
+ /*
+ * The update is done even when there is no 'new' global timer pending
+ * on the remote CPU (see section "Required event and timerqueue update
+ * after a remote expiry" in the documentation at the top)
+ */
+ walk_groups(&tmigr_new_timer_up, &data, tmc);
+
+unlock:
+ tmc->remote = false;
+ raw_spin_unlock_irq(&tmc->lock);
+}
+
+static bool tmigr_handle_remote_up(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ struct tmigr_event *evt;
+ unsigned long jif;
+ u8 childmask;
+ u64 now;
+
+ jif = data->basej;
+ now = data->now;
+
+ childmask = data->childmask;
+
+ trace_tmigr_handle_remote(group);
+again:
+ /*
+ * Handle the group only if @childmask is the migrator or if the
+ * group has no migrator. Otherwise the group is active and is
+ * handled by its own migrator.
+ */
+ if (!tmigr_check_migrator(group, childmask))
+ return true;
+
+ raw_spin_lock_irq(&group->lock);
+
+ evt = tmigr_next_expired_groupevt(group, now);
+
+ if (evt) {
+ unsigned int remote_cpu = evt->cpu;
+
+ raw_spin_unlock_irq(&group->lock);
+
+ tmigr_handle_remote_cpu(remote_cpu, now, jif);
+
+ /* check if there is another event, that needs to be handled */
+ goto again;
+ }
+
+ /*
+ * Keep track of the expiry of the first event that needs to be handled
+ * (group->next_expiry was updated by tmigr_next_expired_groupevt(),
+ * next was set by tmigr_handle_remote_cpu()).
+ */
+ data->firstexp = group->next_expiry;
+
+ raw_spin_unlock_irq(&group->lock);
+
+ return false;
+}
+
+/**
+ * tmigr_handle_remote() - Handle global timers of remote idle CPUs
+ *
+ * Called from the timer soft interrupt with interrupts enabled.
+ */
+void tmigr_handle_remote(void)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_walk data;
+
+ if (tmigr_is_not_available(tmc))
+ return;
+
+ data.childmask = tmc->groupmask;
+ data.firstexp = KTIME_MAX;
+
+ /*
+ * NOTE: This is a doubled check because the migrator test will be done
+ * in tmigr_handle_remote_up() anyway. Keep this check to speed up the
+ * return when nothing has to be done.
+ */
+ if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {
+ /*
+ * If this CPU was an idle migrator, make sure to clear its wakeup
+ * value so it won't chase timers that have already expired elsewhere.
+ * This avoids endless requeue from tmigr_new_timer().
+ */
+ if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
+ return;
+ }
+
+ data.now = get_jiffies_update(&data.basej);
+
+ /*
+ * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to
+ * KTIME_MAX. Even if tmc->lock is not held during the whole remote
+ * handling, tmc->wakeup is fine to be stale as it is called in
+ * interrupt context and tick_nohz_next_event() is executed in interrupt
+ * exit path only after processing the last pending interrupt.
+ */
+
+ __walk_groups(&tmigr_handle_remote_up, &data, tmc);
+
+ raw_spin_lock_irq(&tmc->lock);
+ WRITE_ONCE(tmc->wakeup, data.firstexp);
+ raw_spin_unlock_irq(&tmc->lock);
+}
+
+static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ u8 childmask;
+
+ childmask = data->childmask;
+
+ /*
+ * Handle the group only if the child is the migrator or if the group
+ * has no migrator. Otherwise the group is active and is handled by its
+ * own migrator.
+ */
+ if (!tmigr_check_migrator(group, childmask))
+ return true;
+ /*
+ * The lock is required on 32bit architectures to read the variable
+ * consistently with a concurrent writer. On 64bit the lock is not
+ * required because the read operation is not split and so it is always
+ * consistent.
+ */
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ data->firstexp = READ_ONCE(group->next_expiry);
+ if (data->now >= data->firstexp) {
+ data->check = true;
+ return true;
+ }
+ } else {
+ raw_spin_lock(&group->lock);
+ data->firstexp = group->next_expiry;
+ if (data->now >= group->next_expiry) {
+ data->check = true;
+ raw_spin_unlock(&group->lock);
+ return true;
+ }
+ raw_spin_unlock(&group->lock);
+ }
+
+ return false;
+}
+
+/**
+ * tmigr_requires_handle_remote() - Check the need of remote timer handling
+ *
+ * Must be called with interrupts disabled.
+ */
+bool tmigr_requires_handle_remote(void)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_walk data;
+ unsigned long jif;
+ bool ret = false;
+
+ if (tmigr_is_not_available(tmc))
+ return ret;
+
+ data.now = get_jiffies_update(&jif);
+ data.childmask = tmc->groupmask;
+ data.firstexp = KTIME_MAX;
+ data.check = false;
+
+ /*
+ * If the CPU is active, walk the hierarchy to check whether a remote
+ * expiry is required.
+ *
+ * Check is done lockless as interrupts are disabled and @tmc->idle is
+ * set only by the local CPU.
+ */
+ if (!tmc->idle) {
+ __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc);
+
+ return data.check;
+ }
+
+ /*
+ * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock
+ * is required on 32bit architectures to read the variable consistently
+ * with a concurrent writer. On 64bit the lock is not required because
+ * the read operation is not split and so it is always consistent.
+ */
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (data.now >= READ_ONCE(tmc->wakeup))
+ return true;
+ } else {
+ raw_spin_lock(&tmc->lock);
+ if (data.now >= tmc->wakeup)
+ ret = true;
+ raw_spin_unlock(&tmc->lock);
+ }
+
+ return ret;
+}
+
+/**
+ * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc)
+ * @nextexp: Next expiry of global timer (or KTIME_MAX if not)
+ *
+ * The CPU is already deactivated in the timer migration
+ * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event()
+ * and thereby the timer idle path is executed once more. @tmc->wakeup
+ * holds the first timer, when the timer migration hierarchy is
+ * completely idle.
+ *
+ * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if
+ * nothing needs to be done.
+ */
+u64 tmigr_cpu_new_timer(u64 nextexp)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ u64 ret;
+
+ if (tmigr_is_not_available(tmc))
+ return nextexp;
+
+ raw_spin_lock(&tmc->lock);
+
+ ret = READ_ONCE(tmc->wakeup);
+ if (nextexp != KTIME_MAX) {
+ if (nextexp != tmc->cpuevt.nextevt.expires ||
+ tmc->cpuevt.ignore) {
+ ret = tmigr_new_timer(tmc, nextexp);
+ /*
+ * Make sure the reevaluation of timers in idle path
+ * will not miss an event.
+ */
+ WRITE_ONCE(tmc->wakeup, ret);
+ }
+ }
+ trace_tmigr_cpu_new_timer_idle(tmc, nextexp);
+ raw_spin_unlock(&tmc->lock);
+ return ret;
+}
+
+static bool tmigr_inactive_up(struct tmigr_group *group,
+ struct tmigr_group *child,
+ struct tmigr_walk *data)
+{
+ union tmigr_state curstate, newstate, childstate;
+ bool walk_done;
+ u8 childmask;
+
+ childmask = data->childmask;
+ childstate.state = 0;
+
+ /*
+ * The memory barrier is paired with the cmpxchg() in tmigr_active_up()
+ * to make sure the updates of child and group states are ordered. The
+ * ordering is mandatory, as the group state change depends on the child
+ * state.
+ */
+ curstate.state = atomic_read_acquire(&group->migr_state);
+
+ for (;;) {
+ if (child)
+ childstate.state = atomic_read(&child->migr_state);
+
+ newstate = curstate;
+ walk_done = true;
+
+ /* Reset active bit when the child is no longer active */
+ if (!childstate.active)
+ newstate.active &= ~childmask;
+
+ if (newstate.migrator == childmask) {
+ /*
+ * Find a new migrator for the group, because the child
+ * group is idle!
+ */
+ if (!childstate.active) {
+ unsigned long new_migr_bit, active = newstate.active;
+
+ new_migr_bit = find_first_bit(&active, BIT_CNT);
+
+ if (new_migr_bit != BIT_CNT) {
+ newstate.migrator = BIT(new_migr_bit);
+ } else {
+ newstate.migrator = TMIGR_NONE;
+
+ /* Changes need to be propagated */
+ walk_done = false;
+ }
+ }
+ }
+
+ newstate.seq++;
+
+ WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active));
+
+ if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) {
+ trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
+ break;
+ }
+
+ /*
+ * The memory barrier is paired with the cmpxchg() in
+ * tmigr_active_up() to make sure the updates of child and group
+ * states are ordered. It is required only when the above
+ * try_cmpxchg() fails.
+ */
+ smp_mb__after_atomic();
+ }
+
+ data->remote = false;
+
+ /* Event Handling */
+ tmigr_update_events(group, child, data);
+
+ return walk_done;
+}
+
+static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp)
+{
+ struct tmigr_walk data = { .nextexp = nextexp,
+ .firstexp = KTIME_MAX,
+ .evt = &tmc->cpuevt,
+ .childmask = tmc->groupmask };
+
+ /*
+ * If nextexp is KTIME_MAX, the CPU event will be ignored because the
+ * local timer expires before the global timer, no global timer is set
+ * or CPU goes offline.
+ */
+ if (nextexp != KTIME_MAX)
+ tmc->cpuevt.ignore = false;
+
+ walk_groups(&tmigr_inactive_up, &data, tmc);
+ return data.firstexp;
+}
+
+/**
+ * tmigr_cpu_deactivate() - Put current CPU into inactive state
+ * @nextexp: The next global timer expiry of the current CPU
+ *
+ * Must be called with interrupts disabled.
+ *
+ * Return: the next event expiry of the current CPU or the next event expiry
+ * from the hierarchy if this CPU is the top level migrator or the hierarchy is
+ * completely idle.
+ */
+u64 tmigr_cpu_deactivate(u64 nextexp)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ u64 ret;
+
+ if (tmigr_is_not_available(tmc))
+ return nextexp;
+
+ raw_spin_lock(&tmc->lock);
+
+ ret = __tmigr_cpu_deactivate(tmc, nextexp);
+
+ tmc->idle = true;
+
+ /*
+ * Make sure the reevaluation of timers in idle path will not miss an
+ * event.
+ */
+ WRITE_ONCE(tmc->wakeup, ret);
+
+ trace_tmigr_cpu_idle(tmc, nextexp);
+ raw_spin_unlock(&tmc->lock);
+ return ret;
+}
+
+/**
+ * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to
+ * go idle
+ * @nextevt: The next global timer expiry of the current CPU
+ *
+ * Return:
+ * * KTIME_MAX - when it is probable that nothing has to be done (not
+ * the only one in the level 0 group; and if it is the
+ * only one in level 0 group, but there are more than a
+ * single group active on the way to top level)
+ * * nextevt - when CPU is offline and has to handle timer on its own
+ * or when on the way to top in every group only a single
+ * child is active but @nextevt is before the lowest
+ * next_expiry encountered while walking up to top level.
+ * * next_expiry - value of lowest expiry encountered while walking groups
+ * if only a single child is active on each and @nextevt
+ * is after this lowest expiry.
+ */
+u64 tmigr_quick_check(u64 nextevt)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ struct tmigr_group *group = tmc->tmgroup;
+
+ if (tmigr_is_not_available(tmc))
+ return nextevt;
+
+ if (WARN_ON_ONCE(tmc->idle))
+ return nextevt;
+
+ if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))
+ return KTIME_MAX;
+
+ do {
+ if (!tmigr_check_lonely(group))
+ return KTIME_MAX;
+
+ /*
+ * Since current CPU is active, events may not be sorted
+ * from bottom to the top because the CPU's event is ignored
+ * up to the top and its sibling's events not propagated upwards.
+ * Thus keep track of the lowest observed expiry.
+ */
+ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
+ group = group->parent;
+ } while (group);
+
+ return nextevt;
+}
+
+/*
+ * tmigr_trigger_active() - trigger a CPU to become active again
+ *
+ * This function is executed on a CPU which is part of cpu_online_mask, when the
+ * last active CPU in the hierarchy is offlining. With this, it is ensured that
+ * the other CPU is active and takes over the migrator duty.
+ */
+static long tmigr_trigger_active(void *unused)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+
+ WARN_ON_ONCE(!tmc->available || tmc->idle);
+
+ return 0;
+}
+
+static int tmigr_clear_cpu_available(unsigned int cpu)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+ int migrator;
+ u64 firstexp;
+
+ guard(mutex)(&tmigr_available_mutex);
+
+ cpumask_clear_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (!tmc->available)
+ return 0;
+ tmc->available = false;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ /*
+ * CPU has to handle the local events on his own, when on the way to
+ * offline; Therefore nextevt value is set to KTIME_MAX
+ */
+ firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
+ trace_tmigr_cpu_unavailable(tmc);
+ }
+
+ if (firstexp != KTIME_MAX) {
+ migrator = cpumask_any(tmigr_available_cpumask);
+ work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ }
+
+ return 0;
+}
+
+static int tmigr_set_cpu_available(unsigned int cpu)
+{
+ struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+
+ /* Check whether CPU data was successfully initialized */
+ if (WARN_ON_ONCE(!tmc->tmgroup))
+ return -EINVAL;
+
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ guard(mutex)(&tmigr_available_mutex);
+
+ cpumask_set_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (tmc->available)
+ return 0;
+ trace_tmigr_cpu_available(tmc);
+ tmc->idle = timer_base_is_idle();
+ if (!tmc->idle)
+ __tmigr_cpu_activate(tmc);
+ tmc->available = true;
+ }
+ return 0;
+}
+
+static void tmigr_cpu_isolate(struct work_struct *ignored)
+{
+ tmigr_clear_cpu_available(smp_processor_id());
+}
+
+static void tmigr_cpu_unisolate(struct work_struct *ignored)
+{
+ tmigr_set_cpu_available(smp_processor_id());
+}
+
+/**
+ * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
+ * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
+ *
+ * This function can be called from cpuset code to provide the new set of
+ * isolated CPUs that should be excluded from the hierarchy.
+ * Online CPUs not present in exclude_cpumask but already excluded are brought
+ * back to the hierarchy.
+ * Functions to isolate/unisolate need to be called locally and can sleep.
+ */
+int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+ struct work_struct __percpu *works __free(free_percpu) =
+ alloc_percpu(struct work_struct);
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!works)
+ return -ENOMEM;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * First set previously isolated CPUs as available (unisolate).
+ * This cpumask contains only CPUs that switched to available now.
+ */
+ cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
+ cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_unisolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /*
+ * Then clear previously available CPUs (isolate).
+ * This cpumask contains only CPUs that switched to not available now.
+ * There cannot be overlap with the newly available ones.
+ */
+ cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
+ cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ /*
+ * Handle this here and not in the cpuset code because exclude_cpumask
+ * might include also the tick CPU if included in isolcpus.
+ */
+ for_each_cpu(cpu, cpumask) {
+ if (!tick_nohz_cpu_hotpluggable(cpu)) {
+ cpumask_clear_cpu(cpu, cpumask);
+ break;
+ }
+ }
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_isolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ return 0;
+}
+
+static int __init tmigr_init_isolation(void)
+{
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+
+ static_branch_enable(&tmigr_exclude_isolated);
+
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN))
+ return 0;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ /* Protect against RCU torture hotplug testing */
+ guard(cpus_read_lock)();
+ return tmigr_isolated_exclude_cpumask(cpumask);
+}
+late_initcall(tmigr_init_isolation);
+
+static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
+ int node)
+{
+ union tmigr_state s;
+
+ raw_spin_lock_init(&group->lock);
+
+ group->level = lvl;
+ group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
+
+ group->num_children = 0;
+
+ s.migrator = TMIGR_NONE;
+ s.active = 0;
+ s.seq = 0;
+ atomic_set(&group->migr_state, s.state);
+
+ timerqueue_init_head(&group->events);
+ timerqueue_init(&group->groupevt.nextevt);
+ group->groupevt.nextevt.expires = KTIME_MAX;
+ WRITE_ONCE(group->next_expiry, KTIME_MAX);
+ group->groupevt.ignore = true;
+}
+
+static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+{
+ struct tmigr_group *tmp, *group = NULL;
+
+ lockdep_assert_held(&tmigr_mutex);
+
+ /* Try to attach to an existing group first */
+ list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+ /*
+ * If @lvl is below the cross NUMA node level, check whether
+ * this group belongs to the same NUMA node.
+ */
+ if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
+ continue;
+
+ /* Capacity left? */
+ if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP)
+ continue;
+
+ /*
+ * TODO: A possible further improvement: Make sure that all CPU
+ * siblings end up in the same group of the lowest level of the
+ * hierarchy. Rely on the topology sibling mask would be a
+ * reasonable solution.
+ */
+
+ group = tmp;
+ break;
+ }
+
+ if (group)
+ return group;
+
+ /* Allocate and set up a new group */
+ group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ tmigr_init_group(group, lvl, node);
+
+ /* Setup successful. Add it to the hierarchy */
+ list_add(&group->list, &tmigr_level_list[lvl]);
+ trace_tmigr_group_set(group);
+ return group;
+}
+
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+ if (!group->parent && group != tmigr_root) {
+ /*
+ * This is the new top-level, prepare its groupmask in advance
+ * to avoid accidents where yet another new top-level is
+ * created in the future and made visible before this groupmask.
+ */
+ group->groupmask = BIT(0);
+ WARN_ON_ONCE(activate);
+
+ return true;
+ }
+
+ return false;
+
+}
+
+static void tmigr_connect_child_parent(struct tmigr_group *child,
+ struct tmigr_group *parent,
+ bool activate)
+{
+ if (tmigr_init_root(parent, activate)) {
+ /*
+ * The previous top level had prepared its groupmask already,
+ * simply account it in advance as the first child. If some groups
+ * have been created between the old and new root due to node
+ * mismatch, the new root's child will be intialized accordingly.
+ */
+ parent->num_children = 1;
+ }
+
+ /* Connecting old root to new root ? */
+ if (!parent->parent && activate) {
+ /*
+ * @child is the old top, or in case of node mismatch, some
+ * intermediate group between the old top and the new one in
+ * @parent. In this case the @child must be pre-accounted above
+ * as the first child. Its new inactive sibling corresponding
+ * to the CPU going up has been accounted as the second child.
+ */
+ WARN_ON_ONCE(parent->num_children != 2);
+ child->groupmask = BIT(0);
+ } else {
+ /* Common case adding @child for the CPU going up to @parent. */
+ child->groupmask = BIT(parent->num_children++);
+ }
+
+ /*
+ * Make sure parent initialization is visible before publishing it to a
+ * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+ * address dependency that pairs with the READ_ONCE() in __walk_groups().
+ */
+ smp_store_release(&child->parent, parent);
+
+ trace_tmigr_connect_child_parent(child);
+}
+
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+ struct tmigr_group *start, bool activate)
+{
+ struct tmigr_group *group, *child, **stack;
+ int i, top = 0, err = 0, start_lvl = 0;
+ bool root_mismatch = false;
+
+ stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
+ if (!stack)
+ return -ENOMEM;
+
+ if (start) {
+ stack[start->level] = start;
+ start_lvl = start->level + 1;
+ }
+
+ if (tmigr_root)
+ root_mismatch = tmigr_root->numa_node != node;
+
+ for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
+ group = tmigr_get_group(node, i);
+ if (IS_ERR(group)) {
+ err = PTR_ERR(group);
+ i--;
+ break;
+ }
+
+ top = i;
+ stack[i] = group;
+
+ /*
+ * When booting only less CPUs of a system than CPUs are
+ * available, not all calculated hierarchy levels are required,
+ * unless a node mismatch is detected.
+ *
+ * The loop is aborted as soon as the highest level, which might
+ * be different from tmigr_hierarchy_levels, contains only a
+ * single group, unless the nodes mismatch below tmigr_crossnode_level
+ */
+ if (group->parent)
+ break;
+ if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+ list_is_singular(&tmigr_level_list[i]))
+ break;
+ }
+
+ /* Assert single root without parent */
+ if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
+ return -EINVAL;
+
+ for (; i >= start_lvl; i--) {
+ group = stack[i];
+
+ if (err < 0) {
+ list_del(&group->list);
+ kfree(group);
+ continue;
+ }
+
+ WARN_ON_ONCE(i != group->level);
+
+ /*
+ * Update tmc -> group / child -> group connection
+ */
+ if (i == 0) {
+ struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
+
+ tmc->tmgroup = group;
+ tmc->groupmask = BIT(group->num_children++);
+
+ tmigr_init_root(group, activate);
+
+ trace_tmigr_connect_cpu_parent(tmc);
+
+ /* There are no children that need to be connected */
+ continue;
+ } else {
+ child = stack[i - 1];
+ tmigr_connect_child_parent(child, group, activate);
+ }
+ }
+
+ if (err < 0)
+ goto out;
+
+ if (activate) {
+ struct tmigr_walk data;
+ union tmigr_state state;
+
+ /*
+ * To prevent inconsistent states, active children need to be active in
+ * the new parent as well. Inactive children are already marked inactive
+ * in the parent group:
+ *
+ * * When new groups were created by tmigr_setup_groups() starting from
+ * the lowest level, then they are not active. They will be set active
+ * when the new online CPU comes active.
+ *
+ * * But if new groups above the current top level are required, it is
+ * mandatory to propagate the active state of the already existing
+ * child to the new parents. So tmigr_active_up() activates the
+ * new parents while walking up from the old root to the new.
+ *
+ * * It is ensured that @start is active, as this setup path is
+ * executed in hotplug prepare callback. This is executed by an
+ * already connected and !idle CPU. Even if all other CPUs go idle,
+ * the CPU executing the setup will be responsible up to current top
+ * level group. And the next time it goes inactive, it will release
+ * the new childmask and parent to subsequent walkers through this
+ * @child. Therefore propagate active state unconditionally.
+ */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(!state.active);
+ WARN_ON_ONCE(!start->parent);
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
+
+ /* Root update */
+ if (list_is_singular(&tmigr_level_list[top])) {
+ group = list_first_entry(&tmigr_level_list[top],
+ typeof(*group), list);
+ WARN_ON_ONCE(group->parent);
+ if (tmigr_root) {
+ /* Old root should be the same or below */
+ WARN_ON_ONCE(tmigr_root->level > top);
+ }
+ tmigr_root = group;
+ }
+out:
+ kfree(stack);
+
+ return err;
+}
+
+static int tmigr_add_cpu(unsigned int cpu)
+{
+ struct tmigr_group *old_root = tmigr_root;
+ int node = cpu_to_node(cpu);
+ int ret;
+
+ guard(mutex)(&tmigr_mutex);
+
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+ /* Root has changed? Connect the old one to the new */
+ if (ret >= 0 && old_root && old_root != tmigr_root) {
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (nevertheless whether old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ /*
+ * The (likely) current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
+ ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ }
+
+ return ret;
+}
+
+static int tmigr_cpu_prepare(unsigned int cpu)
+{
+ struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
+ int ret = 0;
+
+ /* Not first online attempt? */
+ if (tmc->tmgroup)
+ return ret;
+
+ raw_spin_lock_init(&tmc->lock);
+ timerqueue_init(&tmc->cpuevt.nextevt);
+ tmc->cpuevt.nextevt.expires = KTIME_MAX;
+ tmc->cpuevt.ignore = true;
+ tmc->cpuevt.cpu = cpu;
+ tmc->remote = false;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ ret = tmigr_add_cpu(cpu);
+ if (ret < 0)
+ return ret;
+
+ if (tmc->groupmask == 0)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int __init tmigr_init(void)
+{
+ unsigned int cpulvl, nodelvl, cpus_per_node, i;
+ unsigned int nnodes = num_possible_nodes();
+ unsigned int ncpus = num_possible_cpus();
+ int ret = -ENOMEM;
+
+ BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP);
+
+ /* Nothing to do if running on UP */
+ if (ncpus == 1)
+ return 0;
+
+ if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * Calculate the required hierarchy levels. Unfortunately there is no
+ * reliable information available, unless all possible CPUs have been
+ * brought up and all NUMA nodes are populated.
+ *
+ * Estimate the number of levels with the number of possible nodes and
+ * the number of possible CPUs. Assume CPUs are spread evenly across
+ * nodes. We cannot rely on cpumask_of_node() because it only works for
+ * online CPUs.
+ */
+ cpus_per_node = DIV_ROUND_UP(ncpus, nnodes);
+
+ /* Calc the hierarchy levels required to hold the CPUs of a node */
+ cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node),
+ ilog2(TMIGR_CHILDREN_PER_GROUP));
+
+ /* Calculate the extra levels to connect all nodes */
+ nodelvl = DIV_ROUND_UP(order_base_2(nnodes),
+ ilog2(TMIGR_CHILDREN_PER_GROUP));
+
+ tmigr_hierarchy_levels = cpulvl + nodelvl;
+
+ /*
+ * If a NUMA node spawns more than one CPU level group then the next
+ * level(s) of the hierarchy contains groups which handle all CPU groups
+ * of the same NUMA node. The level above goes across NUMA nodes. Store
+ * this information for the setup code to decide in which level node
+ * matching is no longer required.
+ */
+ tmigr_crossnode_level = cpulvl;
+
+ tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL);
+ if (!tmigr_level_list)
+ goto err;
+
+ for (i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&tmigr_level_list[i]);
+
+ pr_info("Timer migration: %d hierarchy levels; %d children per group;"
+ " %d crossnode level\n",
+ tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
+ tmigr_crossnode_level);
+
+ ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
+ tmigr_cpu_prepare, NULL);
+ if (ret)
+ goto err;
+
+ ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
+ tmigr_set_cpu_available, tmigr_clear_cpu_available);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ pr_err("Timer migration setup failed\n");
+ return ret;
+}
+early_initcall(tmigr_init);
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
new file mode 100644
index 000000000000..70879cde6fdd
--- /dev/null
+++ b/kernel/time/timer_migration.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _KERNEL_TIME_MIGRATION_H
+#define _KERNEL_TIME_MIGRATION_H
+
+/* Per group capacity. Must be a power of 2! */
+#define TMIGR_CHILDREN_PER_GROUP 8
+
+/**
+ * struct tmigr_event - a timer event associated to a CPU
+ * @nextevt: The node to enqueue an event in the parent group queue
+ * @cpu: The CPU to which this event belongs
+ * @ignore: Hint whether the event could be ignored; it is set when
+ * CPU or group is active;
+ */
+struct tmigr_event {
+ struct timerqueue_node nextevt;
+ unsigned int cpu;
+ bool ignore;
+};
+
+/**
+ * struct tmigr_group - timer migration hierarchy group
+ * @lock: Lock protecting the event information and group hierarchy
+ * information during setup
+ * @parent: Pointer to the parent group. Pointer is updated when a
+ * new hierarchy level is added because of a CPU coming
+ * online the first time. Once it is set, the pointer will
+ * not be removed or updated. When accessing parent pointer
+ * lock less to decide whether to abort a propagation or
+ * not, it is not a problem. The worst outcome is an
+ * unnecessary/early CPU wake up. But do not access parent
+ * pointer several times in the same 'action' (like
+ * activation, deactivation, check for remote expiry,...)
+ * without holding the lock as it is not ensured that value
+ * will not change.
+ * @groupevt: Next event of the group which is only used when the
+ * group is !active. The group event is then queued into
+ * the parent timer queue.
+ * Ignore bit of @groupevt is set when the group is active.
+ * @next_expiry: Base monotonic expiry time of the next event of the
+ * group; It is used for the racy lockless check whether a
+ * remote expiry is required; it is always reliable
+ * @events: Timer queue for child events queued in the group
+ * @migr_state: State of the group (see union tmigr_state)
+ * @level: Hierarchy level of the group; Required during setup
+ * @numa_node: Required for setup only to make sure CPU and low level
+ * group information is NUMA local. It is set to NUMA node
+ * as long as the group level is per NUMA node (level <
+ * tmigr_crossnode_level); otherwise it is set to
+ * NUMA_NO_NODE
+ * @num_children: Counter of group children to make sure the group is only
+ * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
+ * only
+ * @groupmask: mask of the group in the parent group; is set during
+ * setup and will never change; can be read lockless
+ * @list: List head that is added to the per level
+ * tmigr_level_list; is required during setup when a
+ * new group needs to be connected to the existing
+ * hierarchy groups
+ */
+struct tmigr_group {
+ raw_spinlock_t lock;
+ struct tmigr_group *parent;
+ struct tmigr_event groupevt;
+ u64 next_expiry;
+ struct timerqueue_head events;
+ atomic_t migr_state;
+ unsigned int level;
+ int numa_node;
+ unsigned int num_children;
+ u8 groupmask;
+ struct list_head list;
+};
+
+/**
+ * struct tmigr_cpu - timer migration per CPU group
+ * @lock: Lock protecting the tmigr_cpu group information
+ * @online: Indicates whether the CPU is online; In deactivate path
+ * it is required to know whether the migrator in the top
+ * level group is to be set offline, while a timer is
+ * pending. Then another online CPU needs to be notified to
+ * take over the migrator role. Furthermore the information
+ * is required in CPU hotplug path as the CPU is able to go
+ * idle before the timer migration hierarchy hotplug AP is
+ * reached. During this phase, the CPU has to handle the
+ * global timers on its own and must not act as a migrator.
+ * @idle: Indicates whether the CPU is idle in the timer migration
+ * hierarchy
+ * @remote: Is set when timers of the CPU are expired remotely
+ * @tmgroup: Pointer to the parent group
+ * @groupmask: mask of tmigr_cpu in the parent group
+ * @wakeup: Stores the first timer when the timer migration
+ * hierarchy is completely idle and remote expiry was done;
+ * is returned to timer code in the idle path and is only
+ * used in idle path.
+ * @cpuevt: CPU event which could be enqueued into the parent group
+ */
+struct tmigr_cpu {
+ raw_spinlock_t lock;
+ bool available;
+ bool idle;
+ bool remote;
+ struct tmigr_group *tmgroup;
+ u8 groupmask;
+ u64 wakeup;
+ struct tmigr_event cpuevt;
+};
+
+/**
+ * union tmigr_state - state of tmigr_group
+ * @state: Combined version of the state - only used for atomic
+ * read/cmpxchg function
+ * &anon struct: Split version of the state - only use the struct members to
+ * update information to stay independent of endianness
+ * @active: Contains each mask bit of the active children
+ * @migrator: Contains mask of the child which is migrator
+ * @seq: Sequence counter needs to be increased when an update
+ * to the tmigr_state is done. It prevents a race when
+ * updates in the child groups are propagated in changed
+ * order. Detailed information about the scenario is
+ * given in the documentation at the begin of
+ * timer_migration.c.
+ */
+union tmigr_state {
+ u32 state;
+ struct {
+ u8 active;
+ u8 migrator;
+ u16 seq;
+ } __packed;
+};
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void tmigr_handle_remote(void);
+extern bool tmigr_requires_handle_remote(void);
+extern void tmigr_cpu_activate(void);
+extern u64 tmigr_cpu_deactivate(u64 nextevt);
+extern u64 tmigr_cpu_new_timer(u64 nextevt);
+extern u64 tmigr_quick_check(u64 nextevt);
+#else
+static inline void tmigr_handle_remote(void) { }
+static inline bool tmigr_requires_handle_remote(void) { return false; }
+static inline void tmigr_cpu_activate(void) { }
+#endif
+
+#endif
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index f0d5062d9cbc..aa59919b8f2c 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,23 +15,28 @@
#include "timekeeping_internal.h"
-static inline void update_vdso_data(struct vdso_data *vdata,
- struct timekeeper *tk)
+static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base)
{
+ vc->cycle_last = base->cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vc->max_cycles = base->clock->max_cycles;
+#endif
+ vc->mask = base->mask;
+ vc->mult = base->mult;
+ vc->shift = base->shift;
+}
+
+static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
+{
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
- vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
- vdata[CS_RAW].mask = tk->tkr_raw.mask;
- vdata[CS_RAW].mult = tk->tkr_raw.mult;
- vdata[CS_RAW].shift = tk->tkr_raw.shift;
+ fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono);
+ fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw);
/* CLOCK_MONOTONIC */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec;
@@ -49,7 +54,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
vdso_ts->sec = sec;
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
@@ -59,19 +64,20 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts->nsec = nsec;
/* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
vdso_ts->sec = tk->raw_sec;
vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
/* CLOCK_TAI */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}
void update_vsyscall(struct timekeeper *tk)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
s32 clock_mode;
u64 nsec;
@@ -80,55 +86,95 @@ void update_vsyscall(struct timekeeper *tk)
vdso_write_begin(vdata);
clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
- vdata[CS_HRES_COARSE].clock_mode = clock_mode;
- vdata[CS_RAW].clock_mode = clock_mode;
+ vc[CS_HRES_COARSE].clock_mode = clock_mode;
+ vc[CS_RAW].clock_mode = clock_mode;
/* CLOCK_REALTIME also required for time() */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
/* CLOCK_REALTIME_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
/*
* Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
*/
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution);
/*
* If the current clocksource is not VDSO capable, then spare the
* update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
- update_vdso_data(vdata, tk);
+ update_vdso_time_data(vdata, tk);
- __arch_update_vsyscall(vdata, tk);
+ __arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+ __arch_update_vdso_clock(&vc[CS_RAW]);
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
void update_vsyscall_tz(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+
+ vdata->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata->tz_dsttime = sys_tz.tz_dsttime;
+
+ __arch_sync_vdso_time_data(vdata);
+}
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_timestamp *vdso_ts;
+ struct vdso_clock *vc;
+ s32 clock_mode;
+ u64 nsec;
+
+ vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+ vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ if (!tk->clock_valid)
+ clock_mode = VDSO_CLOCKMODE_NONE;
+
+ /* copy vsyscall data */
+ vdso_write_begin_clock(vc);
+
+ vc->clock_mode = clock_mode;
+
+ if (clock_mode != VDSO_CLOCKMODE_NONE) {
+ fill_clock_configuration(vc, &tk->tkr_mono);
+
+ vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->monotonic_to_aux.tv_nsec;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ nsec = nsec << tk->tkr_mono.shift;
+ vdso_ts->nsec = nsec;
+ }
+
+ __arch_update_vdso_clock(vc);
- vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
- vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ vdso_write_end_clock(vc);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
+#endif
/**
* vdso_update_begin - Start of a VDSO update section
@@ -144,10 +190,9 @@ void update_vsyscall_tz(void)
*/
unsigned long vdso_update_begin(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
- unsigned long flags;
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ unsigned long flags = timekeeper_lock_irqsave();
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
vdso_write_begin(vdata);
return flags;
}
@@ -162,9 +207,9 @@ unsigned long vdso_update_begin(void)
*/
void vdso_update_end(unsigned long flags)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ __arch_sync_vdso_time_data(vdata);
+ timekeeper_unlock_irqrestore(flags);
}