diff options
Diffstat (limited to 'kernel/time')
33 files changed, 2022 insertions, 1078 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. endmenu -endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..f7d52d9543cc 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o @@ -20,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0ddccdff119a..069d93bfb0c7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base @@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } @@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev) if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard (spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) @@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ @@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev) struct timerqueue_node *next; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + next = timerqueue_getnext(&base->timerqueue); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); @@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906..a59bc75ab7c5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 62e73444ffe4..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -137,7 +137,8 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. */ max_retries = clocksource_get_max_watchdog_retry(); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index aab6472853fa..a1890a073196 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -24,7 +24,7 @@ static void clocksource_enqueue(struct clocksource *cs); static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { - u64 delta = clocksource_delta(end, start, cs->mask); + u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta); if (likely(delta < cs->max_cycles)) return clocksource_cyc2ns(delta, cs->mult, cs->shift); @@ -144,7 +144,7 @@ static u64 suspend_start; * Default for maximum permissible skew when cs->uncertainty_margin is * not specified, and the lower bound even when cs->uncertainty_margin * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * clocks with unspecified cs->uncertainty_margin, so this macro is used * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. */ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); @@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_first(cpu_online_mask); - if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpumask_set_cpu(cpu, &cpus_chosen); @@ -342,10 +340,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); + cpu = cpumask_random(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } @@ -373,16 +368,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -402,6 +399,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", @@ -409,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); @@ -586,9 +583,7 @@ static void clocksource_watchdog(struct timer_list *unused) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); /* * Arm timer if not already pending: could race with concurrent @@ -616,7 +611,7 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } @@ -993,6 +988,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask, &cs->max_cycles); + + /* + * Threshold for detecting negative motion in clocksource_delta(). + * + * Allow for 0.875 of the counter width so that overly long idle + * sleeps, which go slightly over mask/2, do not trigger the + * negative motion detection. + */ + cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3); } static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) @@ -1498,7 +1502,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 80fe3749d2db..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,9 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); + /* * The timer bases: * @@ -74,55 +77,46 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} /* * Functions and macros which are different for UP/SMP systems are kept in a @@ -145,11 +139,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -183,27 +172,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will be issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -254,8 +270,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -264,8 +280,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -275,11 +290,6 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -349,7 +359,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -448,19 +458,17 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer, static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, @@ -716,8 +724,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -774,10 +780,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -1067,11 +1073,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); @@ -1206,6 +1211,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1217,10 +1223,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1234,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1249,8 +1261,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1276,8 +1307,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) - return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard @@ -1371,13 +1400,25 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1525,23 +1566,47 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; + switch (clock_id) { + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; + } +} - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; } -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { - return HRTIMER_NORESTART; + return __hrtimer_cb_get_time(timer->base->clockid); } +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1574,41 +1639,14 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; + ACCESS_PRIVATE(timer, function) = function; } /** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: The modes which are relevant for initialization: - * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, - * HRTIMER_MODE_REL_SOFT - * - * The PINNED variants of the above can be handed in, - * but the PINNED bit is ignored as pinning happens - * when the hrtimer is started - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function @@ -1624,7 +1662,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init); void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); @@ -1643,7 +1681,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -1717,7 +1755,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -1992,7 +2030,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2002,8 +2040,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2029,8 +2067,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } @@ -2043,8 +2080,8 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); @@ -2108,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2135,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); @@ -2202,6 +2239,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2210,7 +2256,6 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; - hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2266,11 +2311,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); @@ -2286,5 +2326,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e21..7c6110e964e7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..d31a6d40d38d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +98,129 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } + +#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ) +#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ) + +static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ) +static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ) +static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t) +static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs) + +static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz, + sysctl_kern_to_user_int_conv_hz, false) +static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies, + sysctl_user_to_kern_int_conv_userhz, + sysctl_kern_to_user_int_conv_userhz, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax, + sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, true) + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_jiffies); + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) + return -EINVAL; + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_userhz_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies_minmax); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, + HZ, 1000l); +} +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -165,26 +164,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +218,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +235,16 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } out: mutex_unlock(&offset_lock); @@ -246,16 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -459,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -469,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -477,9 +478,12 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, - .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/audit.h> +#include <linux/timekeeper_internal.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - .time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data.tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. */ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { - struct ntp_data *ntpdata = &tk_ntp_data; - ktime_t ret; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -678,8 +686,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } @@ -703,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. */ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; @@ -760,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1032,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1084,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,14 +3,13 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -129,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ @@ -1548,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1590,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 881a9ce96af7..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,34 +9,27 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" -static struct kmem_cache *posix_timers_cache; - /* * Timers are managed in a hash table for lockless lookup. The hash key is * constructed from current::signal and the timer ID and the timer is @@ -46,39 +39,67 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -220,15 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX @@ -245,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -259,7 +312,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +357,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +380,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +421,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +448,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +466,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -398,6 +475,15 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; @@ -406,24 +492,21 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +517,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -452,8 +534,8 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * After successful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +544,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +600,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +615,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. @@ -538,30 +631,26 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * - * Holding rcu_read_lock() accross the lookup ensures that + * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +741,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -723,18 +798,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +812,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,11 +821,10 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) @@ -791,26 +855,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +916,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +926,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. */ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1018,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. + */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - - timer = lock_timer(timer_id, &flags); -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. - */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. @@ -1089,18 +1077,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. itimer_delete() has @@ -1246,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution @@ -1529,6 +1525,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) @@ -1545,3 +1544,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } @@ -297,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -306,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index dfe939f6e4ec..3c90574bd904 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -22,7 +22,7 @@ struct process_timer { static void process_timeout(struct timer_list *t) { - struct process_timer *timeout = from_timer(timeout, t, timer); + struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } @@ -97,10 +97,10 @@ signed long __sched schedule_timeout(signed long timeout) timer.timer.expires = expire; add_timer(&timer.timer); schedule(); - del_timer_sync(&timer.timer); + timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); + timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ed58eebb4e8f..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..7e33d3f2e889 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; @@ -509,6 +503,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +523,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +560,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. + */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9..4e4f7bbe2a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprograming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for onshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } @@ -1573,12 +1594,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/time.c b/kernel/time/time.c index 1b69caa87480..0ba8e3c50d62 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -858,6 +858,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } +EXPORT_SYMBOL_GPL(timespec64_add_safe); /** * get_timespec64 - get user's time value into kernel space diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index e6285288d765..3d2a354cfe1c 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -6,7 +6,7 @@ #include <linux/timecounter.h> void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, + struct cyclecounter *cc, u64 start_tstamp) { tc->cc = cc; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0ca85ff4fbb4..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/kobject.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> @@ -25,6 +26,8 @@ #include <linux/audit.h> #include <linux/random.h> +#include <vdso/auxclock.h> + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -53,7 +56,38 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif + +static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +147,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -164,10 +208,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. + */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +243,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -485,91 +554,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * @@ -637,7 +645,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -699,7 +707,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); @@ -714,18 +722,22 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; @@ -755,7 +767,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -768,6 +781,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /** @@ -864,8 +878,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -873,7 +887,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1009,9 +1023,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1290,7 +1309,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; @@ -1446,41 +1465,73 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - - timekeeping_forward_now(tks); + timekeeping_forward_now(tks); + if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tk_update_aux_offs(tks, offs); } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. @@ -1556,6 +1607,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -1607,6 +1660,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1683,10 +1769,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1716,7 +1804,8 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1905,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -1972,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); @@ -2068,7 +2171,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2149,7 +2252,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2215,23 +2318,22 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; - - guard(raw_spinlock_irqsave)(&tk_core.lock); + u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), - tk->tkr_mono.cycle_last, tk->tkr_mono.mask); - + tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2247,7 +2349,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2264,19 +2366,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + /* + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2307,7 +2425,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2330,7 +2448,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2409,12 +2527,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); @@ -2474,7 +2592,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2533,6 +2651,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2551,74 +2683,103 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + add_device_randomness(&ts, sizeof(ts)); + + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; + + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - s32 orig_tai, tai; + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; +} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } - } +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; - audit_ntp_log(&ad); + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); - if (clock_set) + audit_ntp_log(&result.ad); + + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. + */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function @@ -2632,3 +2793,321 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. + */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. + */ + return __do_adjtimex(aux_tkd, txc, &result); +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, +}; + +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; + + if (!tko) + return ret; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) + goto err_clean; + + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (ret) + goto err_clean; + } + return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; +} +late_initcall(tk_aux_sysfs_init); + +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 63e600e943a7..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -30,19 +30,22 @@ static inline void timekeeping_inc_mg_floor_swaps(void) #endif -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) { u64 ret = (now - last) & mask; /* - * Prevent time going backwards by checking the MSB of mask in - * the result. If set, return 0. + * Prevent time going backwards by checking the result against + * @max_delta. If greater, return 0. */ - return ret & ~(mask >> 1) ? 0 : ret; + return ret > max_delta ? 0 : ret; } /* Semi public for serialization of non timekeeper VDSO updates. */ unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5860bf6d16f..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, @@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -744,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -790,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags @@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior to calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -956,33 +914,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) @@ -1216,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the @@ -1504,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1515,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not @@ -1530,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1904,7 +1859,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1967,7 +1922,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON @@ -2019,7 +1974,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } @@ -2403,7 +2358,7 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); @@ -2518,7 +2473,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); @@ -2548,7 +2503,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2603,7 +2558,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2616,7 +2571,7 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,12 +98,10 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8d57f7686bb0..18dda1aa782d 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group *tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). + */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); } /* @@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +553,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -534,11 +567,22 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, break; child = group; - group = group->parent; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. + */ + group = READ_ONCE(group->parent); data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -564,7 +608,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); - if (!evt->ignore) { + if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } @@ -660,7 +704,7 @@ static bool tmigr_active_up(struct tmigr_group *group, * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ - group->groupevt.ignore = true; + WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } @@ -703,7 +747,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race @@ -721,6 +765,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; + bool ignore; u64 nextexp; if (child) { @@ -739,11 +784,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, nextexp = child->next_expiry; evt = &child->groupevt; - evt->ignore = (nextexp == KTIME_MAX) ? true : false; + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; + ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a @@ -769,7 +822,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ - if (evt->ignore && !remote && group->parent) + if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); @@ -783,7 +836,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; @@ -793,7 +846,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, WRITE_ONCE(group->next_expiry, KTIME_MAX); } - if (evt->ignore) { + if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or @@ -912,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -959,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1099,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not @@ -1152,7 +1196,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1391,23 +1434,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. + */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* @@ -1421,38 +1461,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1460,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } + return 0; +} + +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + return 0; } +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1494,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1541,98 +1692,116 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } -static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent, - bool activate) +static bool tmigr_init_root(struct tmigr_group *group, bool activate) { - struct tmigr_walk data; + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + return true; + } - child->parent = parent; - child->groupmask = BIT(parent->num_children++); + return false; - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); +} - trace_tmigr_connect_child_parent(child); +static void tmigr_connect_child_parent(struct tmigr_group *child, + struct tmigr_group *parent, + bool activate) +{ + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be intialized accordingly. + */ + parent->num_children = 1; + } - if (!activate) - return; + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { + /* + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. + */ + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); + } else { + /* Common case adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). - * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). */ - data.childmask = child->groupmask; + smp_store_release(&child->parent, parent); - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); + trace_tmigr_connect_child_parent(child); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. + * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || i == tmigr_hierarchy_levels || - (list_empty(&tmigr_level_list[i]) && - list_is_singular(&tmigr_level_list[i - 1]))) + if (group->parent) + break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) break; + } - } while (i < tmigr_hierarchy_levels); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1648,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1661,37 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); + if (activate) { + struct tmigr_walk data; + union tmigr_state state; - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; + /* + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. + */ + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } - tmigr_connect_child_parent(child, group, true); - } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); } + tmigr_root = group; } - +out: kfree(stack); return err; @@ -1699,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1749,6 +1956,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1798,7 +2010,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 154accc7a543..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; @@ -110,22 +110,19 @@ struct tmigr_cpu { * union tmigr_state - state of tmigr_group * @state: Combined version of the state - only used for atomic * read/cmpxchg function - * @struct: Split version of the state - only use the struct members to + * &anon struct: Split version of the state - only use the struct members to * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the begin of + * timer_migration.c. */ union tmigr_state { u32 state; - /** - * struct - split state of tmigr_group - * @active: Contains each mask bit of the active children - * @migrator: Contains mask of the child which is migrator - * @seq: Sequence counter needs to be increased when an update - * to the tmigr_state is done. It prevents a race when - * updates in the child groups are propagated in changed - * order. Detailed information about the scenario is - * given in the documentation at the begin of - * timer_migration.c. - */ struct { u8 active; u8 migrator; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..aa59919b8f2c 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,28 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) { + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) +{ + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +54,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +64,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,55 +86,95 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; + + __arch_sync_vdso_time_data(vdata); +} + +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vc->clock_mode = clock_mode; - __arch_sync_vdso_data(vdata); + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->monotonic_to_aux.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + __arch_sync_vdso_time_data(vdata); } +#endif /** * vdso_update_begin - Start of a VDSO update section @@ -150,7 +190,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +207,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } |
