Diffstat (limited to 'kernel/time/hrtimer.c')
| Mode | Path | Lines changed |
|------------|-----------------------|------|
| -rw-r--r-- | kernel/time/hrtimer.c | 1702 |

1 file changed, 1122 insertions, 580 deletions
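Note (not part of the patch): the hunk below introduces MASK_SHIFT, HRTIMER_ACTIVE_HARD, HRTIMER_ACTIVE_SOFT and HRTIMER_ACTIVE_ALL to split cpu_base->active into hard- and softirq-expiry clock bases, and later uses a __next_base()/for_each_active_base() loop to walk whichever subset of bases is active. The standalone sketch below mirrors that mask arithmetic so the bit layout is easy to verify; the HRTIMER_BASE_* enum values and the use of __builtin_ctz() in place of the kernel's __ffs() are assumptions made so the sketch builds in userspace, not the kernel's own definitions.

```c
/*
 * Standalone sketch of the soft/hard base masks from the patch below.
 * Assumptions: the eight-base enum layout (four hard bases followed by
 * their soft counterparts) and __builtin_ctz() standing in for __ffs().
 */
#include <stdio.h>

enum {
	HRTIMER_BASE_MONOTONIC,
	HRTIMER_BASE_REALTIME,
	HRTIMER_BASE_BOOTTIME,
	HRTIMER_BASE_TAI,
	HRTIMER_BASE_MONOTONIC_SOFT,
	HRTIMER_BASE_REALTIME_SOFT,
	HRTIMER_BASE_BOOTTIME_SOFT,
	HRTIMER_BASE_TAI_SOFT,
	HRTIMER_MAX_CLOCK_BASES,
};

/* Same definitions as in the hunk below. */
#define MASK_SHIFT		(HRTIMER_BASE_MONOTONIC_SOFT)
#define HRTIMER_ACTIVE_HARD	((1U << MASK_SHIFT) - 1)
#define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

int main(void)
{
	/* Pretend a hard MONOTONIC timer and a soft REALTIME timer are queued. */
	unsigned int active_bases = (1U << HRTIMER_BASE_MONOTONIC) |
				    (1U << HRTIMER_BASE_REALTIME_SOFT);
	unsigned int active;

	printf("HARD mask 0x%02x, SOFT mask 0x%02x, ALL 0x%02x\n",
	       HRTIMER_ACTIVE_HARD, HRTIMER_ACTIVE_SOFT, HRTIMER_ACTIVE_ALL);

	/* Consume the mask the way __next_base() does: peel off the lowest set bit. */
	active = active_bases & HRTIMER_ACTIVE_ALL;
	while (active) {
		unsigned int idx = __builtin_ctz(active); /* stand-in for __ffs() */

		active &= ~(1U << idx);
		printf("clock_base[%u] is active (%s expiry)\n", idx,
		       idx >= MASK_SHIFT ? "softirq" : "hardirq");
	}
	return 0;
}
```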
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88f75f92ef36..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,34 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/hrtimer.c - * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. - * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing + * In contrast to the low-resolution timeout API, aka timer wheel, + * hrtimers provide finer resolution and accuracy depending on system + * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: - * based on kernel/timer.c + * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. - * - * For licencing details see kernel-base/COPYING */ #include <linux/cpu.h> @@ -37,10 +28,8 @@ #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> -#include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/tick.h> -#include <linux/seq_file.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> @@ -49,6 +38,7 @@ #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> +#include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> @@ -60,6 +50,18 @@ #include "tick-internal.h" /* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + +static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); + +/* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index @@ -70,41 +72,51 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, - } + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + }, + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} /* * Functions and macros which are different for UP/SMP systems are kept in a @@ -118,8 +130,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .seq = SEQCNT_ZERO(migration_cpu_base), - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -139,11 +154,12 @@ static struct hrtimer_cpu_base migration_cpu_base = { static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -156,45 +172,60 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { -#ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires; - if (!new_base->cpu_base->hres_active) - return 0; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will be issued to reprogram + * it as a last resort. 
+ */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires <= new_base->cpu_base->expires_next; -#else - return 0; -#endif -} -#ifdef CONFIG_NO_HZ_COMMON -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return base; - return &per_cpu(hrtimer_bases, get_nohz_timer_target()); + return expires >= new_base->cpu_base->expires_next; } -#else -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) + +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif return base; } -#endif /* * We switch the timer base to a power-optimized selected CPU target, @@ -235,22 +266,21 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -262,6 +292,7 @@ again: static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; @@ -297,7 +328,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) div >>= 1; } tmp >>= sft; - do_div(tmp, (unsigned long) div); + do_div(tmp, (u32) div); return dclc < 0 ? 
-tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); @@ -324,11 +355,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr hrtimer_debug_descr; +static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -359,7 +390,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + fallthrough; default: return false; } @@ -383,7 +414,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr hrtimer_debug_descr = { +static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, @@ -396,31 +427,21 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer) -{ - debug_object_activate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { - debug_object_deactivate(timer, &hrtimer_debug_descr); + debug_object_init_on_stack(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_free(struct hrtimer *timer) +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { - debug_object_free(timer, &hrtimer_debug_descr); + debug_object_activate(timer, &hrtimer_debug_descr); } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); + debug_object_deactivate(timer, &hrtimer_debug_descr); } -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { @@ -429,23 +450,32 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else + static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { - debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); + debug_hrtimer_init_on_stack(timer); + trace_hrtimer_setup(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) +{ + debug_hrtimer_activate(timer, mode); + trace_hrtimer_start(timer, mode); } static inline void 
debug_deactivate(struct hrtimer *timer) @@ -454,35 +484,57 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *timer) +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { -#ifdef CONFIG_HIGH_RES_TIMERS - cpu_base->next_timer = timer; -#endif + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; } -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, + ktime_t expires_next) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; - ktime_t expires, expires_next = KTIME_MAX; + struct hrtimer_clock_base *base; + ktime_t expires; - hrtimer_update_next_timer(cpu_base, NULL); - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; - if (!(active & 0x01)) - continue; - next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; - hrtimer_update_next_timer(cpu_base, timer); + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; } } /* @@ -494,7 +546,83 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) expires_next = 0; return expires_next; } -#endif + +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work. + * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE_ALL, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. 
+ */ +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +{ + unsigned int active; + struct hrtimer *next_timer = NULL; + ktime_t expires_next = KTIME_MAX; + + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } + + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); + } + + return expires_next; +} + +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ + ktime_t expires_next, soft = KTIME_MAX; + + /* + * If the soft interrupt has already been activated, ignore the + * soft bases. They will be handled in the already raised soft + * interrupt. + */ + if (!cpu_base->softirq_activated) { + soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + /* + * Update the soft expiry time. clock_settime() might have + * affected it. + */ + cpu_base->softirq_expires_next = soft; + } + + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * If a softirq timer is expiring first, update cpu_base->next_timer + * and program the hardware with the soft expiry time. + */ + if (expires_next > soft) { + cpu_base->next_timer = cpu_base->softirq_next_timer; + expires_next = soft; + } + + return expires_next; +} static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { @@ -502,8 +630,70 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(&base->clock_was_set_seq, + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); + + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; + + return now; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; +} + +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) +{ + cpu_base->expires_next = expires_next; + + /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectively block all timers until the T2 event + * fires. 
+ */ + if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -535,56 +725,72 @@ static inline int hrtimer_is_hres_enabled(void) } /* - * Is the high resolution mode active ? + * Switch to high resolution mode */ -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +static void hrtimer_switch_to_hres(void) { - return cpu_base->hres_active; -} + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); -static inline int hrtimer_hres_active(void) -{ - return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); + if (tick_init_highres()) { + pr_warn("Could not switch to high resolution mode on CPU %u\n", + base->cpu); + return; + } + base->hres_active = 1; + hrtimer_resolution = HIGH_RES_NSEC; + + tick_setup_sched_timer(true); + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); } +#else + +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ /* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. + * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +static void retrigger_next_event(void *arg) { - ktime_t expires_next; - - if (!cpu_base->hres_active) - return; - - expires_next = __hrtimer_get_next_event(cpu_base); - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - - cpu_base->expires_next = expires_next; + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* - * If a hang was detected in the last timer interrupt then we - * leave the hang delay active in the hardware. We want the - * system to make progress. That also prevents the following - * scenario: - * T1 expires 50ms from now - * T2 expires 5s from now + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. * - * T1 is removed, so this code is called and would reprogram - * the hardware to 5s from now. Any hrtimer_start after that - * will not reprogram the hardware due to hang_detected being - * set. So we'd effectivly block all timers until the T2 event - * fires. + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. 
The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (cpu_base->hang_detected) - return; - - tick_program_event(cpu_base->expires_next, 1); + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); } /* @@ -594,177 +800,207 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * * Called with interrupts disabled and base->cpu_base.lock held */ -static void hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + + /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; + if (expires >= cpu_base->expires_next) + return; + /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; + cpu_base->next_timer = timer; + + __hrtimer_reprogram(cpu_base, timer, expires); +} + +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; + /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Set it to 0. + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. + * + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. 
*/ - if (expires < 0) - expires = 0; + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; + /* + * If the sequence did not change over the update then the + * remote CPU already handled it. + */ + if (seq == cpu_base->clock_was_set_seq) + return false; /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. */ - if (cpu_base->hang_detected) - return; + if (cpu_base->in_hrtirq) + return false; /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. + * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. */ - cpu_base->expires_next = expires; - tick_program_event(expires, 1); -} + active &= cpu_base->active_bases; -/* - * Initialize the high resolution related parts of cpu_base - */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) -{ - base->expires_next = KTIME_MAX; - base->hres_active = 0; + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases */ + if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; } /* - * Retrigger next event is called after clock was set + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). * - * Called with interrupts disabled via on_each_cpu() + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. 
*/ -static void retrigger_next_event(void *arg) +void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); + cpumask_var_t mask; + int cpu; - if (!base->hres_active) - return; + if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) + goto out_timerfd; - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } -/* - * Switch to high resolution mode - */ -static void hrtimer_switch_to_hres(void) -{ - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + unsigned long flags; - if (tick_init_highres()) { - printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", base->cpu); - return; + cpu_base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } - base->hres_active = 1; - hrtimer_resolution = HIGH_RES_NSEC; - tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ - retrigger_next_event(NULL); + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); + +out_timerfd: + timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { - clock_was_set(); + clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } -#else - -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } -static inline int hrtimer_hres_active(void) { return 0; } -static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline void hrtimer_switch_to_hres(void) { } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void retrigger_next_event(void *arg) { } - -#endif /* CONFIG_HIGH_RES_TIMERS */ - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. - */ -void clock_was_set(void) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -#endif - timerfd_clock_was_set(); -} - /* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. 
+ * Called during resume either directly from via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. */ -void hrtimers_resume(void) +void hrtimers_resume_local(void) { - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - + lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); } /* @@ -772,26 +1008,29 @@ void hrtimers_resume(void) */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** - * hrtimer_forward - forward the timer expiry + * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. - * Returns the number of overruns. * - * Can be safely called from the callback function of @timer. If - * called from other contexts @timer must neither be enqueued nor - * running the callback and the caller needs to take care of - * serialization. + * .. note:: + * This only updates the timer expiry value and does not requeue the timer. * - * Note: This only updates the timer expiry value and does not requeue - * the timer. + * There is also a variant of the function hrtimer_forward_now(). + * + * Context: Can be safely called from the callback function of @timer. If called + * from other contexts @timer must neither be enqueued nor running the + * callback and the caller needs to take care of serialization. + * + * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { @@ -834,16 +1073,18 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { - debug_activate(timer); + debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -865,36 +1106,37 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); -#ifdef CONFIG_HIGH_RES_TIMERS /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. 
So the worst thing what can happen is - * an superflous call to hrtimer_force_reprogram() on the + * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); -#endif } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; - int reprogram; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -907,8 +1149,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -923,7 +1173,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) @@ -932,51 +1182,146 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int leftmost; + ktime_t expires; - base = lock_hrtimer_base(timer, &flags); + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! 
+ */ + if (expires == KTIME_MAX) + return; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *new_base; + bool force_local, first; + + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_base; + force_local &= base->cpu_base->next_timer == timer; + + /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } - leftmost = enqueue_hrtimer(timer, new_base); - if (!leftmost) - goto unlock; + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; - if (!hrtimer_is_hres_active(timer)) { /* - * Kick to reschedule the next tick to handle the new timer - * on dynticks target. + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. */ - if (new_base->cpu_base->nohz_active) - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else { - hrtimer_reprogram(timer, new_base); + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; } -unlock: + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. 
+ */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer, true); + unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); @@ -986,9 +1331,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * @timer: hrtimer to stop * * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently executing the callback function and + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) @@ -1009,7 +1355,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); @@ -1018,6 +1364,107 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ + spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) + __acquires(&base->softirq_expiry_lock) +{ + spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) + __releases(&base->softirq_expiry_lock) +{ + spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reacquire it. That + * allows the waiter to acquire the lock and make progress. 
+ */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, + unsigned long flags) +{ + if (atomic_read(&cpu_base->timer_waiters)) { + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + spin_unlock(&cpu_base->softirq_expiry_lock); + spin_lock(&cpu_base->softirq_expiry_lock); + raw_spin_lock_irq(&cpu_base->lock); + } +} + +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling hrtimer_cancel() can + * lead to two issues: + * + * - If the caller is on a remote CPU then it has to spin wait for the timer + * handler to complete. This can result in unbound priority inversion. + * + * - If the caller originates from the task which preempted the timer + * handler on the same CPU, then spin waiting for the timer handler to + * complete is never going to end. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ + /* Lockless read. Prevent the compiler from reloading it below */ + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + + /* + * Just relax if the timer expires in hard interrupt context or if + * it is currently on the migration base. + */ + if (!timer->is_soft || is_migration_base(base)) { + cpu_relax(); + return; + } + + /* + * Mark the base as contended and grab the expiry lock, which is + * held by the softirq across the timer callback. Drop the lock + * immediately so the softirq can expire the next timer. In theory + * the timer could already be running again, but that's more than + * unlikely and just causes another wait loop. + */ + atomic_inc(&base->cpu_base->timer_waiters); + spin_lock_bh(&base->cpu_base->softirq_expiry_lock); + atomic_dec(&base->cpu_base->timer_waiters); + spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, + unsigned long flags) { } +#endif + /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
* @timer: the timer to be cancelled @@ -1028,18 +1475,20 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); */ int hrtimer_cancel(struct hrtimer *timer) { - for (;;) { - int ret = hrtimer_try_to_cancel(timer); + int ret; - if (ret >= 0) - return ret; - cpu_relax(); - } + do { + ret = hrtimer_try_to_cancel(timer); + + if (ret < 0) + hrtimer_cancel_wait_running(timer); + } while (ret < 0); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** - * hrtimer_get_remaining - get remaining time for the timer + * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ @@ -1073,8 +1522,41 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base); + if (!hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1084,47 +1566,125 @@ u64 hrtimer_get_next_event(void) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; + switch (clock_id) { + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; + } +} - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); } - WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly + * marked for hard interrupt expiry mode are moved into soft + * interrupt context for latency reasons and because the callbacks + * can invoke functions which might sleep on RT, e.g. spin_lock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) + softtimer = true; + memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; - base = hrtimer_clockid_to_base(clock_id); + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); + + if (WARN_ON_ONCE(!function)) + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; + else + ACCESS_PRIVATE(timer, function) = function; } /** - * hrtimer_init - initialize a timer to the given clock + * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized + * @function: the callback function * @clock_id: the clock to be used - * @mode: timer mode abs/rel + * @mode: The modes which are relevant for initialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started + */ +void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_setup(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup); + +/** + * hrtimer_setup_on_stack - initialize a timer on stack memory + * @timer: The timer to be initialized + * @function: the callback function + * @clock_id: The clock to be used + * @mode: The timer mode + * + * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack + * memory. 
*/ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); } -EXPORT_SYMBOL_GPL(hrtimer_init); +EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the @@ -1135,19 +1695,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ bool hrtimer_active(const struct hrtimer *timer) { - struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; unsigned int seq; do { - cpu_base = READ_ONCE(timer->base->cpu_base); - seq = raw_read_seqcount_begin(&cpu_base->seq); + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + base->running == timer) return true; - } while (read_seqcount_retry(&cpu_base->seq, seq) || - cpu_base != READ_ONCE(timer->base->cpu_base)); + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); return false; } @@ -1167,33 +1727,35 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * insufficient for that. * * The sequence numbers are required because otherwise we could still observe - * a false negative if the read side got smeared over multiple consequtive + * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now) + struct hrtimer *timer, ktime_t *now, + unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); + bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - cpu_base->running = timer; + base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -1204,15 +1766,19 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer->is_rel = false; /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. 
*/ - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + expires_in_hardirq = lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and @@ -1225,33 +1791,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); - WARN_ON_ONCE(cpu_base->running != timer); - cpu_base->running = NULL; + WARN_ON_ONCE(base->running != timer); + base->running = NULL; } -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; + struct hrtimer_clock_base *base; + unsigned int active = cpu_base->active_bases & active_mask; - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; - if (!(active & 0x01)) - continue; - basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1264,7 +1828,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search - * Tree, which can answer a stabbing querry for + * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. 
			 * We don't add extra wakeups by delaying timers that
@@ -1274,11 +1838,32 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 			if (basenow < hrtimer_get_softexpires_tv64(timer))
 				break;
 
-			__run_hrtimer(cpu_base, base, timer, &basenow);
+			__run_hrtimer(cpu_base, base, timer, &basenow, flags);
+			if (active_mask == HRTIMER_ACTIVE_SOFT)
+				hrtimer_sync_wait_running(cpu_base, flags);
 		}
 	}
 }
 
+static __latent_entropy void hrtimer_run_softirq(void)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	unsigned long flags;
+	ktime_t now;
+
+	hrtimer_cpu_base_lock_expiry(cpu_base);
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+	now = hrtimer_update_base(cpu_base);
+	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
+
+	cpu_base->softirq_activated = 0;
+	hrtimer_update_softirq_timer(cpu_base, true);
+
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+	hrtimer_cpu_base_unlock_expiry(cpu_base);
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1289,13 +1874,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t expires_next, now, entry_time, delta;
+	unsigned long flags;
 	int retries = 0;
 
 	BUG_ON(!cpu_base->hres_active);
 	cpu_base->nr_events++;
 	dev->next_event = KTIME_MAX;
 
-	raw_spin_lock(&cpu_base->lock);
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 	entry_time = now = hrtimer_update_base(cpu_base);
 retry:
 	cpu_base->in_hrtirq = 1;
@@ -1308,17 +1894,23 @@ retry:
	 */
 	cpu_base->expires_next = KTIME_MAX;
 
-	__hrtimer_run_queues(cpu_base, now);
+	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+		cpu_base->softirq_expires_next = KTIME_MAX;
+		cpu_base->softirq_activated = 1;
+		raise_timer_softirq(HRTIMER_SOFTIRQ);
+	}
+
+	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
 
-	/* Reevaluate the clock bases for the next expiry */
-	expires_next = __hrtimer_get_next_event(cpu_base);
+	/* Reevaluate the clock bases for the [soft] next expiry */
+	expires_next = hrtimer_update_next_event(cpu_base);
 	/*
	 * Store the new expiry value so the migration code can verify
	 * against it.
	 */
 	cpu_base->expires_next = expires_next;
 	cpu_base->in_hrtirq = 0;
-	raw_spin_unlock(&cpu_base->lock);
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
 	/* Reprogramming necessary ? */
 	if (!tick_program_event(expires_next, 0)) {
@@ -1339,7 +1931,7 @@ retry:
	 * Acquire base lock for updating the offsets and retrieving
	 * the current time.
	 */
-	raw_spin_lock(&cpu_base->lock);
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 	now = hrtimer_update_base(cpu_base);
 	cpu_base->nr_retries++;
 	if (++retries < 3)
@@ -1352,7 +1944,8 @@ retry:
	 */
 	cpu_base->nr_hangs++;
 	cpu_base->hang_detected = 1;
-	raw_spin_unlock(&cpu_base->lock);
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
 	delta = ktime_sub(now, entry_time);
 	if ((unsigned int)delta > cpu_base->max_hang_time)
 		cpu_base->max_hang_time = (unsigned int) delta;
@@ -1365,28 +1958,9 @@ retry:
 	else
 		expires_next = ktime_add(now, delta);
 	tick_program_event(expires_next, 1);
-	printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
-		    ktime_to_ns(delta));
-}
-
-/* called with interrupts disabled */
-static inline void __hrtimer_peek_ahead_timers(void)
-{
-	struct tick_device *td;
-
-	if (!hrtimer_hres_active())
-		return;
-
-	td = this_cpu_ptr(&tick_cpu_device);
-	if (td && td->evtdev)
-		hrtimer_interrupt(td->evtdev);
+	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
 }
-
-#else /* CONFIG_HIGH_RES_TIMERS */
-
-static inline void __hrtimer_peek_ahead_timers(void) { }
-
-#endif /* !CONFIG_HIGH_RES_TIMERS */
+#endif /* !CONFIG_HIGH_RES_TIMERS */
 
 /*
  * Called from run_local_timers in hardirq context every jiffy
@@ -1394,9 +1968,10 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	unsigned long flags;
 	ktime_t now;
 
-	if (__hrtimer_hres_active(cpu_base))
+	if (hrtimer_hres_active(cpu_base))
 		return;
 
 	/*
@@ -1411,10 +1986,17 @@ void hrtimer_run_queues(void)
 		return;
 	}
 
-	raw_spin_lock(&cpu_base->lock);
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 	now = hrtimer_update_base(cpu_base);
-	__hrtimer_run_queues(cpu_base, now);
-	raw_spin_unlock(&cpu_base->lock);
+
+	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+		cpu_base->softirq_expires_next = KTIME_MAX;
+		cpu_base->softirq_activated = 1;
+		raise_timer_softirq(HRTIMER_SOFTIRQ);
+	}
+
+	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 }
 
 /*
@@ -1433,19 +2015,82 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+/**
+ * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
+ * @sl: sleeper to be started
+ * @mode: timer mode abs/rel
+ *
+ * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
+ * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
+ */
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
+				   enum hrtimer_mode mode)
 {
-	sl->timer.function = hrtimer_wakeup;
-	sl->task = task;
+	/*
	 * Make the enqueue delivery mode check work on RT. If the sleeper
	 * was initialized for hard interrupt delivery, force the mode bit.
	 * This is a special case for hrtimer_sleepers because
	 * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
	 * fiddling with this decision is avoided at the call sites.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
+		mode |= HRTIMER_MODE_HARD;
+
+	hrtimer_start_expires(&sl->timer, mode);
 }
-EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
+EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
+
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
+				    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	/*
+	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
+	 * marked for hard interrupt expiry mode are moved into soft
+	 * interrupt context either for latency reasons or because the
+	 * hrtimer callback takes regular spinlocks or invokes other
+	 * functions which are not suitable for hard interrupt context on
+	 * PREEMPT_RT.
+	 *
+	 * The hrtimer_sleeper callback is RT compatible in hard interrupt
+	 * context, but there is a latency concern: Untrusted userspace can
+	 * spawn many threads which arm timers for the same expiry time on
+	 * the same CPU. That causes a latency spike due to the wakeup of
+	 * a gazillion threads.
+	 *
+	 * OTOH, privileged real-time user space applications rely on the
+	 * low latency of hard interrupt wakeups. If the current task is in
+	 * a real-time scheduling class, mark the mode for hard interrupt
+	 * expiry.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
+			mode |= HRTIMER_MODE_HARD;
+	}
+
+	__hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
+	sl->task = current;
+}
+
+/**
+ * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
+ * @sl: sleeper to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
+				    clockid_t clock_id, enum hrtimer_mode mode)
+{
+	debug_setup_on_stack(&sl->timer, clock_id, mode);
+	__hrtimer_setup_sleeper(sl, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
 
 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
 	switch(restart->nanosleep.type) {
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
 	case TT_COMPAT:
-		if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
+		if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
 			return -EFAULT;
 		break;
 #endif
@@ -1463,14 +2108,12 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 {
 	struct restart_block *restart;
 
-	hrtimer_init_sleeper(t, current);
-
 	do {
-		set_current_state(TASK_INTERRUPTIBLE);
-		hrtimer_start_expires(&t->timer, mode);
+		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+		hrtimer_sleeper_start_expires(t, mode);
 
 		if (likely(t->task))
-			freezable_schedule();
+			schedule();
 
 		hrtimer_cancel(&t->timer);
 		mode = HRTIMER_MODE_ABS;
@@ -1501,29 +2144,22 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 	struct hrtimer_sleeper t;
 	int ret;
 
-	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
-				HRTIMER_MODE_ABS);
-	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
-
+	hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
+	hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
 	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
 }
 
-long hrtimer_nanosleep(const struct timespec64 *rqtp,
-		       const enum hrtimer_mode mode, const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
+		       const clockid_t clockid)
 {
 	struct restart_block *restart;
 	struct hrtimer_sleeper t;
 	int ret = 0;
-	u64 slack;
-
-	slack = current->timer_slack_ns;
-	if (dl_task(current) || rt_task(current))
-		slack = 0;
-
-	hrtimer_init_on_stack(&t.timer, clockid, mode);
-	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
+	hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
+	hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
 	ret = do_nanosleep(&t, mode);
 	if (ret != -ERESTART_RESTARTBLOCK)
 		goto out;
@@ -1535,16 +2171,18 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
 	}
 
 	restart = &current->restart_block;
-	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
-	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+	restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
+	set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
 }
 
-SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
-		struct timespec __user *, rmtp)
+#ifdef CONFIG_64BIT
+
+SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
+		struct __kernel_timespec __user *, rmtp)
 {
 	struct timespec64 tu;
 
@@ -1554,27 +2192,33 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 	if (!timespec64_valid(&tu))
 		return -EINVAL;
 
+	current->restart_block.fn = do_no_restart_syscall;
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
-	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+				 CLOCK_MONOTONIC);
 }
 
-#ifdef CONFIG_COMPAT
+#endif
 
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
-		       struct compat_timespec __user *, rmtp)
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
+SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
+		struct old_timespec32 __user *, rmtp)
 {
 	struct timespec64 tu;
 
-	if (compat_get_timespec64(&tu, rqtp))
+	if (get_old_timespec32(&tu, rqtp))
 		return -EFAULT;
 
 	if (!timespec64_valid(&tu))
 		return -EINVAL;
 
+	current->restart_block.fn = do_no_restart_syscall;
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
-	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+				 CLOCK_MONOTONIC);
 }
 #endif
 
@@ -1587,12 +2231,31 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	int i;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		cpu_base->clock_base[i].cpu_base = cpu_base;
-		timerqueue_init_head(&cpu_base->clock_base[i].active);
+		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+		clock_b->cpu_base = cpu_base;
+		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+		timerqueue_init_head(&clock_b->active);
 	}
 
 	cpu_base->cpu = cpu;
-	hrtimer_init_hres(cpu_base);
+	hrtimer_cpu_base_init_expiry_lock(cpu_base);
+	return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+	/* Clear out any left over state from a CPU down operation */
+	cpu_base->active_bases = 0;
+	cpu_base->hres_active = 0;
+	cpu_base->hang_detected = 0;
+	cpu_base->next_timer = NULL;
+	cpu_base->softirq_next_timer = NULL;
+	cpu_base->expires_next = KTIME_MAX;
+	cpu_base->softirq_expires_next = KTIME_MAX;
+	cpu_base->online = 1;
 	return 0;
 }
 
@@ -1624,39 +2287,37 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
	 * sort out already expired timers and reprogram the
	 * event device.
	 */
-		enqueue_hrtimer(timer, new_base);
+		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
 	}
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
+	int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
+	old_base = this_cpu_ptr(&hrtimer_bases);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
 
-	local_irq_disable();
-	old_base = &per_cpu(hrtimer_bases, scpu);
-	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
	 * The caller is globally serialized and nobody else
	 * takes two locks at once, deadlock is not possible.
	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
 				     &new_base->clock_base[i]);
 	}
 
-	raw_spin_unlock(&old_base->lock);
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
+	raw_spin_unlock(&new_base->lock);
+	old_base->online = 0;
+	raw_spin_unlock(&old_base->lock);
 
-	/* Check, if we got expired work to do */
-	__hrtimer_peek_ahead_timers();
-	local_irq_enable();
 	return 0;
 }
 
@@ -1665,125 +2326,6 @@ int hrtimers_dead_cpu(unsigned int scpu)
 void __init hrtimers_init(void)
 {
 	hrtimers_prepare_cpu(smp_processor_id());
+	hrtimers_cpu_starting(smp_processor_id());
+	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }
-
-/**
- * schedule_hrtimeout_range_clock - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
- */
-int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
-			       const enum hrtimer_mode mode, int clock)
-{
-	struct hrtimer_sleeper t;
-
-	/*
-	 * Optimize when a zero timeout value is given. It does not
-	 * matter whether this is an absolute or a relative time.
-	 */
-	if (expires && *expires == 0) {
-		__set_current_state(TASK_RUNNING);
-		return 0;
-	}
-
-	/*
-	 * A NULL parameter means "infinite"
-	 */
-	if (!expires) {
-		schedule();
-		return -EINTR;
-	}
-
-	hrtimer_init_on_stack(&t.timer, clock, mode);
-	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
-
-	hrtimer_init_sleeper(&t, current);
-
-	hrtimer_start_expires(&t.timer, mode);
-
-	if (likely(t.task))
-		schedule();
-
-	hrtimer_cancel(&t.timer);
-	destroy_hrtimer_on_stack(&t.timer);
-
-	__set_current_state(TASK_RUNNING);
-
-	return !t.task ? 0 : -EINTR;
-}
-
-/**
- * schedule_hrtimeout_range - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- *
- * Make the current task sleep until the given expiry time has
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
- * The kernel give the normal best effort behavior for "@expires+@delta",
- * but may decide to fire the timer earlier, but no earlier than @expires.
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Returns 0 when the timer has expired. If the task was woken before the
- * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
- * by an explicit wakeup, it returns -EINTR.
- */
-int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
-				     const enum hrtimer_mode mode)
-{
-	return schedule_hrtimeout_range_clock(expires, delta, mode,
-					      CLOCK_MONOTONIC);
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
-
-/**
- * schedule_hrtimeout - sleep until timeout
- * @expires: timeout value (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- *
- * Make the current task sleep until the given expiry time has
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process()).
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task or the current task is explicitly woken
- * up.
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Returns 0 when the timer has expired. If the task was woken before the
- * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
- * by an explicit wakeup, it returns -EINTR.
- */
-int __sched schedule_hrtimeout(ktime_t *expires,
-			       const enum hrtimer_mode mode)
-{
-	return schedule_hrtimeout_range(expires, 0, mode);
-}
-EXPORT_SYMBOL_GPL(schedule_hrtimeout);
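
Editor's note: the first hunk in this section illustrates the direction of the interface change in the diff: the expiry callback is now passed to the setup helper instead of being assigned to timer->function by the caller (the field is read via ACCESS_PRIVATE() in __run_hrtimer() above). The sketch below shows how an out-of-file user of a plain (non-stack) hrtimer would look in that style; it assumes a hrtimer_setup() wrapper around __hrtimer_setup() analogous to hrtimer_setup_on_stack() shown in the diff, and the example_* names are illustrative, not part of this commit.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_tick;
static ktime_t example_period;

/* Expiry callback; runs in hardirq context unless HRTIMER_MODE_SOFT was requested */
static enum hrtimer_restart example_tick_fn(struct hrtimer *timer)
{
	/* do a small amount of work, then rearm relative to now */
	hrtimer_forward_now(timer, example_period);
	return HRTIMER_RESTART;
}

static void example_tick_start(void)
{
	example_period = ms_to_ktime(10);
	/* hrtimer_setup() assumed: the non-stack sibling of hrtimer_setup_on_stack() above */
	hrtimer_setup(&example_tick, example_tick_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_start(&example_tick, example_period, HRTIMER_MODE_REL);
}

static void example_tick_stop(void)
{
	hrtimer_cancel(&example_tick);
}

Passing the callback at setup time lets the core pick the soft or hard expiry base consistently (see the HRTIMER_MODE_HARD/SOFT handling in the hunks above) rather than trusting a later assignment to a now-private field.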
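Editor's note: the sleeper hunks replace the old open-coded pattern (hrtimer_init_on_stack() plus hrtimer_init_sleeper()) with hrtimer_setup_sleeper_on_stack() and hrtimer_sleeper_start_expires(), both exported above. A minimal sleep-until-deadline helper built on that pair, mirroring the do_nanosleep() loop in the diff, could look like the sketch below; the helper name is illustrative and the snippet assumes a context where sleeping is allowed.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * Illustrative helper (not part of the diff): sleep until @deadline on
 * CLOCK_MONOTONIC with @slack_ns of expiry slack. Returns true if the
 * timer expired, false if the task was woken early.
 */
static bool example_sleep_until(ktime_t deadline, u64 slack_ns)
{
	struct hrtimer_sleeper t;

	hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_set_expires_range_ns(&t.timer, deadline, slack_ns);

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS);

	/* as in do_nanosleep() above, t.task is cleared once the timer fired */
	if (likely(t.task))
		schedule();

	hrtimer_cancel(&t.timer);
	destroy_hrtimer_on_stack(&t.timer);

	__set_current_state(TASK_RUNNING);
	return t.task == NULL;
}

On PREEMPT_RT, __hrtimer_setup_sleeper() above decides between soft and hard interrupt expiry for the caller, which is why the start helper rather than the caller re-applies HRTIMER_MODE_HARD when the sleeper was set up for hard delivery.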
