diff options
Diffstat (limited to 'kernel/time')
-rw-r--r-- | kernel/time/Kconfig | 9 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 15 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 216 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 4 | ||||
-rw-r--r-- | kernel/time/sched_clock.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 21 | ||||
-rw-r--r-- | kernel/time/timekeeping_internal.h | 11 | ||||
-rw-r--r-- | kernel/time/timer.c | 3 | ||||
-rw-r--r-- | kernel/time/vsyscall.c | 41 |
11 files changed, 279 insertions, 47 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..95b6a708b040 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -373,7 +377,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + /* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. */ -void run_posix_cpu_timers(void) +void __init posix_cputimers_init_work(void) { - struct task_struct *tsk = current; - struct k_itimer *timer, *next; - unsigned long flags; - LIST_HEAD(firing); + init_task_work(¤t->posix_cputimers_work.work, + posix_cpu_timers_work); +} - lockdep_assert_irqs_disabled(); +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; } + /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. */ - check_thread_timers(tsk, &firing); + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) +{ + struct k_itimer *timer, *next; + unsigned long flags, start; + LIST_HEAD(firing); + + if (!lock_task_sighand(tsk, &flags)) + return; - check_process_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); + + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - lockdep_posixtimer_exit(); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + __run_posix_cpu_timers(tsk); } /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 07709ac30439..bf540f5a4115 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -439,12 +439,12 @@ static struct pid *good_sigevent(sigevent_t * event) rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_NONE: return pid; default: diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e51778c312f1..36d7464c8962 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -381,7 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; - /* fall through */ + fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c47f388a83f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset @@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include <linux/clocksource.h> +#include <linux/spinlock.h> #include <linux/time.h> +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ae5029f984a8..a50364df1054 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -666,7 +666,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } @@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include <vdso/helpers.h> #include <vdso/vsyscall.h> +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} |