author     Linus Torvalds <torvalds@linux-foundation.org>   2024-09-17 07:25:37 +0200
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-09-17 07:25:37 +0200
commit     9ea925c806dbb8fee6797f59148daaf7f648832e
tree       5380645bc7bfb06a6ed22c49b9e5ed5a13bc67f9   /kernel/time/timer.c
parent     cb69d86550b3f47be50fa5751d31ebbdb71b18ee
parent     35b603f8a78b0bd51566db277c4f7b56b3ff6bac
Merge tag 'timers-core-2024-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer updates from Thomas Gleixner:
"Core:
- Overhaul of posix-timers in preparation for removing the workaround
  for periodic timers whose signal delivery is ignored.
- Remove the historical extra jiffy in msleep()

  msleep() adds an extra jiffy to the timeout value to ensure
  minimal sleep time. The timer wheel has ensured minimal sleep time
  since the large rewrite to a non-cascading wheel, but the extra
  jiffy in msleep() remained unnoticed. Remove it. (A worked example
  of the timeout math follows the quoted log.)
- Make the timer slack handling correct for realtime tasks.
  The procfs interface is inconsistent and neither reflects reality
  nor conforms to the man page. Show the correct 0 slack for
  realtime tasks and enforce it at the core level instead of having
  inconsistent individual checks in various timer setup functions.
  (A userspace sketch follows the quoted log.)
- The usual set of updates and enhancements all over the place.
Drivers:
- Allow the ACPI PM timer to be turned off during suspend
- No new drivers
- The usual updates and enhancements in various drivers"
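To make the msleep() change concrete, here is a small userspace sketch of the
timeout arithmetic. It is an illustration only: HZ is assumed to be 100, and
the msecs_to_jiffies() model below is a simplified round-up, not the kernel's
actual implementation.

#include <stdio.h>

#define HZ 100  /* assumed tick rate: one jiffy = 10 ms */

/* Simplified round-up model of the kernel's msecs_to_jiffies(). */
static unsigned long msecs_to_jiffies(unsigned int msecs)
{
        return (msecs + (1000 / HZ) - 1) / (1000 / HZ);
}

int main(void)
{
        unsigned int msecs = 20;
        unsigned long j = msecs_to_jiffies(msecs);

        /* New msleep(): 2 jiffies, i.e. at least the requested 20 ms. */
        printf("msleep(%u) now sleeps %lu jiffies\n", msecs, j);

        /* Old msleep(): the historical "+ 1" inflated this to 3 jiffies
         * (up to 30 ms), although the non-cascading timer wheel already
         * guarantees the minimal sleep time on its own. */
        printf("msleep(%u) used to sleep %lu jiffies\n", msecs, j + 1);
        return 0;
}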
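The timer-slack item can likewise be observed from userspace. A hedged sketch,
based only on the changelog above: it reads the real /proc/self/timerslack_ns
file before and after switching the task to SCHED_FIFO; with this series the
file is expected to show 0 for the realtime task, enforced at the core level.
Switching the scheduling policy requires CAP_SYS_NICE.

#include <stdio.h>
#include <sched.h>

/* Read this task's current timer slack from procfs. */
static long read_slack(void)
{
        FILE *f = fopen("/proc/self/timerslack_ns", "r");
        long ns = -1;

        if (f) {
                if (fscanf(f, "%ld", &ns) != 1)
                        ns = -1;
                fclose(f);
        }
        return ns;
}

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        printf("slack (SCHED_OTHER): %ld ns\n", read_slack());

        /* After this series, the procfs view for a realtime task is 0. */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == 0)
                printf("slack (SCHED_FIFO): %ld ns\n", read_slack());
        else
                perror("sched_setscheduler");

        return 0;
}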
* tag 'timers-core-2024-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
ntp: Make sure RTC is synchronized when time goes backwards
treewide: Fix wrong singular form of jiffies in comments
cpu: Use already existing usleep_range()
timers: Rename next_expiry_recalc() to be unique
platform/x86:intel/pmc: Fix comment for the pmc_core_acpi_pm_timer_suspend_resume function
clocksource/drivers/jcore: Use request_percpu_irq()
clocksource/drivers/cadence-ttc: Add missing clk_disable_unprepare in ttc_setup_clockevent
clocksource/drivers/asm9260: Add missing clk_disable_unprepare in asm9260_timer_init
clocksource/drivers/qcom: Add missing iounmap() on errors in msm_dt_timer_init()
clocksource/drivers/ingenic: Use devm_clk_get_enabled() helpers
platform/x86:intel/pmc: Enable the ACPI PM Timer to be turned off when suspended
clocksource: acpi_pm: Add external callback for suspend/resume
clocksource/drivers/arm_arch_timer: Using for_each_available_child_of_node_scoped()
dt-bindings: timer: rockchip: Add rk3576 compatible
timers: Annotate possible non critical data race of next_expiry
timers: Remove historical extra jiffie for timeout in msleep()
hrtimer: Use and report correct timerslack values for realtime tasks
hrtimer: Annotate hrtimer_cpu_base_.*_expiry() for sparse.
timers: Add sparse annotation for timer_sync_wait_running().
signal: Replace BUG_ON()s
...
Diffstat (limited to 'kernel/time/timer.c')
-rw-r--r--   kernel/time/timer.c   62
1 file changed, 48 insertions(+), 14 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 760bbeb1f331..0fc9d066a7be 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -365,7 +365,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
 	rem = j % HZ;
 
 	/*
-	 * If the target jiffie is just after a whole second (which can happen
+	 * If the target jiffy is just after a whole second (which can happen
 	 * due to delays of the timer irq, long irq off times etc etc) then
 	 * we should round down to the whole second, not up. Use 1/4th second
 	 * as cutoff for this rounding as an extreme upper bound for this.
@@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 		 * Set the next expiry time and kick the CPU so it
 		 * can reevaluate the wheel:
 		 */
-		base->next_expiry = bucket_expiry;
+		WRITE_ONCE(base->next_expiry, bucket_expiry);
 		base->timers_pending = true;
 		base->next_expiry_recalc = false;
 		trigger_dyntick_cpu(base, timer);
@@ -1561,6 +1561,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
  * the waiter to acquire the lock and make progress.
  */
 static void timer_sync_wait_running(struct timer_base *base)
+	__releases(&base->lock) __releases(&base->expiry_lock)
+	__acquires(&base->expiry_lock) __acquires(&base->lock)
 {
 	if (atomic_read(&base->timer_waiters)) {
 		raw_spin_unlock_irq(&base->lock);
@@ -1898,7 +1900,7 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
  *
  * Store next expiry time in base->next_expiry.
  */
-static void next_expiry_recalc(struct timer_base *base)
+static void timer_recalc_next_expiry(struct timer_base *base)
 {
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
@@ -1928,7 +1930,7 @@ static void next_expiry_recalc(struct timer_base *base)
 	 * bits are zero, we look at the next level as is. If not we
 	 * need to advance it by one because that's going to be the
 	 * next expiring bucket in that level. base->clk is the next
-	 * expiring jiffie. So in case of:
+	 * expiring jiffy. So in case of:
 	 *
 	 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
 	 *  0    0    0    0    0    0
@@ -1964,7 +1966,7 @@ static void next_expiry_recalc(struct timer_base *base)
 		clk += adj;
 	}
 
-	base->next_expiry = next;
+	WRITE_ONCE(base->next_expiry, next);
 	base->next_expiry_recalc = false;
 	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
 }
@@ -1993,7 +1995,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 		return basem;
 
 	/*
-	 * Round up to the next jiffie. High resolution timers are
+	 * Round up to the next jiffy. High resolution timers are
 	 * off, so the hrtimers are expired in the tick and we need to
 	 * make sure that this tick really expires the timer to avoid
 	 * a ping pong of the nohz stop code.
@@ -2007,7 +2009,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
 					   unsigned long basej)
 {
 	if (base->next_expiry_recalc)
-		next_expiry_recalc(base);
+		timer_recalc_next_expiry(base);
 
 	/*
 	 * Move next_expiry for the empty base into the future to prevent an
@@ -2018,7 +2020,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
 	 * easy comparable to find out which base holds the first pending timer.
 	 */
 	if (!base->timers_pending)
-		base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
+		WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
 
 	return base->next_expiry;
 }
@@ -2252,7 +2254,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
 						     base_global, &tevt);
 
 	/*
-	 * If the next event is only one jiffie ahead there is no need to call
+	 * If the next event is only one jiffy ahead there is no need to call
 	 * timer migration hierarchy related functions. The value for the next
 	 * global timer in @tevt struct equals then KTIME_MAX. This is also
 	 * true, when the timer base is idle.
@@ -2411,7 +2413,7 @@ static inline void __run_timers(struct timer_base *base)
 		 * jiffies to avoid endless requeuing to current jiffies.
 		 */
 		base->clk++;
-		next_expiry_recalc(base);
+		timer_recalc_next_expiry(base);
 
 		while (levels--)
 			expire_timers(base, heads + levels);
@@ -2462,8 +2464,40 @@ static void run_local_timers(void)
 	hrtimer_run_queues();
 
 	for (int i = 0; i < NR_BASES; i++, base++) {
-		/* Raise the softirq only if required. */
-		if (time_after_eq(jiffies, base->next_expiry) ||
+		/*
+		 * Raise the softirq only if required.
+		 *
+		 * timer_base::next_expiry can be written by a remote CPU while
+		 * holding the lock. If this write happens at the same time than
+		 * the lockless local read, sanity checker could complain about
+		 * data corruption.
+		 *
+		 * There are two possible situations where
+		 * timer_base::next_expiry is written by a remote CPU:
+		 *
+		 * 1. Remote CPU expires global timers of this CPU and updates
+		 *    timer_base::next_expiry of BASE_GLOBAL afterwards in
+		 *    next_timer_interrupt() or timer_recalc_next_expiry(). The
+		 *    worst outcome is a superfluous raise of the timer softirq
+		 *    when the not yet updated value is read.
+		 *
+		 * 2. A new first pinned timer is enqueued by a remote CPU
+		 *    and therefore timer_base::next_expiry of BASE_LOCAL is
+		 *    updated. When this update is missed, this isn't a
+		 *    problem, as an IPI is executed nevertheless when the CPU
+		 *    was idle before. When the CPU wasn't idle but the update
+		 *    is missed, then the timer would expire one jiffy late -
+		 *    bad luck.
+		 *
+		 * Those unlikely corner cases where the worst outcome is only a
+		 * one jiffy delay or a superfluous raise of the softirq are
+		 * not that expensive as doing the check always while holding
+		 * the lock.
+		 *
+		 * Possible remote writers are using WRITE_ONCE(). Local reader
+		 * uses therefore READ_ONCE().
+		 */
+		if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
 		    (i == BASE_DEF && tmigr_requires_handle_remote())) {
 			raise_softirq(TIMER_SOFTIRQ);
 			return;
@@ -2730,7 +2764,7 @@ void __init init_timers(void)
  */
 void msleep(unsigned int msecs)
 {
-	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+	unsigned long timeout = msecs_to_jiffies(msecs);
 
 	while (timeout)
 		timeout = schedule_timeout_uninterruptible(timeout);
@@ -2744,7 +2778,7 @@ EXPORT_SYMBOL(msleep);
  */
 unsigned long msleep_interruptible(unsigned int msecs)
 {
-	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+	unsigned long timeout = msecs_to_jiffies(msecs);
 
 	while (timeout && !signal_pending(current))
 		timeout = schedule_timeout_interruptible(timeout);
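The WRITE_ONCE()/READ_ONCE() conversions in the diff follow a standard kernel
pattern for a field that is written under a lock but read locklessly: marking
both sides tells sanitizers (e.g. KCSAN) that the race is intentional and
benign rather than corruption. A minimal sketch of that pattern; demo_base,
demo_set_expiry() and demo_expired() are hypothetical stand-ins for the real
timer_base code:

#include <linux/jiffies.h>
#include <linux/spinlock.h>

struct demo_base {
        raw_spinlock_t lock;
        unsigned long  next_expiry;
};

/* Writer side: serialized by the lock, marked for lockless readers. */
static void demo_set_expiry(struct demo_base *b, unsigned long expiry)
{
        raw_spin_lock(&b->lock);
        WRITE_ONCE(b->next_expiry, expiry);
        raw_spin_unlock(&b->lock);
}

/* Reader side: lockless. A stale value costs at most a superfluous or
 * one-jiffy-late softirq raise, as the run_local_timers() comment above
 * explains, which is cheaper than always taking the lock. */
static bool demo_expired(struct demo_base *b, unsigned long now)
{
        return time_after_eq(now, READ_ONCE(b->next_expiry));
}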
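Similarly, the __releases()/__acquires() lines added to
timer_sync_wait_running() are sparse lock-context annotations: the function
drops and retakes both locks, and the markers let sparse (run via "make C=1")
verify that lock and unlock calls stay balanced across it. A hedged sketch of
the idiom with an illustrative struct, not the real kernel code:

#include <linux/spinlock.h>

struct demo_locks {
        raw_spinlock_t lock;
        spinlock_t     expiry_lock;
};

static void demo_drop_locks(struct demo_locks *d)
        __releases(&d->lock) __releases(&d->expiry_lock)
        __acquires(&d->expiry_lock) __acquires(&d->lock)
{
        raw_spin_unlock_irq(&d->lock);
        spin_unlock(&d->expiry_lock);

        /* Another context can acquire the locks and make progress here. */

        spin_lock(&d->expiry_lock);
        raw_spin_lock_irq(&d->lock);
}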