diff options
Diffstat (limited to 'kernel/time/posix-timers.c')
-rw-r--r-- | kernel/time/posix-timers.c | 778 |
1 files changed, 457 insertions, 321 deletions
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bf540f5a4115..1b675aee99a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -35,20 +35,17 @@ #include "timekeeping.h" #include "posix-timers.h" -/* - * Management arrays for POSIX timers. Timers are now kept in static hash table - * with 512 entries. - * Timer ids are allocated by local routine, which selects proper hash head by - * key, constructed from current->signal address and per signal struct counter. - * This keeps timer ids unique per process, but now they can intersect between - * processes. - */ +static struct kmem_cache *posix_timers_cache; /* - * Lets keep our timers in a slab cache :-) + * Timers are managed in a hash table for lockless lookup. The hash key is + * constructed from current::signal and the timer ID and the timer is + * matched against current::signal and the timer ID when walking the hash + * bucket list. + * + * This allows checkpoint/restore to reconstruct the exact timer IDs for + * a process. */ -static struct kmem_cache *posix_timers_cache; - static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); @@ -56,52 +53,12 @@ static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ +/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. - * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). - */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -121,9 +78,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, - lockdep_is_held(&hash_lock)) { - if ((timer->it_signal == sig) && (timer->it_id == id)) + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + /* timer->it_signal can be set concurrently */ + if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; @@ -140,25 +97,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id) static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; struct hlist_head *head; - int ret = -ENOENT; + unsigned int cnt, id; - do { + /* + * FIXME: Replace this by a per signal struct xarray once there is + * a plan to handle the resulting CRIU regression gracefully. + */ + for (cnt = 0; cnt <= INT_MAX; cnt++) { spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + id = sig->next_posix_timer_id; + + /* Write the next ID back. Clamp it to the positive space */ + sig->next_posix_timer_id = (id + 1) & INT_MAX; + + head = &posix_timers_hashtable[hash(sig, id)]; + if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; + spin_unlock(&hash_lock); + return id; } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) @@ -166,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* Get clock_realtime */ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); @@ -178,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock) return ktime_get_real(); } -/* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { @@ -191,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, return do_adjtimex(t); } -/* - * Get monotonic time for posix timers - */ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); @@ -206,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) return ktime_get(); } -/* - * Get monotonic-raw time for posix timers - */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); @@ -216,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) return 0; } - static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); @@ -267,14 +220,11 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); @@ -283,11 +233,12 @@ __initcall(init_posix_timers); * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX */ -static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +static inline int timer_overrun_to_int(struct k_itimer *timr) { - s64 sum = timr->it_overrun_last + (s64)baseval; + if (timr->it_overrun_last > (s64)INT_MAX) + return INT_MAX; - return sum > (s64)INT_MAX ? INT_MAX : (int)sum; + return (int)timr->it_overrun_last; } static void common_hrtimer_rearm(struct k_itimer *timr) @@ -299,133 +250,78 @@ static void common_hrtimer_rearm(struct k_itimer *timr) hrtimer_restart(timer); } +static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) +{ + guard(spinlock)(&timr->it_lock); + + /* + * Check if the timer is still alive or whether it got modified + * since the signal was queued. In either case, don't rearm and + * drop the signal. + */ + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + return false; + + if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) + return true; + + timr->kclock->timer_rearm(timr); + timr->it_status = POSIX_TIMER_ARMED; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_signal_seq; + info->si_overrun = timer_overrun_to_int(timr); + return true; +} + /* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. + * This function is called from the signal delivery code. It decides + * whether the signal should be dropped and rearms interval timers. The + * timer can be unconditionally accessed as there is a reference held on + * it. */ -void posixtimer_rearm(struct kernel_siginfo *info) +bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { - struct k_itimer *timr; - unsigned long flags; + struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq); + bool ret; - timr = lock_timer(info->si_tid, &flags); - if (!timr) - return; + /* + * Release siglock to ensure proper locking order versus + * timr::it_lock. Keep interrupts disabled. + */ + spin_unlock(¤t->sighand->siglock); - if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { - timr->kclock->timer_rearm(timr); + ret = __posixtimer_deliver_signal(info, timr); - timr->it_active = 1; - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1LL; - ++timr->it_requeue_pending; + /* Drop the reference which was acquired when the signal was queued */ + posixtimer_putref(timr); - info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); - } - - unlock_timer(timr, flags); + spin_lock(¤t->sighand->siglock); + return ret; } -int posix_timer_event(struct k_itimer *timr, int si_private) +void posix_timer_queue_signal(struct k_itimer *timr) { - enum pid_type type; - int ret = -1; - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->posixtimer_rearm(). - * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls posixtimer_rearm(). - * We re-queue ->sigq and drop ->it_lock(). - * posixtimer_rearm() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; + lockdep_assert_held(&timr->it_lock); - type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type); - /* If we failed to send the signal the timer stops. */ - return ret > 0; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; + posixtimer_send_sigqueue(timr); } /* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + * This function gets called when a POSIX.1b interval timer expires from + * the HRTIMER interrupt (soft interrupt on RT kernels). + * + * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI + * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { - struct k_itimer *timr; - unsigned long flags; - int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; - - timr = container_of(timer, struct k_itimer, it.real.timer); - spin_lock_irqsave(&timr->it_lock, flags); + struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); - timr->it_active = 0; - if (timr->it_interval != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - if (timr->it_interval != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. - * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. - */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = NSEC_PER_SEC / HZ; - - if (timr->it_interval < kj) - now = ktime_add(now, kj); - } -#endif - timr->it_overrun += hrtimer_forward(timer, now, - timr->it_interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - timr->it_active = 1; - } - } - - unlock_timer(timr, flags); - return ret; + guard(spinlock_irqsave)(&timr->it_lock); + posix_timer_queue_signal(timr); + return HRTIMER_NORESTART; } static struct pid *good_sigevent(sigevent_t * event) @@ -452,40 +348,35 @@ static struct pid *good_sigevent(sigevent_t * event) } } -static struct k_itimer * alloc_posix_timer(void) +static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) return tmr; - if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + + if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } - clear_siginfo(&tmr->sigq->info); + rcuref_init(&tmr->rcuref, 1); return tmr; } -static void k_itimer_rcu_free(struct rcu_head *head) +void posixtimer_free_timer(struct k_itimer *tmr) { - struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); - - kmem_cache_free(posix_timers_cache, tmr); + put_pid(tmr->it_pid); + if (tmr->sigq.ucounts) + dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING); + kfree_rcu(tmr, rcu); } -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&hash_lock, flags); - hlist_del_rcu(&tmr->t_hash); - spin_unlock_irqrestore(&hash_lock, flags); - } - put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); - call_rcu(&tmr->rcu, k_itimer_rcu_free); + spin_lock(&hash_lock); + hlist_del_rcu(&tmr->t_hash); + spin_unlock(&hash_lock); + posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) @@ -501,7 +392,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - int it_id_set = IT_ID_NOT_SET; if (!kc) return -EINVAL; @@ -513,13 +403,18 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EAGAIN; spin_lock_init(&new_timer->it_lock); + + /* + * Add the timer to the hash table. The timer is not yet valid + * because new_timer::it_signal is still NULL. The timer id is also + * not yet visible to user space. + */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { - error = new_timer_id; - goto out; + posixtimer_free_timer(new_timer); + return new_timer_id; } - it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; @@ -534,43 +429,51 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } new_timer->it_sigev_notify = event->sigev_notify; - new_timer->sigq->info.si_signo = event->sigev_signo; - new_timer->sigq->info.si_value = event->sigev_value; + new_timer->sigq.info.si_signo = event->sigev_signo; + new_timer->sigq.info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->sigq->info.si_signo = SIGALRM; - memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); - new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->sigq.info.si_signo = SIGALRM; + memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; + if (new_timer->it_sigev_notify & SIGEV_THREAD_ID) + new_timer->it_pid_type = PIDTYPE_PID; + else + new_timer->it_pid_type = PIDTYPE_TGID; + + new_timer->sigq.info.si_tid = new_timer->it_id; + new_timer->sigq.info.si_code = SI_TIMER; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } - + /* + * After succesful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal is still NULL. + * + * Complete the initialization with the clock specific create + * callback. + */ error = kc->timer_create(new_timer); if (error) goto out; spin_lock_irq(¤t->sighand->siglock); - new_timer->it_signal = current->signal; - list_add(&new_timer->list, ¤t->signal->posix_timers); + /* This makes the timer valid in the hash table */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); - - return 0; /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. + * After unlocking sighand::siglock @new_timer is subject to + * concurrent removal and cannot be touched anymore */ + return 0; out: - release_posix_timer(new_timer, it_id_set); + posix_timer_unhash_and_free(new_timer); return error; } @@ -604,13 +507,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; @@ -622,10 +518,42 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) if ((unsigned long long)timer_id > INT_MAX) return NULL; + /* + * The hash lookup and the timers are RCU protected. + * + * Timers are added to the hash in invalid state where + * timr::it_signal == NULL. timer::it_signal is only set after the + * rest of the initialization succeeded. + * + * Timer destruction happens in steps: + * 1) Set timr::it_signal to NULL with timr::it_lock held + * 2) Release timr::it_lock + * 3) Remove from the hash under hash_lock + * 4) Put the reference count. + * + * The reference count might not drop to zero if timr::sigq is + * queued. In that case the signal delivery or flush will put the + * last reference count. + * + * When the reference count reaches zero, the timer is scheduled + * for RCU removal after the grace period. + * + * Holding rcu_read_lock() across the lookup ensures that + * the timer cannot be freed. + * + * The lookup validates locklessly that timr::it_signal == + * current::it_signal and timr::it_id == @timer_id. timr::it_id + * can't change, but timr::it_signal becomes NULL during + * destruction. + */ rcu_read_lock(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); + /* + * Validate under timr::it_lock that timr::it_signal is + * still valid. Pairs with #1 above. + */ if (timr->it_signal == current->signal) { rcu_read_unlock(); return timr; @@ -652,20 +580,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) } /* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. + * Get the time remaining on a POSIX.1b interval timer. + * + * Two issues to handle here: * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. + * 1) The timer has a requeue pending. The return value must appear as + * if the timer has been requeued right now. * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. + * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued + * into the hrtimer queue and therefore never expired. Emulate expiry + * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { @@ -679,10 +603,14 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - } else if (!timr->it_active) { + } else if (timr->it_status == POSIX_TIMER_DISARMED) { /* - * SIGEV_NONE oneshot timers are never queued. Check them - * below. + * SIGEV_NONE oneshot timers are never queued and therefore + * timr->it_status is always DISARMED. The check below + * vs. remaining time will handle this case. + * + * For all other timers there is nothing to update here, so + * return. */ if (!sig_none) return; @@ -691,18 +619,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) now = kc->clock_get_ktime(timr->it_clock); /* - * When a requeue is pending or this is a SIGEV_NONE timer move the - * expiry time forward by intervals, so expiry is > now. + * If this is an interval timer and either has requeue pending or + * is a SIGEV_NONE timer move the expiry time forward by intervals, + * so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); - /* Return 0 only, when the timer is expired and not pending */ + /* + * As @now is retrieved before a possible timer_forward() and + * cannot be reevaluated by the compiler @remaining is based on the + * same @now value. Therefore @remaining is consistent vs. @now. + * + * Consequently all interval timers, i.e. @iv > 0, cannot have a + * remaining time <= 0 because timer_forward() guarantees to move + * them forward so that the next timer expiry is > @now. + */ if (remaining <= 0) { /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; @@ -711,11 +650,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } } -/* Get the time remaining on a POSIX.1b interval timer. */ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct k_itimer *timr; const struct k_clock *kc; + struct k_itimer *timr; unsigned long flags; int ret = 0; @@ -765,26 +703,35 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, #endif -/* - * Get the number of overruns of a POSIX.1b interval timer. This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to posixtimer_rearm(). So all we need to do is - * to pick up the frozen overrun. +/** + * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer + * @timer_id: The timer ID which identifies the timer + * + * The "overrun count" of a timer is one plus the number of expiration + * intervals which have elapsed between the first expiry, which queues the + * signal and the actual signal delivery. On signal delivery the "overrun + * count" is calculated and cached, so it can be returned directly here. + * + * As this is relative to the last queued signal the returned overrun count + * is meaningless outside of the signal delivery path and even there it + * does not accurately reflect the current state when user space evaluates + * it. + * + * Returns: + * -EINVAL @timer_id is invalid + * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; - int overrun; unsigned long flags; + int overrun; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - overrun = timer_overrun_to_int(timr, 0); + overrun = timer_overrun_to_int(timr); unlock_timer(timr, flags); return overrun; @@ -831,10 +778,18 @@ static void common_timer_wait_running(struct k_itimer *timer) } /* - * On PREEMPT_RT this prevent priority inversion against softirq kthread in - * case it gets preempted while executing a timer callback. See comments in - * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a - * cpu_relax(). + * On PREEMPT_RT this prevents priority inversion and a potential livelock + * against the ksoftirqd thread in case that ksoftirqd gets preempted while + * executing a hrtimer callback. + * + * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this + * just results in a cpu_relax(). + * + * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is + * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this + * prevents spinning on an eventually scheduled out task and a livelock + * when the task which tries to delete or disarm the timer has preempted + * the task which runs the expiry in task work context. */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) @@ -846,6 +801,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, rcu_read_lock(); unlock_timer(timer, *flags); + /* + * kc->timer_wait_running() might drop RCU lock. So @timer + * cannot be touched anymore after the function returns! + */ if (!WARN_ON_ONCE(!kc->timer_wait_running)) kc->timer_wait_running(timer); @@ -854,6 +813,21 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, return lock_timer(timer_id, flags); } +/* + * Set up the new interval and reset the signal delivery data + */ +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting) +{ + if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec) + timer->it_interval = timespec64_to_ktime(new_setting->it_interval); + else + timer->it_interval = 0; + + /* Reset overrun accounting */ + timer->it_overrun_last = 0; + timer->it_overrun = -1LL; +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -866,8 +840,6 @@ int common_timer_set(struct k_itimer *timr, int flags, if (old_setting) common_timer_get(timr, old_setting); - /* Prevent rearming by clearing the interval */ - timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. @@ -875,23 +847,21 @@ int common_timer_set(struct k_itimer *timr, int flags, if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; - timr->it_active = 0; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; + timr->it_status = POSIX_TIMER_DISARMED; + posix_timer_set_common(timr, new_setting); - /* Switch off the timer when it_value is zero */ + /* Keep timer disarmed when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - timr->it_active = !sigev_none; + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; return 0; } @@ -902,7 +872,7 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; - int error = 0; + int error; if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) @@ -916,6 +886,12 @@ retry: if (!timr) return -EINVAL; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -939,8 +915,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { - struct itimerspec64 new_spec, old_spec; - struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; + struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) @@ -949,6 +924,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; + rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) @@ -984,17 +960,31 @@ int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; - timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; return 0; } +/* + * If the deleted timer is on the ignored list, remove it and + * drop the associated reference. + */ +static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) +{ + if (!hlist_unhashed(&tmr->ignored_list)) { + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } +} + static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; + /* Prevent signal delivery and rearming. */ + timer->it_signal_seq++; + if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); @@ -1019,48 +1009,110 @@ retry_delete: } spin_lock(¤t->sighand->siglock); - list_del(&timer->list); - spin_unlock(¤t->sighand->siglock); + hlist_del(&timer->list); + posix_timer_cleanup_ignored(timer); /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). + * A concurrent lookup could check timer::it_signal lockless. It + * will reevaluate with timer::it_lock held and observe the NULL. + * + * It must be written with siglock held so that the signal code + * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), + * which prevents it from moving a pending signal of a deleted + * timer to the ignore list. */ - timer->it_signal = NULL; + WRITE_ONCE(timer->it_signal, NULL); + spin_unlock(¤t->sighand->siglock); unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); + posix_timer_unhash_and_free(timer); return 0; } /* - * return timer owned by the process, used by exit_itimers + * Delete a timer if it is armed, remove it from the hash and schedule it + * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { -retry_delete: - spin_lock_irq(&timer->it_lock); + unsigned long flags; + /* + * irqsave is required to make timer_wait_running() work. + */ + spin_lock_irqsave(&timer->it_lock, flags); + +retry_delete: + /* + * Even if the timer is not longer accessible from other tasks + * it still might be armed and queued in the underlying timer + * mechanism. Worse, that timer mechanism might run the expiry + * function concurrently. + */ if (timer_delete_hook(timer) == TIMER_RETRY) { - spin_unlock_irq(&timer->it_lock); + /* + * Timer is expired concurrently, prevent livelocks + * and pointless spinning on RT. + * + * timer_wait_running() drops timer::it_lock, which opens + * the possibility for another task to delete the timer. + * + * That's not possible here because this is invoked from + * do_exit() only for the last thread of the thread group. + * So no other task can access and delete that timer. + */ + if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) + return; + goto retry_delete; } - list_del(&timer->list); + hlist_del(&timer->list); + + posix_timer_cleanup_ignored(timer); + + /* + * Setting timer::it_signal to NULL is technically not required + * here as nothing can access the timer anymore legitimately via + * the hash table. Set it to NULL nevertheless so that all deletion + * paths are consistent. + */ + WRITE_ONCE(timer->it_signal, NULL); - spin_unlock_irq(&timer->it_lock); - release_posix_timer(timer, IT_ID_SET); + spin_unlock_irqrestore(&timer->it_lock, flags); + posix_timer_unhash_and_free(timer); } /* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. */ -void exit_itimers(struct signal_struct *sig) +void exit_itimers(struct task_struct *tsk) { - struct k_itimer *tmr; + struct hlist_head timers; + + if (hlist_empty(&tsk->signal->posix_timers)) + return; + + /* Protect against concurrent read via /proc/$PID/timers */ + spin_lock_irq(&tsk->sighand->siglock); + hlist_move_list(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + /* The timers are not longer accessible via tsk::signal */ + while (!hlist_empty(&timers)) + itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); - itimer_delete(tmr); + /* + * There should be no timers on the ignored list. itimer_delete() has + * mopped them up. + */ + if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) + return; + + hlist_move_list(&tsk->signal->ignored_posix_timers, &timers); + while (!hlist_empty(&timers)) { + posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer, + ignored_list)); } } @@ -1076,6 +1128,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (get_timespec64(&new_tp, tp)) return -EFAULT; + /* + * Permission checks have to be done inside the clock specific + * setter callback. + */ return kc->clock_set(which_clock, &new_tp); } @@ -1126,6 +1182,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +/** + * sys_clock_getres - Get the resolution of a clock + * @which_clock: The clock to get the resolution for + * @tp: Pointer to a a user space timespec64 for storage + * + * POSIX defines: + * + * "The clock_getres() function shall return the resolution of any + * clock. Clock resolutions are implementation-defined and cannot be set by + * a process. If the argument res is not NULL, the resolution of the + * specified clock shall be stored in the location pointed to by res. If + * res is NULL, the clock resolution is not returned. If the time argument + * of clock_settime() is not a multiple of res, then the value is truncated + * to a multiple of res." + * + * Due to the various hardware constraints the real resolution can vary + * wildly and even change during runtime when the underlying devices are + * replaced. The kernel also can use hardware devices with different + * resolutions for reading the time and for arming timers. + * + * The kernel therefore deviates from the POSIX spec in various aspects: + * + * 1) The resolution returned to user space + * + * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, + * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW + * the kernel differentiates only two cases: + * + * I) Low resolution mode: + * + * When high resolution timers are disabled at compile or runtime + * the resolution returned is nanoseconds per tick, which represents + * the precision at which timers expire. + * + * II) High resolution mode: + * + * When high resolution timers are enabled the resolution returned + * is always one nanosecond independent of the actual resolution of + * the underlying hardware devices. + * + * For CLOCK_*_ALARM the actual resolution depends on system + * state. When system is running the resolution is the same as the + * resolution of the other clocks. During suspend the actual + * resolution is the resolution of the underlying RTC device which + * might be way less precise than the clockevent device used during + * running state. + * + * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution + * returned is always nanoseconds per tick. + * + * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution + * returned is always one nanosecond under the assumption that the + * underlying scheduler clock has a better resolution than nanoseconds + * per tick. + * + * For dynamic POSIX clocks (PTP devices) the resolution returned is + * always one nanosecond. + * + * 2) Affect on sys_clock_settime() + * + * The kernel does not truncate the time which is handed in to + * sys_clock_settime(). The kernel internal timekeeping is always using + * nanoseconds precision independent of the clocksource device which is + * used to read the time from. The resolution of that device only + * affects the presicion of the time returned by sys_clock_gettime(). + * + * Returns: + * 0 Success. @tp contains the resolution + * -EINVAL @which_clock is not a valid clock ID + * -EFAULT Copying the resolution to @tp faulted + * -ENODEV Dynamic POSIX clock is not backed by a device + * -EOPNOTSUPP Dynamic POSIX clock does not support getres() + */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { @@ -1191,8 +1320,8 @@ SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, err = do_clock_adjtime(which_clock, &ktx); - if (err >= 0) - err = put_old_timex32(utp, &ktx); + if (err >= 0 && put_old_timex32(utp, &ktx)) + return -EFAULT; return err; } @@ -1217,7 +1346,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, #endif /* - * nanosleep for monotonic and realtime clocks + * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) @@ -1229,8 +1358,13 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +/* + * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME + * + * Absolute nanosleeps for these clocks are time-namespace adjusted. + */ static int common_nsleep_timens(const clockid_t which_clock, int flags, - const struct timespec64 *rqtp) + const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); @@ -1261,6 +1395,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; @@ -1288,6 +1423,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; |