From 511885d7061eda3eb1faf3f57dcc936ff75863f1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 24 Jul 2019 08:23:23 -0700 Subject: lib/timerqueue: Rely on rbtree semantics for next timer Simplify the timerqueue code by using cached rbtrees and rely on the tree leftmost node semantics to get the timer with earliest expiration time. This is a drop-in conversion, and therefore semantics remain untouched. The runtime overhead of cached rbtrees is pretty much the same as the current head->next method, noting that when removing the leftmost node, a common operation for the timerqueue, the rb_next(leftmost) is O(1) as well, so the next timer will either be the right node or its parent. Therefore no extra pointer chasing. Finally, the size of the struct timerqueue_head remains the same. Passes several hours of rcutorture. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190724152323.bojciei3muvfxalm@linux-r8p5 --- include/linux/timerqueue.h | 13 ++++++------- lib/timerqueue.c | 30 ++++++++++++------------------ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 78b8cc73f12f..aff122f1062a 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -12,8 +12,7 @@ struct timerqueue_node { }; struct timerqueue_head { - struct rb_root head; - struct timerqueue_node *next; + struct rb_root_cached rb_root; }; @@ -29,13 +28,14 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * @head: head of timerqueue * - * Returns a pointer to the timer node that has the - * earliest expiration time. + * Returns a pointer to the timer node that has the earliest expiration time. */ static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { - return head->next; + struct rb_node *leftmost = rb_first_cached(&head->rb_root); + + return rb_entry(leftmost, struct timerqueue_node, node); } static inline void timerqueue_init(struct timerqueue_node *node) @@ -45,7 +45,6 @@ static inline void timerqueue_init(struct timerqueue_node *node) static inline void timerqueue_init_head(struct timerqueue_head *head) { - head->head = RB_ROOT; - head->next = NULL; + head->rb_root = RB_ROOT_CACHED; } #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/lib/timerqueue.c b/lib/timerqueue.c index bc7e64df27df..c52710964593 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -26,9 +26,10 @@ */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { - struct rb_node **p = &head->head.rb_node; + struct rb_node **p = &head->rb_root.rb_root.rb_node; struct rb_node *parent = NULL; - struct timerqueue_node *ptr; + struct timerqueue_node *ptr; + bool leftmost = true; /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); @@ -36,19 +37,17 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) while (*p) { parent = *p; ptr = rb_entry(parent, struct timerqueue_node, node); - if (node->expires < ptr->expires) + if (node->expires < ptr->expires) { p = &(*p)->rb_left; - else + } else { p = &(*p)->rb_right; + leftmost = false; + } } rb_link_node(&node->node, parent, p); - rb_insert_color(&node->node, &head->head); + rb_insert_color_cached(&node->node, &head->rb_root, leftmost); - if (!head->next || node->expires < head->next->expires) { - head->next = node; - return true; - } - return false; + return leftmost; } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -65,15 +64,10 @@ bool
timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); - /* update next pointer */ - if (head->next == node) { - struct rb_node *rbn = rb_next(&node->node); - - head->next = rb_entry_safe(rbn, struct timerqueue_node, node); - } - rb_erase(&node->node, &head->head); + rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); - return head->next != NULL; + + return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); -- cgit From b74494872555d1f7888dfd9225700a363f4a84fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 20:30:49 +0200 Subject: hrtimer: Remove task argument from hrtimer_init_sleeper() All callers hand in 'current' and that's the only task pointer which actually makes sense. Remove the task argument and set current in the function. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.791885290@linutronix.de --- block/blk-mq.c | 2 +- drivers/staging/android/vsoc.c | 2 +- include/linux/hrtimer.h | 3 +-- include/linux/wait.h | 2 +- kernel/futex.c | 2 +- kernel/time/hrtimer.c | 8 ++++---- net/core/pktgen.c | 2 +- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b038ec680e84..5f647cb8c695 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3418,7 +3418,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&hs.timer, kt); - hrtimer_init_sleeper(&hs, current); + hrtimer_init_sleeper(&hs); do { if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c index 00a1ec7b9154..ce480bcf20d2 100644 --- a/drivers/staging/android/vsoc.c +++ b/drivers/staging/android/vsoc.c @@ -442,7 +442,7 @@ static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg) hrtimer_set_expires_range_ns(&to->timer, wake_time, current->timer_slack_ns); - hrtimer_init_sleeper(to, current); + hrtimer_init_sleeper(to); } while (1) { diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4971100a8cab..3c74f89367c4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -463,8 +463,7 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - struct task_struct *tsk); +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); diff --git a/include/linux/wait.h b/include/linux/wait.h index b6f77cf60dd7..d57832774ca6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -489,7 +489,7 @@ do { \ struct hrtimer_sleeper __t; \ \ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \ - hrtimer_init_sleeper(&__t, current); \ + hrtimer_init_sleeper(&__t); \ if ((timeout) != KTIME_MAX) \ hrtimer_start_range_ns(&__t.timer, timeout, \ current->timer_slack_ns, \ diff --git a/kernel/futex.c b/kernel/futex.c index 6d50728ef2e7..5e9842ea4012 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -490,7 +490,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? 
CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - hrtimer_init_sleeper(timeout, current); + hrtimer_init_sleeper(timeout); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..de895d86800c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1639,10 +1639,10 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl) { sl->timer.function = hrtimer_wakeup; - sl->task = task; + sl->task = current; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1669,7 +1669,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t, current); + hrtimer_init_sleeper(t); do { set_current_state(TASK_INTERRUPTIBLE); @@ -1930,7 +1930,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_init_sleeper(&t, current); + hrtimer_init_sleeper(&t); hrtimer_start_expires(&t.timer, mode); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bb9915291644..7f3cf2381f27 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2171,7 +2171,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } while (ktime_compare(end_time, spin_until) < 0); } else { /* see do_nanosleep */ - hrtimer_init_sleeper(&t, current); + hrtimer_init_sleeper(&t); do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); -- cgit From dbc1625fc9deefb352f6ff26a575ae4b3ddef23a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:50 +0200 Subject: hrtimer: Consolidate hrtimer_init() + hrtimer_init_sleeper() calls hrtimer_init_sleeper() calls require prior initialization of the hrtimer object which is embedded into the hrtimer_sleeper. Combine the initialization and spare a function call. Fix up all call sites. This is also a preparatory change for PREEMPT_RT to do hrtimer sleeper specific initializations of the embedded hrtimer without modifying any of the call sites. No functional change. [ anna-maria: Minor cleanups ] [ tglx: Adapted to the removal of the task argument of hrtimer_init_sleeper() and trivial polishing.
Folded a fix from Stephen Rothwell for the vsoc code ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.887468908@linutronix.de --- block/blk-mq.c | 3 +-- drivers/staging/android/vsoc.c | 6 ++---- include/linux/hrtimer.h | 17 ++++++++++++++--- include/linux/wait.h | 4 ++-- kernel/futex.c | 8 +++----- kernel/time/hrtimer.c | 43 +++++++++++++++++++++++++++++++----------- net/core/pktgen.c | 4 +--- 7 files changed, 55 insertions(+), 30 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5f647cb8c695..df3fafbfe9a9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3415,10 +3415,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, kt = nsecs; mode = HRTIMER_MODE_REL; - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&hs.timer, kt); - hrtimer_init_sleeper(&hs); do { if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c index ce480bcf20d2..2d6b3981afb8 100644 --- a/drivers/staging/android/vsoc.c +++ b/drivers/staging/android/vsoc.c @@ -437,12 +437,10 @@ static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg) return -EINVAL; wake_time = ktime_set(arg->wake_time_sec, arg->wake_time_nsec); - hrtimer_init_on_stack(&to->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(to, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_set_expires_range_ns(&to->timer, wake_time, current->timer_slack_ns); - - hrtimer_init_sleeper(to); } while (1) { diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3c74f89367c4..0df373bed3d7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -347,10 +347,15 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, + enum hrtimer_mode mode); extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else @@ -360,6 +365,14 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer, { hrtimer_init(timer, which_clock, mode); } + +static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, + enum hrtimer_mode mode) +{ + hrtimer_init_sleeper(sl, clock_id, mode); +} + static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif @@ -463,10 +476,8 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl); - extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode); + const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, diff --git a/include/linux/wait.h b/include/linux/wait.h index d57832774ca6..4707543ef575 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -488,8 +488,8 @@ do { \ int __ret = 0; \ struct 
hrtimer_sleeper __t; \ \ - hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \ - hrtimer_init_sleeper(&__t); \ + hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ + HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) \ hrtimer_start_range_ns(&__t.timer, timeout, \ current->timer_slack_ns, \ diff --git a/kernel/futex.c b/kernel/futex.c index 5e9842ea4012..c8561aa5338e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -487,11 +487,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, if (!time) return NULL; - hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(timeout); - + hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index de895d86800c..bb55d62f631e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -427,6 +427,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode); + +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); + __hrtimer_init_sleeper(sl, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -1639,11 +1650,27 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl) +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { + __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } + +/** + * hrtimer_init_sleeper - initialize sleeper to the given clock + * @sl: sleeper to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(&sl->timer, clock_id, mode); + __hrtimer_init_sleeper(sl, clock_id, mode); + +} EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) @@ -1669,8 +1696,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t); - do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); @@ -1707,10 +1732,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -1728,7 +1752,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, if (dl_task(current) || rt_task(current)) slack = 0; - hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_init_sleeper_on_stack(&t, 
clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) @@ -1927,11 +1951,8 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock_id, mode); + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - - hrtimer_init_sleeper(&t); - hrtimer_start_expires(&t.timer, mode); if (likely(t.task)) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7f3cf2381f27..a5905975bc12 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2156,7 +2156,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); @@ -2170,8 +2170,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) end_time = ktime_get(); } while (ktime_compare(end_time, spin_until) < 0); } else { - /* see do_nanosleep */ - hrtimer_init_sleeper(&t); do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); -- cgit From 01656464fce946f70b02a84ab218e562ceb1662e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jul 2019 21:03:53 +0200 Subject: hrtimer: Provide hrtimer_sleeper_start_expires() hrtimer_sleepers will gain a scheduling class dependent treatment on PREEMPT_RT. Create a wrapper around hrtimer_start_expires() to make that possible. Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 3 +++ kernel/time/hrtimer.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0df373bed3d7..24072a0942c0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -408,6 +408,9 @@ static inline void hrtimer_start_expires(struct hrtimer *timer, hrtimer_start_range_ns(timer, soft, delta, mode); } +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, + enum hrtimer_mode mode); + static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bb55d62f631e..dab1ea1a99d0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1650,6 +1650,21 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } +/** + * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer + * @sl: sleeper to be started + * @mode: timer mode abs/rel + * + * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers + * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) + */ +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, + enum hrtimer_mode mode) +{ + hrtimer_start_expires(&sl->timer, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); + static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { @@ -1698,7 +1713,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t->timer, mode); + hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); @@ -1953,7 +1968,7 @@ schedule_hrtimeout_range_clock(ktime_t 
*expires, u64 delta, hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_start_expires(&t.timer, mode); + hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); -- cgit From 9dd8813ed9f6e2bba75434abc6c8bb06c3d87fdc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jul 2019 21:16:55 +0200 Subject: hrtimer/treewide: Use hrtimer_sleeper_start_expires() hrtimer_sleepers will gain a scheduling class dependent treatment on PREEMPT_RT. Use the new hrtimer_sleeper_start_expires() function to make that possible. Signed-off-by: Thomas Gleixner --- block/blk-mq.c | 2 +- drivers/staging/android/vsoc.c | 2 +- kernel/futex.c | 4 ++-- net/core/pktgen.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index df3fafbfe9a9..f567146f9ed7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3422,7 +3422,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_start_expires(&hs.timer, mode); + hrtimer_sleeper_start_expires(&hs, mode); if (hs.task) io_schedule(); hrtimer_cancel(&hs.timer); diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c index 2d6b3981afb8..1240bb0317d9 100644 --- a/drivers/staging/android/vsoc.c +++ b/drivers/staging/android/vsoc.c @@ -458,7 +458,7 @@ static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg) break; } if (to) { - hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); if (likely(to->task)) freezable_schedule(); hrtimer_cancel(&to->timer); diff --git a/kernel/futex.c b/kernel/futex.c index c8561aa5338e..bd18f60e4c6c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2611,7 +2611,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, /* Arm the timer */ if (timeout) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task @@ -2897,7 +2897,7 @@ retry_private: } if (unlikely(to)) - hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a5905975bc12..1d0c1b4886d7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2172,7 +2172,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } else { do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS); if (likely(t.task)) schedule(); -- cgit From ae6683d815895c2be1e60e1942630fa99488055b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:51 +0200 Subject: hrtimer: Introduce HARD expiry mode On PREEMPT_RT not all hrtimers can be expired in hard interrupt context even if that is perfectly fine on a PREEMPT_RT=n kernel, e.g. because they take regular spinlocks. Also for latency reasons PREEMPT_RT tries to defer most hrtimers' expiry into soft interrupt context. But there are hrtimers which must be expired in hard interrupt context even when PREEMPT_RT is enabled: - hrtimers which must expire in hard interrupt context, e.g. scheduler, perf, watchdog related hrtimers - latency critical hrtimers, e.g. nanosleep, ..., kvm lapic timer Add a new mode flag HRTIMER_MODE_HARD which allows marking these timers so PREEMPT_RT will not move them into softirq expiry mode.
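For illustration only, a minimal sketch of arming a timer that must fire in hard interrupt context even on RT once this flag exists; the timer, callback, and period here are hypothetical, not part of the patch:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer; /* hypothetical */

static enum hrtimer_restart example_fn(struct hrtimer *t)
{
        /* Runs in hardirq context even on RT: must not sleep and
         * may only take raw spinlocks. */
        return HRTIMER_NORESTART;
}

static void example_arm(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        example_timer.function = example_fn;
        hrtimer_start(&example_timer, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
}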
[ tglx: Split out of a larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.981398465@linutronix.de --- include/linux/hrtimer.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 24072a0942c0..15c2ba6b6316 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -38,6 +38,7 @@ enum hrtimer_mode { HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, + HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, @@ -48,6 +49,11 @@ enum hrtimer_mode { HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, + HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, + HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, + + HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, + HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /* -- cgit From 0ab6a3ddbad40ef5b6b8c2353fd53fa4ecf9c479 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jul 2019 20:15:25 +0200 Subject: hrtimer: Make enqueue mode check work on RT hrtimer_start_range_ns() has a WARN_ONCE() which verifies that a timer which is marked for softirq expiry is not queued in the hard interrupt base and vice versa. When PREEMPT_RT is enabled, timers which are not explicitly marked to expire in hard interrupt context are deferred to the soft interrupt. So the regular check would trigger. Change the check so that, when PREEMPT_RT is enabled, it verifies that timers marked for hard interrupt expiry are not queued for soft interrupt expiry, and that unmarked or softirq-marked timers are not expired in hard interrupt context.
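The XOR idiom in the check can be puzzling at first sight; a standalone, editorial sketch of its logic (the helper name is hypothetical, only the expression mirrors the diff below):

/* Returns true when the queueing mode disagrees with how the timer
 * was initialized: !(mode & bit) and !flag are both 0/1 values, so
 * the XOR fires exactly when one side asks for soft (or hard) expiry
 * and the other does not. */
static bool hrtimer_mode_mismatch(unsigned int mode, unsigned int bit, bool flag)
{
        return !(mode & bit) ^ !flag;
}
/* e.g. on !RT: hrtimer_mode_mismatch(mode, HRTIMER_MODE_SOFT, timer->is_soft)
 *      on RT:  hrtimer_mode_mismatch(mode, HRTIMER_MODE_HARD, timer->is_hard) */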
Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 3 +++ kernel/time/hrtimer.c | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 15c2ba6b6316..7d0d0a36a8f4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -107,6 +107,8 @@ enum hrtimer_restart { * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. + * @is_hard: Set if hrtimer will be expired in hard interrupt context + * even on RT. * * The hrtimer structure must be initialized by hrtimer_init() */ @@ -118,6 +120,7 @@ struct hrtimer { u8 state; u8 is_rel; u8 is_soft; + u8 is_hard; }; /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index dab1ea1a99d0..0ace301a56f4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1107,9 +1107,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match. + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. */ - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); @@ -1288,6 +1292,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; + timer->is_hard = !softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } -- cgit From d5096aa65acd0ef2d18ac8247260ab4481ade399 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:52 +0200 Subject: sched: Mark hrtimers to expire in hard interrupt context The scheduler related hrtimers need to expire in hard interrupt context even on PREEMPT_RT enabled kernels. Mark them as such. No functional change. [ tglx: Split out from larger combo patch. Add changelog. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.077004842@linutronix.de --- kernel/sched/core.c | 6 +++--- kernel/sched/deadline.c | 4 ++-- kernel/sched/rt.c | 7 ++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..389e0993fbb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay) */ delay = max_t(u64, delay, 10000LL); hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } #endif /* CONFIG_SMP */ @@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq) rq->hrtick_csd.info = rq; #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } #else /* CONFIG_SCHED_HRTICK */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..0359612d5443 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -923,7 +923,7 @@ static int start_dl_timer(struct task_struct *p) */ if (!hrtimer_is_queued(timer)) { get_task_struct(p); - hrtimer_start(timer, act, HRTIMER_MODE_ABS); + hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; @@ -1053,7 +1053,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = dl_task_timer; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..da3e85e61013 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) * to update the period.
*/ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); - hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock(&rt_b->rt_runtime_lock); } -- cgit From 30f9028b6c43fd17c006550594ea3dbb87afbf80 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:53 +0200 Subject: perf/core: Mark hrtimers to expire in hard interrupt context To guarantee that the multiplexing mechanism and the hrtimer driven events work on PREEMPT_RT enabled kernels it's required that the related hrtimers expire in hard interrupt context. Mark them so PREEMPT_RT kernels won't defer them to soft interrupt context. No functional change. [ tglx: Split out of larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.169509224@linutronix.de --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..9d623e257a51 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1103,7 +1103,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpuctx->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } @@ -1121,7 +1121,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) if (!cpuctx->hrtimer_active) { cpuctx->hrtimer_active = 1; hrtimer_forward_now(timer, cpuctx->hrtimer_interval); - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); @@ -9491,7 +9491,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -9513,7 +9513,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* -- cgit From d2ab4cf4943576fb060b8a69341d9e0c2a952ba7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:54 +0200 Subject: watchdog: Mark watchdog_hrtimer to expire in hard interrupt context The watchdog hrtimer must expire in hard interrupt context even on PREEMPT_RT=y kernels as otherwise the hard/softlockup detection logic would not work. No functional change. [ tglx: Split out from larger combo patch.
Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.262895510@linutronix.de --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9e7b9306fe..f41334ef0971 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -490,10 +490,10 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer->function = watchdog_timer_fn; hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ __touch_watchdog(); -- cgit From 2c0d278f3293fc59da0d183075415ca1e8c93b40 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:55 +0200 Subject: KVM: LAPIC: Mark hrtimer to expire in hard interrupt context On PREEMPT_RT enabled kernels unmarked hrtimers are moved into soft interrupt expiry mode by default. While that's not a functional requirement for the KVM local APIC timer emulation, it's a latency issue which can be avoided by marking the timer so hard interrupt context expiry is enforced. No functional change. [ tglx: Split out from larger combo patch. Add changelog. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.363363474@linutronix.de --- arch/x86/kvm/lapic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0aa158657f20..b9e516099d07 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1601,7 +1601,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); - hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else apic_timer_expired(apic); @@ -2302,7 +2302,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; @@ -2487,7 +2487,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD); } /* -- cgit From 902a9f9c509053161e987778dc5836d2628f53b7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:56 +0200 Subject: tick: Mark tick related hrtimers to expire in hard interrupt context The tick related hrtimers, which drive the scheduler tick and hrtimer based broadcasting are required to expire in hard interrupt context for obvious reasons. Mark them so PREEMPT_RT kernels won't move them to soft interrupt expiry. Make the horribly formatted RCU_NONIDLE bracket maze readable while at it. No functional change. [ tglx: Split out from larger combo patch.
Add changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.459144407@linutronix.de --- kernel/time/tick-broadcast-hrtimer.c | 13 +++++++++---- kernel/time/tick-sched.c | 15 +++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 5be6154e2fd2..c1f5bb590b5e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -59,11 +59,16 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE({ + RCU_NONIDLE( + { bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) + if (bc_moved) { hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED);}); + HRTIMER_MODE_ABS_PINNED_HARD); + } + } + ); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); @@ -104,7 +109,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); bctimer.function = bc_handler; clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be9707f68024..01ff32a02af2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -634,10 +634,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + } /* * Reset to make sure next tick stop doesn't get fooled by past @@ -802,7 +804,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ts->sched_timer, tick, + HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); @@ -1327,7 +1330,7 @@ void tick_setup_sched_timer(void) /* * Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ @@ -1342,7 +1345,7 @@ void tick_setup_sched_timer(void) } hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ -- cgit From f5c2f0215e36d76fbb9605283dd7535af09f5770 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:57 +0200 Subject: hrtimer: Move unmarked hrtimers to soft interrupt expiry on RT On PREEMPT_RT not all hrtimers can be expired in hard interrupt context even if that is perfectly fine on a PREEMPT_RT=n kernel, e.g. 
because they take regular spinlocks. Also for latency reasons PREEMPT_RT tries to defer most hrtimers' expiry into softirq context. hrtimers marked with HRTIMER_MODE_HARD must be kept in hard interrupt context expiry mode. Add the required logic. No functional change for PREEMPT_RT=n kernels. [ tglx: Split out of a larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.551967692@linutronix.de --- kernel/time/hrtimer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0ace301a56f4..90dcc4d95e91 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1275,8 +1275,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); - int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; + int base; + + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly + * marked for hard interrupt expiry mode are moved into soft + * interrupt context for latency reasons and because the callbacks + * can invoke functions which might sleep on RT, e.g. spin_lock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) + softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); @@ -1290,6 +1299,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } -- cgit From 1842f5a427f5323f5c19ab99b55d09b3ab5172a5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:58 +0200 Subject: hrtimer: Determine hard/soft expiry mode for hrtimer sleepers on RT On PREEMPT_RT enabled kernels hrtimers which are not explicitly marked for hard interrupt expiry mode are moved into soft interrupt context either for latency reasons or because the hrtimer callback takes regular spinlocks or invokes other functions which are not suitable for hard interrupt context on PREEMPT_RT. The hrtimer_sleeper callback is RT compatible in hard interrupt context, but there is a latency concern: Untrusted userspace can spawn many threads which arm timers for the same expiry time on the same CPU. On expiry that causes a latency spike due to the wakeup of a gazillion threads. OTOH, privileged real-time user space applications rely on the low latency of hard interrupt wakeups. These syscall related wakeups are all based on hrtimer sleepers. If the current task is in a real-time scheduling class, mark the mode for hard interrupt expiry. [ tglx: Split out of a larger combo patch.
Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.645792403@linutronix.de --- kernel/time/hrtimer.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 90dcc4d95e91..c101f88ae8aa 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1676,6 +1676,16 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { + /* + * Make the enqueue delivery mode check work on RT. If the sleeper + * was initialized for hard interrupt delivery, force the mode bit. + * This is a special case for hrtimer_sleepers because + * hrtimer_init_sleeper() determines the delivery mode on RT so the + * fiddling with this decision is avoided at the call sites. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) + mode |= HRTIMER_MODE_HARD; + hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); @@ -1683,6 +1693,30 @@ EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly + * marked for hard interrupt expiry mode are moved into soft + * interrupt context either for latency reasons or because the + * hrtimer callback takes regular spinlocks or invokes other + * functions which are not suitable for hard interrupt context on + * PREEMPT_RT. + * + * The hrtimer_sleeper callback is RT compatible in hard interrupt + * context, but there is a latency concern: Untrusted userspace can + * spawn many threads which arm timers for the same expiry time on + * the same CPU. That causes a latency spike due to the wakeup of + * a gazillion threads. + * + * OTOH, privileged real-time user space applications rely on the + * low latency of hard interrupt wakeups. If the current task is in + * a real-time scheduling class, mark the mode for hard interrupt + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; -- cgit From f61eff83cec9cfab31fd30a2ca8856be379cdcd5 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 26 Jul 2019 20:30:59 +0200 Subject: hrtimer: Prepare support for PREEMPT_RT When PREEMPT_RT is enabled, the soft interrupt thread can be preempted. If the soft interrupt thread is preempted in the middle of a timer callback, then calling hrtimer_cancel() can lead to two issues: - If the caller is on a remote CPU then it has to spin wait for the timer handler to complete. This can result in unbound priority inversion. - If the caller originates from the task which preempted the timer handler on the same CPU, then spin waiting for the timer handler to complete is never going to end. To avoid these issues, add a new lock to the timer base which is held around the execution of the timer callbacks. If hrtimer_cancel() detects that the timer callback is currently running, it blocks on the expiry lock. When the callback is finished, the expiry lock is dropped by the softirq thread which wakes up the waiter and the system makes progress.
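To make the handshake concrete, a condensed, editorial sketch of the two sides of the expiry lock described above; the function names wrapping it are hypothetical, the fields and lock calls follow the hrtimer.c hunks below:

/* Softirq side: the expiry lock is held across callback execution. */
static void softirq_side_sketch(struct hrtimer_cpu_base *cpu_base)
{
        spin_lock(&cpu_base->softirq_expiry_lock);
        /* ... expire soft hrtimers and run their callbacks ... */
        spin_unlock(&cpu_base->softirq_expiry_lock);
}

/* Canceller side: hrtimer_try_to_cancel() saw the callback running,
 * so block on the expiry lock instead of spinning, then retry. */
static void canceller_side_sketch(struct hrtimer_cpu_base *cpu_base)
{
        atomic_inc(&cpu_base->timer_waiters);
        spin_lock_bh(&cpu_base->softirq_expiry_lock);
        atomic_dec(&cpu_base->timer_waiters);
        spin_unlock_bh(&cpu_base->softirq_expiry_lock);
}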
This addresses both the priority inversion and the live lock issues. The same issue can happen in virtual machines when the vCPU which runs a timer callback is scheduled out. If a second vCPU of the same guest calls hrtimer_cancel() it will spin wait for the other vCPU to be scheduled back in. The expiry lock mechanism would avoid that. It'd be trivial to enable this when paravirt spinlocks are enabled in a guest, but it's not clear whether this is an actual problem in the wild, so for now it's an RT only mechanism. [ tglx: Refactored it for mainline ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.737767218@linutronix.de --- include/linux/hrtimer.h | 16 +++++++++ kernel/time/hrtimer.c | 95 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 105 insertions(+), 6 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7d0d0a36a8f4..5df4bcff96d5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -192,6 +192,10 @@ enum hrtimer_base_type { * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are + * expired + * @timer_waiters: A hrtimer_cancel() invocation waits for the timer + * callback to finish. * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) @@ -218,6 +222,10 @@ struct hrtimer_cpu_base { unsigned short nr_retries; unsigned short nr_hangs; unsigned int max_hang_time; +#endif +#ifdef CONFIG_PREEMPT_RT + spinlock_t softirq_expiry_lock; + atomic_t timer_waiters; #endif ktime_t expires_next; struct hrtimer *next_timer; @@ -350,6 +358,14 @@ extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +#ifdef CONFIG_PREEMPT_RT +void hrtimer_cancel_wait_running(const struct hrtimer *timer); +#else +static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) +{ + cpu_relax(); +} +#endif /* Exported timer functions: */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c101f88ae8aa..499122752649 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1162,6 +1162,82 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ + spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) +{ + spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) +{ + spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reacquire it. That + * allows the waiter to acquire the lock and make progress.
+ */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, + unsigned long flags) +{ + if (atomic_read(&cpu_base->timer_waiters)) { + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + spin_unlock(&cpu_base->softirq_expiry_lock); + spin_lock(&cpu_base->softirq_expiry_lock); + raw_spin_lock_irq(&cpu_base->lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a live lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (!timer->is_soft || !base || !base->cpu_base) { + cpu_relax(); + return; + } + + /* + * Mark the base as contended and grab the expiry lock, which is + * held by the softirq across the timer callback. Drop the lock + * immediately so the softirq can expire the next timer. In theory + * the timer could already be running again, but that's more than + * unlikely and just causes another wait loop. + */ + atomic_inc(&base->cpu_base->timer_waiters); + spin_lock_bh(&base->cpu_base->softirq_expiry_lock); + atomic_dec(&base->cpu_base->timer_waiters); + spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, + unsigned long flags) { } +#endif + /** * hrtimer_cancel - cancel a timer and wait for the handler to finish.
* @timer: the timer to be cancelled @@ -1172,13 +1248,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); */ int hrtimer_cancel(struct hrtimer *timer) { - for (;;) { - int ret = hrtimer_try_to_cancel(timer); + int ret; - if (ret >= 0) - return ret; - cpu_relax(); - } + do { + ret = hrtimer_try_to_cancel(timer); + + if (ret < 0) + hrtimer_cancel_wait_running(timer); + } while (ret < 0); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1475,6 +1553,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); + if (active_mask == HRTIMER_ACTIVE_SOFT) + hrtimer_sync_wait_running(cpu_base, flags); } } } @@ -1485,6 +1565,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) unsigned long flags; ktime_t now; + hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); @@ -1494,6 +1575,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1897,6 +1979,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } -- cgit From 030dcdd197d77374879bb5603d091eee7d8aba80 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 26 Jul 2019 20:31:00 +0200 Subject: timers: Prepare support for PREEMPT_RT When PREEMPT_RT is enabled, the soft interrupt thread can be preempted. If the soft interrupt thread is preempted in the middle of a timer callback, then calling del_timer_sync() can lead to two issues: - If the caller is on a remote CPU then it has to spin wait for the timer handler to complete. This can result in unbound priority inversion. - If the caller originates from the task which preempted the timer handler on the same CPU, then spin waiting for the timer handler to complete is never going to end. To avoid these issues, add a new lock to the timer base which is held around the execution of the timer callbacks. If del_timer_sync() detects that the timer callback is currently running, it blocks on the expiry lock. When the callback is finished, the expiry lock is dropped by the softirq thread which wakes up the waiter and the system makes progress. This addresses both the priority inversion and the live lock issues. This mechanism is not used for timers which are marked IRQSAFE as for those preemption is disabled across the callback and therefore this situation cannot happen. The callbacks for such timers need to be individually audited for RT compliance. The same issue can happen in virtual machines when the vCPU which runs a timer callback is scheduled out. If a second vCPU of the same guest calls del_timer_sync() it will spin wait for the other vCPU to be scheduled back in. The expiry lock mechanism would avoid that. It'd be trivial to enable this when paravirt spinlocks are enabled in a guest, but it's not clear whether this is an actual problem in the wild, so for now it's an RT only mechanism. As the softirq thread can be preempted with PREEMPT_RT=y, the SMP variant of del_timer_sync() needs to be used on UP as well.
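An editorial sketch of the softirq-side cooperation this adds for the timer wheel (the wrapper name is hypothetical; the fields and the drop/re-take pattern follow the timer.c diff below): after each non-IRQSAFE callback, the softirq thread briefly opens a window so a blocked del_timer_sync() caller can take the expiry lock, observe that the callback finished, and retry.

static void timer_expiry_window_sketch(struct timer_base *base)
{
        /* Only bounce the lock when someone is actually waiting. */
        if (atomic_read(&base->timer_waiters)) {
                spin_unlock(&base->expiry_lock);
                spin_lock(&base->expiry_lock);
        }
}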
[ tglx: Refactored it for mainline ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.832418500@linutronix.de --- include/linux/timer.h | 2 +- kernel/time/timer.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 96 insertions(+), 9 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 282e4f2a532a..1e6650ed066d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -183,7 +183,7 @@ extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) extern int del_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 343c7ba33b1c..673c6a0f0c45 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64); struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; +#ifdef CONFIG_PREEMPT_RT + spinlock_t expiry_lock; + atomic_t timer_waiters; +#endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; @@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT +static __init void timer_base_init_expiry_lock(struct timer_base *base) +{ + spin_lock_init(&base->expiry_lock); +} + +static inline void timer_base_lock_expiry(struct timer_base *base) +{ + spin_lock(&base->expiry_lock); +} + +static inline void timer_base_unlock_expiry(struct timer_base *base) +{ + spin_unlock(&base->expiry_lock); +} + +/* + * The counterpart to del_timer_wait_running(). + * + * If there is a waiter for base->expiry_lock, then it was waiting for the + * timer callback to finish. Drop expiry_lock and reacquire it. That allows + * the waiter to acquire the lock and make progress. + */ +static void timer_sync_wait_running(struct timer_base *base) +{ + if (atomic_read(&base->timer_waiters)) { + spin_unlock(&base->expiry_lock); + spin_lock(&base->expiry_lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a live lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +static void del_timer_wait_running(struct timer_list *timer) +{ + u32 tf; + + tf = READ_ONCE(timer->flags); + if (!(tf & TIMER_MIGRATING)) { + struct timer_base *base = get_timer_base(tf); + + /* + * Mark the base as contended and grab the expiry lock, + * which is held by the softirq across the timer + * callback. Drop the lock immediately so the softirq can + * expire the next timer. In theory the timer could already + * be running again, but that's more than unlikely and just + * causes another wait loop.
+ */ + atomic_inc(&base->timer_waiters); + spin_lock_bh(&base->expiry_lock); + atomic_dec(&base->timer_waiters); + spin_unlock_bh(&base->expiry_lock); + } +} +#else +static inline void timer_base_init_expiry_lock(struct timer_base *base) { } +static inline void timer_base_lock_expiry(struct timer_base *base) { } +static inline void timer_base_unlock_expiry(struct timer_base *base) { } +static inline void timer_sync_wait_running(struct timer_base *base) { } +static inline void del_timer_wait_running(struct timer_list *timer) { } +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync); */ int del_timer_sync(struct timer_list *timer) { + int ret; + #ifdef CONFIG_LOCKDEP unsigned long flags; @@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer) * could lead to deadlock. */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } + + do { + ret = try_to_del_timer_sync(timer); + + if (unlikely(ret < 0)) { + del_timer_wait_running(timer); + cpu_relax(); + } + } while (ret < 0); + + return ret; } EXPORT_SYMBOL(del_timer_sync); #endif @@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; + timer_sync_wait_running(base); raw_spin_lock_irq(&base->lock); } } @@ -1658,6 +1743,7 @@ static inline void __run_timers(struct timer_base *base) if (!time_after_eq(jiffies, base->clk)) return; + timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); /* @@ -1684,8 +1770,8 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->running_timer = NULL; raw_spin_unlock_irq(&base->lock); + timer_base_unlock_expiry(base); } /* @@ -1930,6 +2016,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + timer_base_init_expiry_lock(base); } } -- cgit From 850377a875a481c393ce59111b0c9725005e0eb4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 31 Jul 2019 12:37:15 +0200 Subject: sched/deadline: Ensure inactive_timer runs in hardirq context The SCHED_DEADLINE inactive timer needs to run in hardirq context (as dl_task_timer already does) on PREEMPT_RT. Change the mode to HRTIMER_MODE_REL_HARD.
[ tglx: Fixed up the start site, so mode debugging works ] Signed-off-by: Juri Lelli Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190731103715.4047-1-juri.lelli@redhat.com --- kernel/sched/deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0359612d5443..83a663a34196 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p) dl_se->dl_non_contending = 1; get_task_struct(p); - hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } static void task_contending(struct sched_dl_entity *dl_se, int flags) @@ -1292,7 +1292,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = inactive_task_timer; } -- cgit From 51ae33092bb8320497ec75ddc5ab383d8fafd55c Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 31 Jul 2019 00:33:49 +0200 Subject: alarmtimer: Prepare for PREEMPT_RT Use the hrtimer_cancel_wait_running() synchronization mechanism to prevent priority inversion and live locks on PREEMPT_RT. [ tglx: Split out of combo patch ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.508744705@linutronix.de --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..36947449dba2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm) int ret = alarm_try_to_cancel(alarm); if (ret >= 0) return ret; - cpu_relax(); + hrtimer_cancel_wait_running(&alarm->timer); } } EXPORT_SYMBOL_GPL(alarm_cancel); -- cgit From a125ecc16453a4fe0ba865c7df87b9c722991fdf Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 31 Jul 2019 00:33:50 +0200 Subject: timerfd: Prepare for PREEMPT_RT Use the hrtimer_cancel_wait_running() synchronization mechanism to prevent priority inversion and live locks on PREEMPT_RT. [ tglx: Split out of combo patch ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.600085866@linutronix.de --- fs/timerfd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a6fc8aa1de7..48305ba41e3c 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -471,7 +471,11 @@ static int do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); + + if (isalarm(ctx)) + hrtimer_cancel_wait_running(&ctx->t.alarm.timer); + else + hrtimer_cancel_wait_running(&ctx->t.tmr); } /* -- cgit From c7e6d704a0097e59667495cf52dcc4e1085e620b Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 31 Jul 2019 00:33:51 +0200 Subject: itimers: Prepare for PREEMPT_RT Use the hrtimer_cancel_wait_running() synchronization mechanism to prevent priority inversion and live locks on PREEMPT_RT. As a benefit the retry loop gains the missing cpu_relax() on !RT. 
[ tglx: Split out of combo patch ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.690771827@linutronix.de --- kernel/time/itimer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 02068b2d5862..9d26fd4ba4c0 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -213,6 +213,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_cancel_wait_running(timer); goto again; } expires = timeval_to_ktime(value->it_value); -- cgit From 21670ee44f1e3565030bcabc62178b8e5eb2fce7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2019 00:33:52 +0200 Subject: posix-timers: Cleanup the flag/flags confusion do_timer_settime() has a 'flags' argument and uses 'flag' for the interrupt flags, which is confusing at best. Rename the argument so 'flags' can be used for interrupt flags as usual. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.782664411@linutronix.de --- kernel/time/posix-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d7f2d91acdac..f5aedd2f60df 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -844,13 +844,13 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int flags, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { const struct k_clock *kc; struct k_itimer *timr; - unsigned long flag; + unsigned long flags; int error = 0; if (!timespec64_valid(&new_spec64->it_interval) || @@ -860,7 +860,7 @@ static int do_timer_settime(timer_t timer_id, int flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); retry: - timr = lock_timer(timer_id, &flag); + timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; @@ -868,9 +868,9 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, new_spec64, old_spec64); + error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - unlock_timer(timr, flag); + unlock_timer(timr, flags); if (error == TIMER_RETRY) { old_spec64 = NULL; // We already got the old time... goto retry; -- cgit From 6945e5c2abe008302b20266248d6de95575311a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2019 00:33:53 +0200 Subject: posix-timers: Rework cancel retry loops As a preparatory step for adding the PREEMPT RT specific synchronization mechanism to wait for a running timer callback, rework the timer cancel retry loops so they call a common function. This allows trivial substitution in one place. 
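[ Editorial note: condensed from the diff below, the common helper is essentially the following; the RT-aware wait is substituted for cpu_relax() in a later patch. ]

    static struct k_itimer *timer_wait_running(struct k_itimer *timer,
                                               unsigned long *flags)
    {
            timer_t timer_id = READ_ONCE(timer->it_id);

            unlock_timer(timer, *flags);
            cpu_relax();
            /* The timer may have been freed meanwhile, so look it up
             * again by id; this can return NULL. */
            return lock_timer(timer_id, flags);
    }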
Originally-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.874901027@linutronix.de --- kernel/time/posix-timers.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f5aedd2f60df..bbe8f9686a70 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -805,6 +805,17 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) return hrtimer_try_to_cancel(&timr->it.real.timer); } +static struct k_itimer *timer_wait_running(struct k_itimer *timer, + unsigned long *flags) +{ + timer_t timer_id = READ_ONCE(timer->it_id); + + unlock_timer(timer, *flags); + cpu_relax(); + /* Relock the timer. It might no longer be hashed. */ + return lock_timer(timer_id, flags); +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -859,8 +870,9 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); -retry: + timr = lock_timer(timer_id, &flags); +retry: if (!timr) return -EINVAL; @@ -870,11 +882,14 @@ retry: else error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - unlock_timer(timr, flags); if (error == TIMER_RETRY) { - old_spec64 = NULL; // We already got the old time... + // We already got the old time... + old_spec64 = NULL; + /* Unlocks and relocks the timer if it still exists */ + timr = timer_wait_running(timr, &flags); goto retry; } + unlock_timer(timr, flags); return error; } @@ -951,13 +966,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) struct k_itimer *timer; unsigned long flags; -retry_delete: timer = lock_timer(timer_id, &flags); + +retry_delete: if (!timer) return -EINVAL; - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { + /* Unlocks and relocks the timer if it still exists */ + timer = timer_wait_running(timer, &flags); goto retry_delete; } -- cgit From 5d99b32a009e900a561f6a42ea7afe5b21288b8a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 31 Jul 2019 00:33:54 +0200 Subject: posix-timers: Move rcu_head out of it union Timer deletion on PREEMPT_RT is prone to priority inversion and live locks. The hrtimer code has a synchronization mechanism for this. Posix CPU timers will grow one. But that mechanism cannot be invoked while holding the k_itimer lock because that can deadlock against the running timer callback. So the lock must be dropped which allows the timer to be freed. The timer free can be prevented by taking the RCU read lock before dropping the lock, but because the rcu_head is part of the 'it' union a concurrent free will overwrite the hrtimer on which the task is trying to synchronize. Move the rcu_head out of the union to prevent this. [ tglx: Fixed up kernel-doc.
Rewrote changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.965541887@linutronix.de --- include/linux/posix-timers.h | 5 +++-- kernel/time/posix-timers.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index b20798fc5191..604cec0e41ba 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -85,7 +85,8 @@ static inline int clockid_to_fd(const clockid_t clk) * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @sigq: Pointer to preallocated sigqueue * @it: Union representing the various posix timer type - * internals. Also used for rcu freeing the timer. + * internals. + * @rcu: RCU head for freeing the timer. */ struct k_itimer { struct list_head list; @@ -114,8 +115,8 @@ struct k_itimer { struct { struct alarm alarmtimer; } alarm; - struct rcu_head rcu; } it; + struct rcu_head rcu; }; void run_posix_cpu_timers(struct task_struct *task); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bbe8f9686a70..3e663f982c82 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -442,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void) static void k_itimer_rcu_free(struct rcu_head *head) { - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); kmem_cache_free(posix_timers_cache, tmr); } @@ -459,7 +459,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); + call_rcu(&tmr->rcu, k_itimer_rcu_free); } static int common_timer_create(struct k_itimer *new_timer) -- cgit From ec8f954a40da8cd3d159713b608e901f0cd909a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2019 07:35:59 +0200 Subject: posix-timers: Use a callback for cancel synchronization on PREEMPT_RT Posix timer delete retry loops are affected by the same priority inversion and live lock issues as the other timers. Provide an RT specific synchronization function which keeps a reference to the timer by holding the RCU read lock to prevent the timer from being freed, dropping the timer lock and invoking the timer specific wait function via a new callback. This does not yet cover posix CPU timers because they need more special treatment on PREEMPT_RT. [ This is folded into the original attempt which did not use a callback. ] Originally-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.656864506@linutronix.de --- kernel/time/alarmtimer.c | 14 ++++++++++++++ kernel/time/posix-timers.c | 18 +++++++++++++++++- kernel/time/posix-timers.h | 1 + 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 36947449dba2..ec32876e284d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -605,6 +605,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); } +/** + * alarm_timer_wait_running - Posix timer callback to wait for a timer + * @timr: Pointer to the posixtimer data struct + * + * Called from the core code when timer cancel detected that the callback + * is running. @timr is unlocked and rcu read lock is held to prevent it + * from being freed.
+ */ +static void alarm_timer_wait_running(struct k_itimer *timr) +{ + hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); +} + /** * alarm_timer_arm - Posix timer callback to arm a timer * @timr: Pointer to the posixtimer data struct @@ -834,6 +847,7 @@ const struct k_clock alarm_clock = { .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, .timer_try_to_cancel = alarm_timer_try_to_cancel, + .timer_wait_running = alarm_timer_wait_running, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 3e663f982c82..9e377830cc10 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -805,13 +805,25 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) return hrtimer_try_to_cancel(&timr->it.real.timer); } +static void common_timer_wait_running(struct k_itimer *timer) +{ + hrtimer_cancel_wait_running(&timer->it.real.timer); +} + static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { + const struct k_clock *kc = READ_ONCE(timer->kclock); timer_t timer_id = READ_ONCE(timer->it_id); + /* Prevent kfree(timer) after dropping the lock */ + rcu_read_lock(); unlock_timer(timer, *flags); - cpu_relax(); + + if (!WARN_ON_ONCE(!kc->timer_wait_running)) + kc->timer_wait_running(timer); + + rcu_read_unlock(); /* Relock the timer. It might no longer be hashed. */ return lock_timer(timer_id, flags); } @@ -1255,6 +1267,7 @@ static const struct k_clock clock_realtime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1270,6 +1283,7 @@ static const struct k_clock clock_monotonic = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1300,6 +1314,7 @@ static const struct k_clock clock_tai = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1315,6 +1330,7 @@ static const struct k_clock clock_boottime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index de5daa6d975a..897c29e162b9 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -24,6 +24,7 @@ struct k_clock { int (*timer_try_to_cancel)(struct k_itimer *timr); void (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); + void (*timer_wait_running)(struct k_itimer *timr); }; extern const struct k_clock clock_posix_cpu; -- cgit From 0bee3b601b77dbe7981b5474ae8758d6bf60177a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 20 Aug 2019 15:12:23 +0200 Subject: hrtimer: Improve comments on handling priority inversion against softirq kthread The handling of a priority inversion between timer cancelling and a possible, not well defined preemption of the softirq kthread is not very clear.
Especially on the posix timers side, it's unclear why there is a specific RT wait callback. All the nice explanations can be found in the initial changelog of f61eff83cec9 ("hrtimer: Prepare support for PREEMPT_RT"). Extract the detailed information from there and put it into comments. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190820132656.GC2093@lenoir --- kernel/time/hrtimer.c | 14 ++++++++++---- kernel/time/posix-timers.c | 6 ++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 499122752649..833353732554 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1201,10 +1201,16 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, * deletion of a timer failed because the timer callback function was * running. * - * This prevents priority inversion, if the softirq thread on a remote CPU - * got preempted, and it prevents a life lock when the task which tries to - * delete a timer preempted the softirq thread running the timer callback - * function. + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling del_timer_sync() can + * lead to two issues: + * + * - If the caller is on a remote CPU then it has to spin wait for the timer + * handler to complete. This can result in unbounded priority inversion. + * + * - If the caller originates from the task which preempted the timer + * handler on the same CPU, then spin waiting for the timer handler to + * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9e377830cc10..0ec5b7a1d769 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -810,6 +810,12 @@ static void common_timer_wait_running(struct k_itimer *timer) hrtimer_cancel_wait_running(&timer->it.real.timer); } +/* + * On PREEMPT_RT this prevents priority inversion against the softirq kthread in + * case it gets preempted while executing a timer callback. See comments in + * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a + * cpu_relax().
+ */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { -- cgit From ce03f613461642669d6150c405dd28f4bfd54bbb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:42 +0200 Subject: posix-timers: Cleanup forward declarations and includes - Rename struct siginfo to kernel_siginfo as that is used and required - Add a forward declaration for task_struct and remove sched.h include - Remove timex.h include as it is not needed Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.472005793@linutronix.de --- include/linux/posix-timers.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 604cec0e41ba..26c636d1485b 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -4,11 +4,10 @@ #include #include -#include -#include #include -struct siginfo; +struct kernel_siginfo; +struct task_struct; struct cpu_timer_list { struct list_head entry; -- cgit From 3758b0f86ef502e2f342055caef6d2232c2558b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:43 +0200 Subject: alarmtimers: Avoid rtc.h include rtc.h is not needed in alarmtimers when a forward declaration of struct rtc_device is provided. That allows to include posix-timers.h without adding more includes to alarmtimer.h or creating circular include dependencies. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.565389536@linutronix.de --- include/linux/alarmtimer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 0760ca1cb009..74748e306f4b 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -5,7 +5,8 @@ #include #include #include -#include + +struct rtc_device; enum alarmtimer_type { ALARM_REALTIME, -- cgit From 7cb9a94c158b956f46cf093ed966d0c1e996dddb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:45 +0200 Subject: posix-cpu-timers: Fixup stale comment The comment above cleanup_timers() is outdated. The timers are only removed from the task/process list heads but not modified in any other way. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.747233612@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a426f4e3125..742d4a4e6f71 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -412,9 +412,10 @@ static void cleanup_timers_list(struct list_head *head) } /* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. + * Clean out CPU timers which are still armed when a thread exits. The + * timers are only removed from the list. No other updates are done. The + * corresponding posix timers are still accessible, but cannot be rearmed. + * * This must be called with the siglock held. 
*/ static void cleanup_timers(struct list_head *head) -- cgit From dd2261ed45aaeddeb77768f291d604179bcab096 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 21 Aug 2019 10:24:07 +0100 Subject: hrtimer: Protect lockless access to timer->base The update to timer->base is protected by base->cpu_base->lock. However, hrtimer_cancel_wait_running() accesses it locklessly. So the compiler is allowed to refetch timer->base, which can cause havoc when the timer base is changed concurrently. Use READ_ONCE() to prevent this. [ tglx: Adapted from an RT patch ] Signed-off-by: Julien Grall Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821092409.13225-2-julien.grall@arm.com --- kernel/time/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 833353732554..f48864e2ff8a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1214,7 +1214,8 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { - struct hrtimer_clock_base *base = timer->base; + /* Lockless read. Prevent the compiler from reloading it below */ + struct hrtimer_clock_base *base = READ_ONCE(timer->base); if (!timer->is_soft || !base || !base->cpu_base) { cpu_relax(); -- cgit From 68b2c8c1e421096f4b46ac2ac502d25ca067a2a6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 21 Aug 2019 10:24:09 +0100 Subject: hrtimer: Don't take expiry_lock when timer is currently migrated migration_base is used as a placeholder when an hrtimer is migrated to a different CPU. In the case that hrtimer_cancel_wait_running() hits a timer which is currently migrated, it would pointlessly acquire the expiry lock of the migration base, which is not even initialized. Surely it could be initialized, but there is absolutely no point in acquiring this lock because the timer is guaranteed not to run its callback, for which the caller waits to finish, on that base. So it would just do the inc/lock/dec/unlock dance for nothing. As the base switch is short and non-preemptible, there is no issue when the wait function returns immediately. The timer base and base->cpu_base cannot be NULL in the code path which is invoking that, so just replace those checks with a check whether the base is the migration base. [ tglx: Updated from RT patch. Massaged changelog. Added comment. ] Signed-off-by: Julien Grall Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821092409.13225-4-julien.grall@arm.com --- kernel/time/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f48864e2ff8a..ebbd0fb3512b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1217,7 +1217,11 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); - if (!timer->is_soft || !base || !base->cpu_base) { + /* + * Just relax if the timer expires in hard interrupt context or if + * it is currently on the migration base.
+ */ + if (!timer->is_soft || base == &migration_base) { cpu_relax(); return; } -- cgit From 692117c1f7a6770ed41dd8f277cd9fed1dfb16f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:46 +0200 Subject: posix-cpu-timers: Sanitize bogus WARN_ONs Warning when p == NULL and then proceeding and dereferencing p does not make any sense as the kernel will crash with a NULL pointer dereference right away. Bailing out when p == NULL and returning an error code does not cure the underlying problem which caused p to be NULL. Though it might allow proper debugging. Same applies to the clock id check in set_process_cpu_timer(). Clean them up and make them return without trying to do further damage. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.846497772@linutronix.de --- kernel/time/posix-cpu-timers.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 742d4a4e6f71..98223d2805c2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -375,7 +375,8 @@ static int posix_cpu_timer_del(struct k_itimer *timer) struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Protect against sighand release/switch in exit/exec and process/ @@ -581,7 +582,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, u64 old_expires, new_expires, old_incr, val; int ret; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Use the to_ktime conversion because that clamps the maximum @@ -716,10 +718,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { - u64 now; struct task_struct *p = timer->it.cpu.task; + u64 now; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Easy part: convert the reload time. @@ -1001,12 +1004,13 @@ static void check_process_timers(struct task_struct *tsk, */ static void posix_cpu_timer_rearm(struct k_itimer *timer) { + struct task_struct *p = timer->it.cpu.task; struct sighand_struct *sighand; unsigned long flags; - struct task_struct *p = timer->it.cpu.task; u64 now; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Fetch the current sample and update the timer's expiry time. @@ -1203,7 +1207,9 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 now; int ret; - WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) + return; + ret = cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval && ret != -EINVAL) { -- cgit From dce3e8fd039cc1b62760b3ad6822cf04c262cd0e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:47 +0200 Subject: posix-cpu-timers: Remove tsk argument from run_posix_cpu_timers() It's always current. Don't give people wrong ideas.
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.945469967@linutronix.de --- include/linux/posix-timers.h | 2 +- kernel/time/posix-cpu-timers.c | 5 +++-- kernel/time/timer.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 26c636d1485b..033374b99767 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -118,7 +118,7 @@ struct k_itimer { struct rcu_head rcu; }; -void run_posix_cpu_timers(struct task_struct *task); +void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 98223d2805c2..387e0e86e1b8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1137,11 +1137,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +void run_posix_cpu_timers(void) { - LIST_HEAD(firing); + struct task_struct *tsk = current; struct k_itimer *timer, *next; unsigned long flags; + LIST_HEAD(firing); lockdep_assert_irqs_disabled(); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 673c6a0f0c45..0e315a2e77ae 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1728,7 +1728,7 @@ void update_process_times(int user_tick) #endif scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(p); + run_posix_cpu_timers(); } /** -- cgit From adb87ff4f96c9700718e09c97a804124d5cd61ff Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Wed, 14 Aug 2019 20:32:15 +0800 Subject: clocksource/drivers/hyperv: Allocate Hyper-V TSC page statically Prepare to add Hyper-V sched clock callback and move Hyper-V Reference TSC initialization much earlier in the boot process. Earlier initialization is needed so that it happens while the timestamp value is still 0 and no discontinuity in the timestamp will occur when pv_ops.time.sched_clock calculates its offset. The earlier initialization requires that the Hyper-V TSC page be allocated statically instead of with vmalloc(), so fixup the references to the TSC page and the method of getting its physical address. 
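[ Editorial note: the mechanical core of this change is how the physical address of the TSC page is obtained once it lives in the kernel image instead of vmalloc space; a sketch, with names as in the patch: ]

    /* Before: dynamically allocated, translated via the page tables. */
    tsc_pg = vmalloc(PAGE_SIZE);
    phys_addr = page_to_phys(vmalloc_to_page(tsc_pg));

    /* After: statically allocated and page aligned, so the kernel's
     * direct mapping applies and virt_to_phys() suffices. */
    static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
    phys_addr = virt_to_phys(&tsc_pg);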
Signed-off-by: Tianyu Lan Signed-off-by: Thomas Gleixner Acked-by: Daniel Lezcano Link: https://lkml.kernel.org/r/20190814123216.32245-2-Tianyu.Lan@microsoft.com --- arch/x86/entry/vdso/vma.c | 2 +- drivers/clocksource/hyperv_timer.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 349a61d8bf34..f5937742b290 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -122,7 +122,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, - vmalloc_to_pfn(tsc_pg)); + virt_to_phys(tsc_pg) >> PAGE_SHIFT); } return VM_FAULT_SIGBUS; diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index ba2c79e6a0ee..432aa331df04 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -214,17 +214,17 @@ EXPORT_SYMBOL_GPL(hyperv_cs); #ifdef CONFIG_HYPERV_TSCPAGE -static struct ms_hyperv_tsc_page *tsc_pg; +static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE); struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { - return tsc_pg; + return &tsc_pg; } EXPORT_SYMBOL_GPL(hv_get_tsc_page); static u64 notrace read_hv_sched_clock_tsc(void) { - u64 current_tick = hv_read_tsc_page(tsc_pg); + u64 current_tick = hv_read_tsc_page(&tsc_pg); if (current_tick == U64_MAX) hv_get_time_ref_count(current_tick); @@ -280,12 +280,8 @@ static bool __init hv_init_tsc_clocksource(void) if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) return false; - tsc_pg = vmalloc(PAGE_SIZE); - if (!tsc_pg) - return false; - hyperv_cs = &hyperv_cs_tsc; - phys_addr = page_to_phys(vmalloc_to_page(tsc_pg)); + phys_addr = virt_to_phys(&tsc_pg); /* * The Hyper-V TLFS specifies to preserve the value of reserved -- cgit From bd00cd52d5be655a2f217e2ed74b91a71cb2b14f Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Wed, 14 Aug 2019 20:32:16 +0800 Subject: clocksource/drivers/hyperv: Add Hyper-V specific sched clock function Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock on x86. But native_sched_clock() directly uses the raw TSC value, which can be discontinuous in a Hyper-V VM. Add the generic hv_setup_sched_clock() to set the sched clock function appropriately. On x86, this sets pv_ops.time.sched_clock to read the Hyper-V reference TSC value that is scaled and adjusted to be continuous. Also move the Hyper-V reference TSC initialization much earlier in the boot process so no discontinuity is observed when pv_ops.time.sched_clock calculates its offset. 
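[ Editorial note: the continuity trick is to capture the reference counter once at registration and subtract it on every read, so the scheduler clock starts at zero and stays monotonic; a minimal sketch following the patch: ]

    static u64 hv_sched_clock_offset __ro_after_init;

    static u64 read_hv_sched_clock_tsc(void)
    {
            /* The reference TSC is already scaled and adjusted by the
             * hypervisor, hence continuous where the raw TSC may jump. */
            return read_hv_clock_tsc(NULL) - hv_sched_clock_offset;
    }

    /* At init: hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); */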
[ tglx: Folded build fix ] Signed-off-by: Tianyu Lan Signed-off-by: Thomas Gleixner Reviewed-by: Michael Kelley Link: https://lkml.kernel.org/r/20190814123216.32245-3-Tianyu.Lan@microsoft.com --- arch/x86/hyperv/hv_init.c | 2 -- arch/x86/kernel/cpu/mshyperv.c | 8 ++++++++ drivers/clocksource/hyperv_timer.c | 22 ++++++++++++---------- include/asm-generic/mshyperv.h | 1 + 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 0d258688c8cf..866dfb3dca48 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -301,8 +301,6 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; - /* Register Hyper-V specific clocksource */ - hv_init_clocksource(); return; remove_cpuhp_state: diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 062f77279ce3..53afd33990eb 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -29,6 +29,7 @@ #include #include #include +#include struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -338,9 +339,16 @@ static void __init ms_hyperv_init_platform(void) x2apic_phys = 1; # endif + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); #endif } +void hv_setup_sched_clock(void *sched_clock) +{ + pv_ops.time.sched_clock = sched_clock; +} + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 432aa331df04..c322ab4d3689 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -22,6 +22,7 @@ #include static struct clock_event_device __percpu *hv_clock_event; +static u64 hv_sched_clock_offset __ro_after_init; /* * If false, we're using the old mechanism for stimer0 interrupts @@ -222,7 +223,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) } EXPORT_SYMBOL_GPL(hv_get_tsc_page); -static u64 notrace read_hv_sched_clock_tsc(void) +static u64 notrace read_hv_clock_tsc(struct clocksource *arg) { u64 current_tick = hv_read_tsc_page(&tsc_pg); @@ -232,9 +233,9 @@ static u64 notrace read_hv_sched_clock_tsc(void) return current_tick; } -static u64 read_hv_clock_tsc(struct clocksource *arg) +static u64 read_hv_sched_clock_tsc(void) { - return read_hv_sched_clock_tsc(); + return read_hv_clock_tsc(NULL) - hv_sched_clock_offset; } static struct clocksource hyperv_cs_tsc = { @@ -246,7 +247,7 @@ static struct clocksource hyperv_cs_tsc = { }; #endif -static u64 notrace read_hv_sched_clock_msr(void) +static u64 notrace read_hv_clock_msr(struct clocksource *arg) { u64 current_tick; /* @@ -258,9 +259,9 @@ static u64 notrace read_hv_sched_clock_msr(void) return current_tick; } -static u64 read_hv_clock_msr(struct clocksource *arg) +static u64 read_hv_sched_clock_msr(void) { - return read_hv_sched_clock_msr(); + return read_hv_clock_msr(NULL) - hv_sched_clock_offset; } static struct clocksource hyperv_cs_msr = { @@ -298,8 +299,9 @@ static bool __init hv_init_tsc_clocksource(void) hv_set_clocksource_vdso(hyperv_cs_tsc); clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - /* sched_clock_register is needed on ARM64 but is a no-op on x86 */ - sched_clock_register(read_hv_sched_clock_tsc, 64, HV_CLOCK_HZ); + hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_setup_sched_clock(read_hv_sched_clock_tsc); + return true; } #else @@ -329,7 +331,7 @@ void __init hv_init_clocksource(void) hyperv_cs = &hyperv_cs_msr; 
clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - /* sched_clock_register is needed on ARM64 but is a no-op on x86 */ - sched_clock_register(read_hv_sched_clock_msr, 64, HV_CLOCK_HZ); + hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_setup_sched_clock(read_hv_sched_clock_msr); } EXPORT_SYMBOL_GPL(hv_init_clocksource); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 0becb7d9704d..18d8e2d8210f 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -167,6 +167,7 @@ void hyperv_report_panic(struct pt_regs *regs, long err); void hyperv_report_panic_msg(phys_addr_t pa, size_t size); bool hv_is_hyperv_initialized(void); void hyperv_cleanup(void); +void hv_setup_sched_clock(void *sched_clock); #else /* CONFIG_HYPERV */ static inline bool hv_is_hyperv_initialized(void) { return false; } static inline void hyperv_cleanup(void) {} -- cgit From 3e2d94535adb2df15f3907e4b4c7cd8a5a4c2b5a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 22 Aug 2019 10:36:30 +0200 Subject: clocksource/drivers/hyperv: Enable TSC page clocksource on 32bit There is no particular reason to not enable TSC page clocksource on 32-bit. mul_u64_u64_shr() is available and despite the increased computational complexity (compared to 64-bit) TSC page is still a huge win compared to MSR-based clocksource. In-kernel reads: MSR based clocksource: 3361 cycles TSC page clocksource: 49 cycles Reads from userspace (utilizing vDSO in case of TSC page): MSR based clocksource: 5664 cycles TSC page clocksource: 131 cycles Enabling the TSC page on 32-bit allows getting rid of CONFIG_HYPERV_TSCPAGE, as it is now no different from CONFIG_HYPERV_TIMER. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Michael Kelley Link: https://lkml.kernel.org/r/20190822083630.17059-1-vkuznets@redhat.com --- arch/x86/include/asm/vdso/gettimeofday.h | 6 +++--- drivers/clocksource/hyperv_timer.c | 11 ----------- drivers/hv/Kconfig | 3 --- include/clocksource/hyperv_timer.h | 8 +++----- 4 files changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index ae91429129a6..bcbf901befbe 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -51,7 +51,7 @@ extern struct pvclock_vsyscall_time_info pvclock_page __attribute__((visibility("hidden"))); #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif @@ -192,7 +192,7 @@ static u64 vread_pvclock(void) } #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER static u64 vread_hvclock(void) { return hv_read_tsc_page(&hvclock_page); @@ -215,7 +215,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode) return vread_pvclock(); } #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER if (clock_mode == VCLOCK_HVCLOCK) { barrier(); return vread_hvclock(); diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index c322ab4d3689..2317d4e3daaf 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -213,8 +213,6 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); -#ifdef CONFIG_HYPERV_TSCPAGE - static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE); struct ms_hyperv_tsc_page *hv_get_tsc_page(void) @@ -245,7 +243,6 @@ static struct
clocksource hyperv_cs_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -#endif static u64 notrace read_hv_clock_msr(struct clocksource *arg) { @@ -272,7 +269,6 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -#ifdef CONFIG_HYPERV_TSCPAGE static bool __init hv_init_tsc_clocksource(void) { u64 tsc_msr; @@ -304,13 +300,6 @@ static bool __init hv_init_tsc_clocksource(void) return true; } -#else -static bool __init hv_init_tsc_clocksource(void) -{ - return false; -} -#endif - void __init hv_init_clocksource(void) { diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 9a59957922d4..79e5356a737a 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -14,9 +14,6 @@ config HYPERV config HYPERV_TIMER def_bool HYPERV -config HYPERV_TSCPAGE - def_bool HYPERV && X86_64 - config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" depends on HYPERV && CONNECTOR && NLS diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index a821deb8ecb2..422f5e5237be 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -28,12 +28,10 @@ extern void hv_stimer_cleanup(unsigned int cpu); extern void hv_stimer_global_cleanup(void); extern void hv_stimer0_isr(void); -#if IS_ENABLED(CONFIG_HYPERV) +#ifdef CONFIG_HYPERV_TIMER extern struct clocksource *hyperv_cs; extern void hv_init_clocksource(void); -#endif /* CONFIG_HYPERV */ -#ifdef CONFIG_HYPERV_TSCPAGE extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline notrace u64 @@ -91,7 +89,7 @@ hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); } -#else /* CONFIG_HYPERV_TSC_PAGE */ +#else /* CONFIG_HYPERV_TIMER */ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return NULL; @@ -102,6 +100,6 @@ static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, { return U64_MAX; } -#endif /* CONFIG_HYPERV_TSCPAGE */ +#endif /* CONFIG_HYPERV_TIMER */ #endif -- cgit From 9f475d084c032116cbecd4dc840003dc36465db5 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Jul 2019 11:15:04 -0700 Subject: clocksource: Remove dev_err() usage after platform_get_irq() We don't need dev_err() messages when platform_get_irq() fails now that platform_get_irq() prints an error message itself when something goes wrong. Let's remove these prints with a simple semantic patch. // @@ expression ret; struct platform_device *E; @@ ret = ( platform_get_irq(E, ...) | platform_get_irq_byname(E, ...) ); if ( \( ret < 0 \| ret <= 0 \) ) { ( -if (ret != -EPROBE_DEFER) -{ ... -dev_err(...); -... } | ... -dev_err(...); ) ... } // While we're here, remove braces on if statements that only have one statement (manually). 
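[ Editorial note: the semantic patch above can be replayed with Coccinelle; the .cocci file name below is hypothetical, the spatch options are standard: ]

    spatch --sp-file remove_dev_err.cocci --in-place --dir drivers/clocksource/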
Cc: Greg Kroah-Hartman Cc: Daniel Lezcano Cc: Thomas Gleixner Signed-off-by: Stephen Boyd Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- drivers/clocksource/em_sti.c | 4 +--- drivers/clocksource/sh_cmt.c | 5 +---- drivers/clocksource/sh_tmu.c | 5 +---- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index 8e12b11e81b0..9039df4f90e2 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -291,10 +291,8 @@ static int em_sti_probe(struct platform_device *pdev) platform_set_drvdata(pdev, p); irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(&pdev->dev, "failed to get irq\n"); + if (irq < 0) return irq; - } /* map memory, let base point to the STI instance */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 55d3e03f2cd4..f6424b61e212 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -776,11 +776,8 @@ static int sh_cmt_register_clockevent(struct sh_cmt_channel *ch, int ret; irq = platform_get_irq(ch->cmt->pdev, ch->index); - if (irq < 0) { - dev_err(&ch->cmt->pdev->dev, "ch%u: failed to get irq\n", - ch->index); + if (irq < 0) return irq; - } ret = request_irq(irq, sh_cmt_interrupt, IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING, diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index 49f1c805fc95..8c4f3753b36e 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -462,11 +462,8 @@ static int sh_tmu_channel_setup(struct sh_tmu_channel *ch, unsigned int index, ch->base = tmu->mapbase + 8 + ch->index * 12; ch->irq = platform_get_irq(tmu->pdev, index); - if (ch->irq < 0) { - dev_err(&tmu->pdev->dev, "ch%u: failed to get irq\n", - ch->index); + if (ch->irq < 0) return ch->irq; - } ch->cs_enabled = false; ch->enable_count = 0; -- cgit From a08bda2d27f271aacc92b75f4f260d3136cf6f58 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 22 Jul 2019 10:12:19 +0200 Subject: dt-bindings: timer: Convert Allwinner A10 Timer to a schema The older Allwinner SoCs have a Timer supported in Linux, with a matching Device Tree binding. While the original binding only mentions one interrupt, the timer actually has 6 of them. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schema.
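[ Editorial note: a converted binding like this one can be validated locally, assuming the dt-schema tooling is installed; an illustrative invocation: ]

    make dt_binding_check DT_SCHEMA_FILES=Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml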
Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano --- .../bindings/timer/allwinner,sun4i-a10-timer.yaml | 76 ++++++++++++++++++++++ .../bindings/timer/allwinner,sun4i-timer.txt | 19 ------ 2 files changed, 76 insertions(+), 19 deletions(-) create mode 100644 Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml delete mode 100644 Documentation/devicetree/bindings/timer/allwinner,sun4i-timer.txt diff --git a/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml b/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml new file mode 100644 index 000000000000..7292a424092c --- /dev/null +++ b/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/allwinner,sun4i-a10-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 Timer Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + compatible: + enum: + - allwinner,sun4i-a10-timer + - allwinner,suniv-f1c100s-timer + + reg: + maxItems: 1 + + interrupts: + description: + List of timer interrupts + + clocks: + maxItems: 1 + +allOf: + - if: + properties: + compatible: + items: + const: allwinner,sun4i-a10-timer + + then: + properties: + interrupts: + minItems: 6 + maxItems: 6 + + - if: + properties: + compatible: + items: + const: allwinner,suniv-f1c100s-timer + + then: + properties: + interrupts: + minItems: 3 + maxItems: 3 + +required: + - compatible + - reg + - interrupts + - clocks + +additionalProperties: false + +examples: + - | + timer { + compatible = "allwinner,sun4i-a10-timer"; + reg = <0x01c20c00 0x400>; + interrupts = <22>, + <23>, + <24>, + <25>, + <67>, + <68>; + clocks = <&osc>; + }; + +... diff --git a/Documentation/devicetree/bindings/timer/allwinner,sun4i-timer.txt b/Documentation/devicetree/bindings/timer/allwinner,sun4i-timer.txt deleted file mode 100644 index 3da9d515c03a..000000000000 --- a/Documentation/devicetree/bindings/timer/allwinner,sun4i-timer.txt +++ /dev/null @@ -1,19 +0,0 @@ -Allwinner A1X SoCs Timer Controller - -Required properties: - -- compatible : should be one of the following: - "allwinner,sun4i-a10-timer" - "allwinner,suniv-f1c100s-timer" -- reg : Specifies base physical address and size of the registers. -- interrupts : The interrupt of the first timer -- clocks: phandle to the source clock (usually a 24 MHz fixed clock) - -Example: - -timer { - compatible = "allwinner,sun4i-a10-timer"; - reg = <0x01c20c00 0x400>; - interrupts = <22>; - clocks = <&osc>; -}; -- cgit From 7fccfcd678e80cc8cf131922296eccf72e19a69c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 22 Jul 2019 10:12:20 +0200 Subject: dt-bindings: timer: Add missing compatibles Newer Allwinner SoCs have a different number of interrupts; let's add separate compatibles for all of them to deal with this properly.
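[ Editorial note: under the extended binding, a node for one of the new compatibles would look roughly as follows; register size and interrupt numbers are illustrative, not taken from the patch: ]

    timer@1c20c00 {
            compatible = "allwinner,sun8i-a23-timer";
            reg = <0x01c20c00 0xa0>;
            interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>,
                         <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
            clocks = <&osc24M>;
    };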
Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano --- .../bindings/timer/allwinner,sun4i-a10-timer.yaml | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml b/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml index 7292a424092c..20adc1c8e9cc 100644 --- a/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml +++ b/Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml @@ -14,6 +14,8 @@ properties: compatible: enum: - allwinner,sun4i-a10-timer + - allwinner,sun8i-a23-timer + - allwinner,sun8i-v3s-timer - allwinner,suniv-f1c100s-timer reg: @@ -39,6 +41,30 @@ allOf: minItems: 6 maxItems: 6 + - if: + properties: + compatible: + items: + const: allwinner,sun8i-a23-timer + + then: + properties: + interrupts: + minItems: 2 + maxItems: 2 + + - if: + properties: + compatible: + items: + const: allwinner,sun8i-v3s-timer + + then: + properties: + interrupts: + minItems: 3 + maxItems: 3 + - if: properties: compatible: -- cgit From bca4e08426e48d1f59b9bb7a8835b08140525828 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 22 Jul 2019 10:12:21 +0200 Subject: clocksource: sun4i: Add missing compatibles Newer Allwinner SoCs have a different number of interrupts; let's add separate compatibles for all of them to deal with this properly. Signed-off-by: Maxime Ripard Acked-by: Daniel Lezcano Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-sun4i.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/clocksource/timer-sun4i.c b/drivers/clocksource/timer-sun4i.c index 65f38f6ca714..0ba8155b8287 100644 --- a/drivers/clocksource/timer-sun4i.c +++ b/drivers/clocksource/timer-sun4i.c @@ -219,5 +219,9 @@ static int __init sun4i_timer_init(struct device_node *node) } TIMER_OF_DECLARE(sun4i, "allwinner,sun4i-a10-timer", sun4i_timer_init); +TIMER_OF_DECLARE(sun8i_a23, "allwinner,sun8i-a23-timer", + sun4i_timer_init); +TIMER_OF_DECLARE(sun8i_v3s, "allwinner,sun8i-v3s-timer", + sun4i_timer_init); TIMER_OF_DECLARE(suniv, "allwinner,suniv-f1c100s-timer", sun4i_timer_init); -- cgit From d9b51093cca430f75a054d78eb35a6e13c1540cb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 22 Jul 2019 10:12:22 +0200 Subject: dt-bindings: timer: Convert Allwinner A13 HSTimer to a schema The newer Allwinner SoCs have a High Speed Timer supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schema.
Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano --- .../bindings/timer/allwinner,sun5i-a13-hstimer.txt | 26 ------- .../timer/allwinner,sun5i-a13-hstimer.yaml | 79 ++++++++++++++++++++++ 2 files changed, 79 insertions(+), 26 deletions(-) delete mode 100644 Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.txt create mode 100644 Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.yaml diff --git a/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.txt b/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.txt deleted file mode 100644 index 2c5c1be78360..000000000000 --- a/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.txt +++ /dev/null @@ -1,26 +0,0 @@ -Allwinner SoCs High Speed Timer Controller - -Required properties: - -- compatible : should be "allwinner,sun5i-a13-hstimer" or - "allwinner,sun7i-a20-hstimer" -- reg : Specifies base physical address and size of the registers. -- interrupts : The interrupts of these timers (2 for the sun5i IP, 4 for the sun7i - one) -- clocks: phandle to the source clock (usually the AHB clock) - -Optional properties: -- resets: phandle to a reset controller asserting the timer - -Example: - -timer@1c60000 { - compatible = "allwinner,sun7i-a20-hstimer"; - reg = <0x01c60000 0x1000>; - interrupts = <0 51 1>, - <0 52 1>, - <0 53 1>, - <0 54 1>; - clocks = <&ahb1_gates 19>; - resets = <&ahb1rst 19>; -}; diff --git a/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.yaml b/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.yaml new file mode 100644 index 000000000000..dfa0c41fd261 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.yaml @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/allwinner,sun5i-a13-hstimer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A13 High-Speed Timer Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + compatible: + oneOf: + - const: allwinner,sun5i-a13-hstimer + - const: allwinner,sun7i-a20-hstimer + - items: + - const: allwinner,sun6i-a31-hstimer + - const: allwinner,sun7i-a20-hstimer + + reg: + maxItems: 1 + + interrupts: + minItems: 2 + maxItems: 4 + items: + - description: Timer 0 Interrupt + - description: Timer 1 Interrupt + - description: Timer 2 Interrupt + - description: Timer 3 Interrupt + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + +if: + properties: + compatible: + items: + const: allwinner,sun5i-a13-hstimer + +then: + properties: + interrupts: + minItems: 2 + maxItems: 2 + +else: + properties: + interrupts: + minItems: 4 + maxItems: 4 + +additionalProperties: false + +examples: + - | + timer@1c60000 { + compatible = "allwinner,sun7i-a20-hstimer"; + reg = <0x01c60000 0x1000>; + interrupts = <0 51 1>, + <0 52 1>, + <0 53 1>, + <0 54 1>; + clocks = <&ahb1_gates 19>; + resets = <&ahb1rst 19>; + }; + +... -- cgit From 1ce861cec03c79a68bae81a7e039edae46b2c493 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 13 Aug 2019 15:30:50 +0200 Subject: clocksource/drivers/tcb_clksrc: Register delay timer Implement and register delay timer to allow get_cycles() to work properly. 
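[ Editorial note: on ARM, registering a delay timer makes udelay() and friends use this counter instead of the calibrated busy loop; a minimal sketch of the hook, with names taken from the patch below: ]

    static struct delay_timer tc_delay_timer;

    /* return the current free-running counter value */
    static unsigned long tc_delay_timer_read(void)
    {
            return tc_get_cycles(&clksrc);
    }

    tc_delay_timer.read_current_timer = tc_delay_timer_read;
    tc_delay_timer.freq = divided_rate;
    register_current_timer_delay(&tc_delay_timer);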
Signed-off-by: Alexandre Belloni Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- drivers/clocksource/timer-atmel-tcb.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 5e9317dc3d39..a642c23b2fba 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -429,7 +429,7 @@ config ATMEL_ST config ATMEL_TCB_CLKSRC bool "Atmel TC Block timer driver" if COMPILE_TEST - depends on HAS_IOMEM + depends on ARM && HAS_IOMEM select TIMER_OF if OF help Support for Timer Counter Blocks on Atmel SoCs. diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c index 6ed31f9def7e..7427b07495a8 100644 --- a/drivers/clocksource/timer-atmel-tcb.c +++ b/drivers/clocksource/timer-atmel-tcb.c @@ -6,6 +6,7 @@ #include #include +#include <linux/delay.h> #include #include #include @@ -125,6 +126,18 @@ static u64 notrace tc_sched_clock_read32(void) return tc_get_cycles32(&clksrc); } +static struct delay_timer tc_delay_timer; + +static unsigned long tc_delay_timer_read(void) +{ + return tc_get_cycles(&clksrc); +} + +static unsigned long notrace tc_delay_timer_read32(void) +{ + return tc_get_cycles32(&clksrc); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS struct tc_clkevt_device { @@ -432,6 +445,7 @@ static int __init tcb_clksrc_init(struct device_node *node) /* setup ony channel 0 */ tcb_setup_single_chan(&tc, best_divisor_idx); tc_sched_clock = tc_sched_clock_read32; + tc_delay_timer.read_current_timer = tc_delay_timer_read32; } else { /* we have three clocks no matter what the * underlying platform supports. */ @@ -444,6 +458,7 @@ static int __init tcb_clksrc_init(struct device_node *node) /* setup both channel 0 & 1 */ tcb_setup_dual_chan(&tc, best_divisor_idx); tc_sched_clock = tc_sched_clock_read; + tc_delay_timer.read_current_timer = tc_delay_timer_read; } /* and away we go! */ @@ -458,6 +473,9 @@ static int __init tcb_clksrc_init(struct device_node *node) sched_clock_register(tc_sched_clock, 32, divided_rate); + tc_delay_timer.freq = divided_rate; + register_current_timer_delay(&tc_delay_timer); + return 0; err_unregister_clksrc: -- cgit From 4419e19d8117dda6bccfbc62090e766f469ff20a Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 15 Aug 2019 20:38:42 -0400 Subject: clocksource/drivers/imx-sysctr: Add internal clock divider handle The system counter block guide states that the base clock is internally divided by 3 before use. That means the clock input of the system counter defined in DT should be the base clock, which normally comes from the OSC and is then divided by 3 internally before use.
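As a worked example of the resulting counter rate (the 24 MHz OSC figure below is illustrative, not taken from this patch):

  /* DT clock input (OSC):    24000000 Hz
   * internal divider:        SYS_CTR_CLK_DIV = 3
   * effective counter rate:  24000000 / 3 = 8000000 Hz
   * which the driver now accounts for with:
   */
  to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV;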
Signed-off-by: Anson Huang Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-imx-sysctr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/clocksource/timer-imx-sysctr.c b/drivers/clocksource/timer-imx-sysctr.c index fd7d68066efb..b7c80a368a1b 100644 --- a/drivers/clocksource/timer-imx-sysctr.c +++ b/drivers/clocksource/timer-imx-sysctr.c @@ -20,6 +20,8 @@ #define SYS_CTR_EN 0x1 #define SYS_CTR_IRQ_MASK 0x2 +#define SYS_CTR_CLK_DIV 0x3 + static void __iomem *sys_ctr_base; static u32 cmpcr; @@ -134,6 +136,9 @@ static int __init sysctr_timer_init(struct device_node *np) if (ret) return ret; + /* system counter clock is divided by 3 internally */ + to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; + sys_ctr_base = timer_of_base(&to_sysctr); cmpcr = readl(sys_ctr_base + CMPCR); cmpcr &= ~SYS_CTR_EN; -- cgit From 5b0221bf7b08163030e6801255ffbaf52775df01 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 15 Aug 2019 20:38:43 -0400 Subject: arm64: dts: imx8mm: Add system counter node Add i.MX8MM system counter node to enable timer-imx-sysctr broadcast timer driver. Signed-off-by: Anson Huang Signed-off-by: Daniel Lezcano --- arch/arm64/boot/dts/freescale/imx8mm.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index 232a7412755a..89ef22a8f81e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -510,6 +510,14 @@ #pwm-cells = <2>; status = "disabled"; }; + + system_counter: timer@306a0000 { + compatible = "nxp,sysctr-timer"; + reg = <0x306a0000 0x20000>; + interrupts = ; + clocks = <&osc_24m>; + clock-names = "per"; + }; }; aips3: bus@30800000 { -- cgit From 24e8a5db8ae46bf021d3b4063c005f443282ab4f Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 15 Aug 2019 20:38:44 -0400 Subject: arm64: dts: imx8mq: Add system counter node Add i.MX8MQ system counter node to enable timer-imx-sysctr broadcast timer driver. Signed-off-by: Anson Huang Signed-off-by: Daniel Lezcano --- arch/arm64/boot/dts/freescale/imx8mq.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi index d09b808eff87..b4529773af51 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi @@ -635,6 +635,14 @@ #pwm-cells = <2>; status = "disabled"; }; + + system_counter: timer@306a0000 { + compatible = "nxp,sysctr-timer"; + reg = <0x306a0000 0x20000>; + interrupts = ; + clocks = <&osc_25m>; + clock-names = "per"; + }; }; bus@30800000 { /* AIPS3 */ -- cgit From 8d18f6fc75c86b2efa5865655a0630a083c8a1fd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 7 Aug 2019 10:46:33 +0200 Subject: clocksource/drivers/renesas-ostm: Use DIV_ROUND_CLOSEST() helper Use the DIV_ROUND_CLOSEST() helper instead of open-coding the same operation. 
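For non-negative values the helper behaves exactly like the open-coded expression; a simplified sketch (the rate value is made up for illustration):

  /* DIV_ROUND_CLOSEST(x, d) acts like ((x) + (d) / 2) / (d) here,
   * e.g. rate = 33333333 and HZ = 100:
   *   (33333333 + 50) / 100 = 333333 ticks per jiffy
   */
  ostm->ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ);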
Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman Signed-off-by: Daniel Lezcano --- drivers/clocksource/renesas-ostm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/renesas-ostm.c b/drivers/clocksource/renesas-ostm.c index 61d5f3b539ce..37c39b901bb1 100644 --- a/drivers/clocksource/renesas-ostm.c +++ b/drivers/clocksource/renesas-ostm.c @@ -221,7 +221,7 @@ static int __init ostm_init(struct device_node *np) } rate = clk_get_rate(ostm_clk); - ostm->ticks_per_jiffy = (rate + HZ / 2) / HZ; + ostm->ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ); /* * First probed device will be used as system clocksource. Any -- cgit From a2b58537b4a1cc08fd254fb8d1c24191ce286ae1 Mon Sep 17 00:00:00 2001 From: Avi Fishman Date: Mon, 29 Jul 2019 20:03:54 +0300 Subject: clocksource/drivers/npcm: Fix GENMASK and timer operation NPCM7XX_Tx_OPER GENMASK bits are wrong; fix them. Hopefully the NPCM7XX_REG_TICR0 register reset value of those bits was 0, so it did not cause an issue. The function npcm7xx_timer_oneshot() reads the register NPCM7XX_REG_TCSR0, modifies it and then reads it again, overwriting the previous changes. Remove the extra read, which is pointless. The function npcm7xx_timer_periodic() is correct, but the code writes to the NPCM7XX_REG_TICR0 register while it is dealing with the NPCM7XX_REG_TCSR0 register, which is confusing. Separate the writes to the registers in the code for the sake of clarity. Fixes: 1c00289ecd12 ("clocksource/drivers/npcm: Add NPCM7xx timer driver") Signed-off-by: Avi Fishman Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-npcm7xx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/timer-npcm7xx.c b/drivers/clocksource/timer-npcm7xx.c index 8a30da7f083b..9780ffd8010e 100644 --- a/drivers/clocksource/timer-npcm7xx.c +++ b/drivers/clocksource/timer-npcm7xx.c @@ -32,7 +32,7 @@ #define NPCM7XX_Tx_INTEN BIT(29) #define NPCM7XX_Tx_COUNTEN BIT(30) #define NPCM7XX_Tx_ONESHOT 0x0 -#define NPCM7XX_Tx_OPER GENMASK(27, 3) +#define NPCM7XX_Tx_OPER GENMASK(28, 27) #define NPCM7XX_Tx_MIN_PRESCALE 0x1 #define NPCM7XX_Tx_TDR_MASK_BITS 24 #define NPCM7XX_Tx_MAX_CNT 0xFFFFFF @@ -84,8 +84,6 @@ static int npcm7xx_timer_oneshot(struct clock_event_device *evt) val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0); val &= ~NPCM7XX_Tx_OPER; - - val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0); val |= NPCM7XX_START_ONESHOT_Tx; writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0); @@ -97,12 +95,11 @@ static int npcm7xx_timer_periodic(struct clock_event_device *evt) struct timer_of *to = to_timer_of(evt); u32 val; + writel(timer_of_period(to), timer_of_base(to) + NPCM7XX_REG_TICR0); + val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0); val &= ~NPCM7XX_Tx_OPER; - - writel(timer_of_period(to), timer_of_base(to) + NPCM7XX_REG_TICR0); val |= NPCM7XX_START_PERIODIC_Tx; - writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0); return 0; -- cgit From 763719771e84b8c8c2f53af668cdc905faa608de Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Wed, 21 Aug 2019 16:02:40 +0100 Subject: clocksource/drivers/timer-of: Do not warn on deferred probe Deferred probe is an expected return value for clk_get() on many platforms. The driver deals with it properly, so there's no need to output a warning that may potentially confuse users.
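The fix below follows the usual pattern for providers that may not be bound yet; a minimal sketch (names are illustrative, not from this patch):

  clk = of_clk_get(np, 0);
  if (IS_ERR(clk)) {
          ret = PTR_ERR(clk);
          /* The clock provider may simply not be bound yet; the
           * init will be retried later, so only complain about
           * real failures. */
          if (ret != -EPROBE_DEFER)
                  pr_err("Failed to get clock for %pOF\n", np);
          return ret;
  }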
Signed-off-by: Jon Hunter Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-of.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index 80542289fae7..d8c2bd4391d0 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -113,8 +113,10 @@ static __init int timer_of_clk_init(struct device_node *np, of_clk->clk = of_clk->name ? of_clk_get_by_name(np, of_clk->name) : of_clk_get(np, of_clk->index); if (IS_ERR(of_clk->clk)) { - pr_err("Failed to get clock for %pOF\n", np); - return PTR_ERR(of_clk->clk); + ret = PTR_ERR(of_clk->clk); + if (ret != -EPROBE_DEFER) + pr_err("Failed to get clock for %pOF\n", np); + goto out; } ret = clk_prepare_enable(of_clk->clk); -- cgit From 14e019df1e64c8b19ce8e0b3da25b6f40c8716be Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Wed, 21 Aug 2019 16:02:41 +0100 Subject: clocksource/drivers: Do not warn on probe defer Deferred probe is an expected return value on many platforms and so there's no need to output a warning that may potentially confuse users. Signed-off-by: Jon Hunter Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-probe.c b/drivers/clocksource/timer-probe.c index dda1946e84dd..ee9574da53c0 100644 --- a/drivers/clocksource/timer-probe.c +++ b/drivers/clocksource/timer-probe.c @@ -29,7 +29,9 @@ void __init timer_probe(void) ret = init_func_ret(np); if (ret) { - pr_err("Failed to initialize '%pOF': %d\n", np, ret); + if (ret != -EPROBE_DEFER) + pr_err("Failed to initialize '%pOF': %d\n", np, + ret); continue; } -- cgit From c90d37c9c41a572ea7183299951341b4640d5b4b Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:03 +0900 Subject: dt-bindings: timer: renesas, cmt: Add CMT0234 to sh73a0 and r8a7740 Document the on-chip CMT devices included in r8a7740 and sh73a0. Included in this patch is DT binding documentation for 32-bit CMTs CMT0, CMT2, CMT3 and CMT4. They all contain a single channel and are quite similar; however, some minor differences still exist: - "Counter input clock" (clock input and on-device divider) One example is that RCLK 1/1 is supported by CMT2, CMT3 and CMT4. - "Wakeup request" (supported by CMT0 and CMT2) Because of this, one unique compat string per CMT device is selected. Signed-off-by: Magnus Damm Reviewed-by: Rob Herring Reviewed-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/renesas,cmt.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index c5220bcd852b..45840d475050 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -22,6 +22,10 @@ Required Properties: - "renesas,r8a73a4-cmt0" for the 32-bit CMT0 device included in r8a73a4. - "renesas,r8a73a4-cmt1" for the 48-bit CMT1 device included in r8a73a4. + - "renesas,r8a7740-cmt0" for the 32-bit CMT0 device included in r8a7740. + - "renesas,r8a7740-cmt2" for the 32-bit CMT2 device included in r8a7740. + - "renesas,r8a7740-cmt3" for the 32-bit CMT3 device included in r8a7740. + - "renesas,r8a7740-cmt4" for the 32-bit CMT4 device included in r8a7740. - "renesas,r8a7743-cmt0" for the 32-bit CMT0 device included in r8a7743.
- "renesas,r8a7743-cmt1" for the 48-bit CMT1 device included in r8a7743. - "renesas,r8a7744-cmt0" for the 32-bit CMT0 device included in r8a7744. @@ -54,6 +58,10 @@ Required Properties: - "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980. - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990. - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990. + - "renesas,sh73a0-cmt0" for the 32-bit CMT0 device included in sh73a0. + - "renesas,sh73a0-cmt2" for the 32-bit CMT2 device included in sh73a0. + - "renesas,sh73a0-cmt3" for the 32-bit CMT3 device included in sh73a0. + - "renesas,sh73a0-cmt4" for the 32-bit CMT4 device included in sh73a0. - "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2 and RZ/G1. -- cgit From 81b604c39997de91f4b2912f803074c85045fe36 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:14 +0900 Subject: dt-bindings: timer: renesas, cmt: Update CMT1 on sh73a0 and r8a7740 This patch reworks the DT binding documentation for the 6-channel 48-bit CMTs known as CMT1 on r8a7740 and sh73a0. After the update the same style of DT binding as the rest of the upstream SoCs will now also be used by r8a7740 and sh73a0. The DT binding "cmt-48" is removed from the DT binding documentation, however software support for this deprecated binding will still remain in the CMT driver for some time. Signed-off-by: Magnus Damm Reviewed-by: Rob Herring Reviewed-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/renesas,cmt.txt | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index 45840d475050..a297fca5b61e 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -12,17 +12,10 @@ datasheets. Required Properties: - compatible: must contain one or more of the following: - - "renesas,cmt-48-sh73a0" for the sh73A0 48-bit CMT - (CMT1) - - "renesas,cmt-48-r8a7740" for the r8a7740 48-bit CMT - (CMT1) - - "renesas,cmt-48" for all non-second generation 48-bit CMT - (CMT1 on sh73a0 and r8a7740) - This is a fallback for the above renesas,cmt-48-* entries. - - "renesas,r8a73a4-cmt0" for the 32-bit CMT0 device included in r8a73a4. - "renesas,r8a73a4-cmt1" for the 48-bit CMT1 device included in r8a73a4. - "renesas,r8a7740-cmt0" for the 32-bit CMT0 device included in r8a7740. + - "renesas,r8a7740-cmt1" for the 48-bit CMT1 device included in r8a7740. - "renesas,r8a7740-cmt2" for the 32-bit CMT2 device included in r8a7740. - "renesas,r8a7740-cmt3" for the 32-bit CMT3 device included in r8a7740. - "renesas,r8a7740-cmt4" for the 32-bit CMT4 device included in r8a7740. @@ -59,6 +52,7 @@ Required Properties: - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990. - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990. - "renesas,sh73a0-cmt0" for the 32-bit CMT0 device included in sh73a0. + - "renesas,sh73a0-cmt1" for the 48-bit CMT1 device included in sh73a0. - "renesas,sh73a0-cmt2" for the 32-bit CMT2 device included in sh73a0. - "renesas,sh73a0-cmt3" for the 32-bit CMT3 device included in sh73a0. - "renesas,sh73a0-cmt4" for the 32-bit CMT4 device included in sh73a0. 
-- cgit From 649dd060334f13792f624ec3fa8a0024ed1e02bc Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:25 +0900 Subject: dt-bindings: timer: renesas, cmt: Add CMT0 and CMT1 to r8a7792 This patch adds DT binding documentation for the CMT devices on the R-Car Gen2 V2H (r8a7792) SoC. Signed-off-by: Magnus Damm Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Reviewed-by: Simon Horman Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/renesas,cmt.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index a297fca5b61e..5b7690ae8b9d 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -35,6 +35,8 @@ Required Properties: - "renesas,r8a7790-cmt1" for the 48-bit CMT1 device included in r8a7790. - "renesas,r8a7791-cmt0" for the 32-bit CMT0 device included in r8a7791. - "renesas,r8a7791-cmt1" for the 48-bit CMT1 device included in r8a7791. + - "renesas,r8a7792-cmt0" for the 32-bit CMT0 device included in r8a7792. + - "renesas,r8a7792-cmt1" for the 48-bit CMT1 device included in r8a7792. - "renesas,r8a7793-cmt0" for the 32-bit CMT0 device included in r8a7793. - "renesas,r8a7793-cmt1" for the 48-bit CMT1 device included in r8a7793. - "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794. -- cgit From 53933bc3a69e0f07a1af2fea16fda9c816ffcf87 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:36 +0900 Subject: dt-bindings: timer: renesas, cmt: Add CMT0 and CMT1 to r8a77995 This patch adds DT binding documentation for the CMT devices on the R-Car Gen3 D3 (r8a77995) SoC. Signed-off-by: Magnus Damm Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Reviewed-by: Simon Horman Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/renesas,cmt.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index 5b7690ae8b9d..c7fdcb02e083 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -53,6 +53,8 @@ Required Properties: - "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980. - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990. - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990. + - "renesas,r8a77995-cmt0" for the 32-bit CMT0 device included in r8a77995. + - "renesas,r8a77995-cmt1" for the 48-bit CMT1 device included in r8a77995. - "renesas,sh73a0-cmt0" for the 32-bit CMT0 device included in sh73a0. - "renesas,sh73a0-cmt1" for the 48-bit CMT1 device included in sh73a0. - "renesas,sh73a0-cmt2" for the 32-bit CMT2 device included in sh73a0. -- cgit From 1be8c9fd2ac9ad730cf537b8909f66c357866c5d Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:46 +0900 Subject: dt-bindings: timer: renesas, cmt: Update R-Car Gen3 CMT1 usage The R-Car Gen3 SoCs so far come with a total of 4 on-chip CMT devices: - CMT0 - CMT1 - CMT2 - CMT3 CMT0 includes two rather basic 32-bit timer channels. The rest of the on-chip CMT devices support 48-bit counters and have 8 channels each. Based on the data sheet information "CMT2/3 are exactly same as CMT1", it seems that CMT2 and CMT3 now use the CMT1 compat string in the DTSI.
Clarify this in the DT binding documentation by describing R-Car Gen3 and RZ/G2 CMT1 as "48-bit CMT devices". Signed-off-by: Magnus Damm Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Reviewed-by: Simon Horman Signed-off-by: Daniel Lezcano --- .../devicetree/bindings/timer/renesas,cmt.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index c7fdcb02e083..a444cfc5852a 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -28,9 +28,9 @@ Required Properties: - "renesas,r8a77470-cmt0" for the 32-bit CMT0 device included in r8a77470. - "renesas,r8a77470-cmt1" for the 48-bit CMT1 device included in r8a77470. - "renesas,r8a774a1-cmt0" for the 32-bit CMT0 device included in r8a774a1. - - "renesas,r8a774a1-cmt1" for the 48-bit CMT1 device included in r8a774a1. + - "renesas,r8a774a1-cmt1" for the 48-bit CMT devices included in r8a774a1. - "renesas,r8a774c0-cmt0" for the 32-bit CMT0 device included in r8a774c0. - - "renesas,r8a774c0-cmt1" for the 48-bit CMT1 device included in r8a774c0. + - "renesas,r8a774c0-cmt1" for the 48-bit CMT devices included in r8a774c0. - "renesas,r8a7790-cmt0" for the 32-bit CMT0 device included in r8a7790. - "renesas,r8a7790-cmt1" for the 48-bit CMT1 device included in r8a7790. - "renesas,r8a7791-cmt0" for the 32-bit CMT0 device included in r8a7791. @@ -42,19 +42,19 @@ Required Properties: - "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794. - "renesas,r8a7794-cmt1" for the 48-bit CMT1 device included in r8a7794. - "renesas,r8a7795-cmt0" for the 32-bit CMT0 device included in r8a7795. - - "renesas,r8a7795-cmt1" for the 48-bit CMT1 device included in r8a7795. + - "renesas,r8a7795-cmt1" for the 48-bit CMT devices included in r8a7795. - "renesas,r8a7796-cmt0" for the 32-bit CMT0 device included in r8a7796. - - "renesas,r8a7796-cmt1" for the 48-bit CMT1 device included in r8a7796. + - "renesas,r8a7796-cmt1" for the 48-bit CMT devices included in r8a7796. - "renesas,r8a77965-cmt0" for the 32-bit CMT0 device included in r8a77965. - - "renesas,r8a77965-cmt1" for the 48-bit CMT1 device included in r8a77965. + - "renesas,r8a77965-cmt1" for the 48-bit CMT devices included in r8a77965. - "renesas,r8a77970-cmt0" for the 32-bit CMT0 device included in r8a77970. - - "renesas,r8a77970-cmt1" for the 48-bit CMT1 device included in r8a77970. + - "renesas,r8a77970-cmt1" for the 48-bit CMT devices included in r8a77970. - "renesas,r8a77980-cmt0" for the 32-bit CMT0 device included in r8a77980. - - "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980. + - "renesas,r8a77980-cmt1" for the 48-bit CMT devices included in r8a77980. - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990. - - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990. + - "renesas,r8a77990-cmt1" for the 48-bit CMT devices included in r8a77990. - "renesas,r8a77995-cmt0" for the 32-bit CMT0 device included in r8a77995. - - "renesas,r8a77995-cmt1" for the 48-bit CMT1 device included in r8a77995. + - "renesas,r8a77995-cmt1" for the 48-bit CMT devices included in r8a77995. - "renesas,sh73a0-cmt0" for the 32-bit CMT0 device included in sh73a0. - "renesas,sh73a0-cmt1" for the 48-bit CMT1 device included in sh73a0. - "renesas,sh73a0-cmt2" for the 32-bit CMT2 device included in sh73a0. 
@@ -69,7 +69,7 @@ Required Properties: listed above. - "renesas,rcar-gen3-cmt0" for 32-bit CMT0 devices included in R-Car Gen3 and RZ/G2. - - "renesas,rcar-gen3-cmt1" for 48-bit CMT1 devices included in R-Car Gen3 + - "renesas,rcar-gen3-cmt1" for 48-bit CMT devices included in R-Car Gen3 and RZ/G2. These are fallbacks for R-Car Gen3 and RZ/G2 entries listed above. -- cgit From 8c1afba285a86b9dbb0637f8c70a34fe2d88569e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:35:56 +0900 Subject: clocksource/drivers/sh_cmt: r8a7740 and sh73a0 SoC-specific match Add SoC-specific matching for CMT1 on r8a7740 and sh73a0. This allows us to move away from the old DT bindings such as - "renesas,cmt-48-sh73a0" - "renesas,cmt-48-r8a7740" - "renesas,cmt-48" in favour of the now commonly used format "renesas,<soc>-<device>". Signed-off-by: Magnus Damm Reviewed-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index f6424b61e212..abf5e7873a18 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -924,6 +924,14 @@ static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { .compatible = "renesas,cmt-48-gen2", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] }, + { + .compatible = "renesas,r8a7740-cmt1", + .data = &sh_cmt_info[SH_CMT_48BIT] + }, + { + .compatible = "renesas,sh73a0-cmt1", + .data = &sh_cmt_info[SH_CMT_48BIT] + }, { .compatible = "renesas,rcar-gen2-cmt0", .data = &sh_cmt_info[SH_CMT0_RCAR_GEN2] -- cgit From 19d608458f4f3bb3a1f89bd7e4814c3fd30dbec7 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 20 Aug 2019 21:36:07 +0900 Subject: clocksource/drivers/sh_cmt: Document "cmt-48" as deprecated Update the CMT driver to mark "renesas,cmt-48" as deprecated. Instead of documenting a theoretical hardware device based on current software support level, define DT bindings top-down based on available data sheet information and make use of part numbers in the DT compat string. In case of the only in-tree users, r8a7740 and sh73a0, the compat strings "renesas,r8a7740-cmt1" and "renesas,sh73a0-cmt1" may be used instead. Signed-off-by: Magnus Damm Reviewed-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_cmt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index abf5e7873a18..ef773db080e9 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -918,7 +918,11 @@ static const struct platform_device_id sh_cmt_id_table[] = { MODULE_DEVICE_TABLE(platform, sh_cmt_id_table); static const struct of_device_id sh_cmt_of_table[] __maybe_unused = { - { .compatible = "renesas,cmt-48", .data = &sh_cmt_info[SH_CMT_48BIT] }, + { + /* deprecated, preserved for backward compatibility */ + .compatible = "renesas,cmt-48", + .data = &sh_cmt_info[SH_CMT_48BIT] + }, { /* deprecated, preserved for backward compatibility */ .compatible = "renesas,cmt-48-gen2", -- cgit From 6ae40e3fdcd33a6ff3c490b9302d6a1861093f65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:48 +0200 Subject: posix-cpu-timers: Provide task validation functions The code contains three slightly different copies of validating whether a given clock resolves to a valid task and whether the current caller has permissions to access it. Create central functions.
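For context, a per-task/per-process CPU clock ID packs the target PID, a per-thread bit and the clock type into a single clockid_t; roughly, from include/linux/posix-timers.h:

  #define CPUCLOCK_PID(clock)       ((pid_t) ~((clock) >> 3))
  #define CPUCLOCK_PERTHREAD(clock) \
          (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
  #define CPUCLOCK_PERTHREAD_MASK   4
  #define CPUCLOCK_WHICH(clock)     ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
  #define CPUCLOCK_CLOCK_MASK       3
  #define CPUCLOCK_PROF             0
  #define CPUCLOCK_VIRT             1
  #define CPUCLOCK_SCHED            2
  #define CPUCLOCK_MAX              3

A PID of 0 thus means "the caller itself", and the validation has to check both the clock type and whether the caller is allowed to see the target task.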
Replace check_clock() as a first step and rename it to something sensible. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821192919.326097175@linutronix.de --- kernel/time/posix-cpu-timers.c | 65 ++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 387e0e86e1b8..b06ed8b14861 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -35,27 +35,52 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) spin_unlock_irq(&task->sighand->siglock); } -static int check_clock(const clockid_t which_clock) +/* + * Functions for validating access to tasks. + */ +static struct task_struct *lookup_task(const pid_t pid, bool thread) { - int error = 0; struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; + if (!pid) + return thread ? current : current->group_leader; + + p = find_task_by_vpid(pid); + if (!p) + return p; + if (thread) + return same_thread_group(p, current) ? p : NULL; + if (p == current) + return p; + return has_group_leader_pid(p) ? p : NULL; +} + +static struct task_struct *__get_task_for_clock(const clockid_t clock, + bool getref) +{ + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t pid = CPUCLOCK_PID(clock); + struct task_struct *p; - if (pid == 0) - return 0; + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : has_group_leader_pid(p))) { - error = -EINVAL; - } + p = lookup_task(pid, thread); + if (p && getref) + get_task_struct(p); rcu_read_unlock(); + return p; +} - return error; +static inline struct task_struct *get_task_for_clock(const clockid_t clock) +{ + return __get_task_for_clock(clock, true); +} + +static inline int validate_clock_permissions(const clockid_t clock) +{ + return __get_task_for_clock(clock, false) ? 0 : -EINVAL; } /* @@ -125,7 +150,8 @@ static inline u64 virt_ticks(struct task_struct *p) static int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { - int error = check_clock(which_clock); + int error = validate_clock_permissions(which_clock); + if (!error) { tp->tv_sec = 0; tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); @@ -142,20 +168,17 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) +posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) { + int error = validate_clock_permissions(clock); + /* * You can never reset a CPU clock, but we check for other errors * in the call before failing with EPERM. */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; + return error ? : -EPERM; } - /* * Sample a per-thread clock for the given task. */ -- cgit From bfcf3e92c6c07cd1084624bad5622f3dad96328c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:49 +0200 Subject: posix-cpu-timers: Use common permission check in posix_cpu_clock_get() Replace the next slightly different copy of permission checks. That also removes the necessity to check the return value of the sample functions because the clock id is already validated.
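From user space this path is typically reached via clock_getcpuclockid() plus clock_gettime(); a small self-contained example (illustrative only; link with -lrt on older glibc):

  #include <stdio.h>
  #include <time.h>
  #include <unistd.h>

  int main(void)
  {
          clockid_t cid;
          struct timespec ts;

          /* Encodes getpid() and a clock selector into a clockid_t,
           * which posix_cpu_clock_get() decodes and validates. */
          if (clock_getcpuclockid(getpid(), &cid) != 0)
                  return 1;
          if (clock_gettime(cid, &ts) != 0)
                  return 1;
          printf("process CPU time: %lld.%09ld\n",
                 (long long)ts.tv_sec, ts.tv_nsec);
          return 0;
  }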
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.414813172@linutronix.de --- kernel/time/posix-cpu-timers.c | 57 +++++++++++------------------------------- 1 file changed, 14 insertions(+), 43 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b06ed8b14861..eb11117bf227 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -289,53 +289,24 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return 0; } -static int posix_cpu_clock_get_task(struct task_struct *tsk, - const clockid_t which_clock, - struct timespec64 *tp) +static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) { - int err = -EINVAL; - u64 rtn; - - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(tsk, current)) - err = cpu_clock_sample(which_clock, tsk, &rtn); - } else { - if (tsk == current || thread_group_leader(tsk)) - err = cpu_clock_sample_group(which_clock, tsk, &rtn); - } - - if (!err) - *tp = ns_to_timespec64(rtn); + const clockid_t clkid = CPUCLOCK_WHICH(clock); + struct task_struct *tsk; + u64 t; - return err; -} - - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) -{ - const pid_t pid = CPUCLOCK_PID(which_clock); - int err = -EINVAL; + tsk = get_task_for_clock(clock); + if (!tsk) + return -EINVAL; - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - err = posix_cpu_clock_get_task(current, which_clock, tp); - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. - */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - err = posix_cpu_clock_get_task(p, which_clock, tp); - rcu_read_unlock(); - } + if (CPUCLOCK_PERTHREAD(clock)) + cpu_clock_sample(clkid, tsk, &t); + else + cpu_clock_sample_group(clkid, tsk, &t); + put_task_struct(tsk); - return err; + *tp = ns_to_timespec64(t); + return 0; } /* -- cgit From e5a8b65b4cb2fe024b83bdec0424269949cc0a27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:50 +0200 Subject: posix-cpu-timers: Use common permission check in posix_cpu_timer_create() Yet another copy of the same thing gone... 
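The same consolidated lookup now also backs timer creation; from user space the path is exercised like this (illustrative only):

  #include <signal.h>
  #include <time.h>
  #include <unistd.h>

  int main(void)
  {
          struct sigevent sev = {
                  .sigev_notify = SIGEV_SIGNAL,
                  .sigev_signo  = SIGALRM,
          };
          clockid_t cid;
          timer_t tid;

          if (clock_getcpuclockid(getpid(), &cid) != 0)
                  return 1;
          /* Ends up in posix_cpu_timer_create(), which uses
           * get_task_for_clock() both for validation and to take
           * the task reference. */
          if (timer_create(cid, &sev, &tid) != 0)
                  return 1;
          timer_delete(tid);
          return 0;
  }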
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.505833418@linutronix.de --- kernel/time/posix-cpu-timers.c | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index eb11117bf227..4426a0f9c470 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -316,44 +316,15 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; + struct task_struct *p = get_task_for_clock(new_timer->it_clock); - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + if (!p) return -EINVAL; new_timer->kclock = &clock_posix_cpu; - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - - rcu_read_lock(); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !has_group_leader_pid(p)) - p = NULL; - } - } + INIT_LIST_HEAD(&new_timer->it.cpu.entry); new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - rcu_read_unlock(); - - return ret; + return 0; } /* -- cgit From 19298fbf453c90a6cf72288155f80c6f55e9139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:51 +0200 Subject: posix-cpu-timers: Provide quick sample function for itimer get_itimer() needs a sample of the current thread group cputime. It invokes thread_group_cputimer() - which is a misnomer. That function also eventually starts the group cputime accounting, which is bogus because the accounting is already active when a timer is armed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.599658199@linutronix.de --- include/linux/sched/cputime.h | 2 +- kernel/time/posix-cpu-timers.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 53f883f5a2fd..6de7b384d2be 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -62,7 +62,7 @@ extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); - +void thread_group_sample_cputime(struct task_struct *tsk, struct task_cputime *times); /* * The following are functions that support scheduler-internal time accounting. diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4426a0f9c470..c22b6b604a95 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -232,6 +232,27 @@ static inline void sample_cputime_atomic(struct task_cputime *times, times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } +/** + * thread_group_sample_cputime - Sample cputime for a given task + * @tsk: Task for which cputime needs to be started + * @times: Storage for time samples + * + * Called from sys_getitimer() to calculate the expiry time of an active + * timer. That means group cputime accounting is already active. Called + * with task sighand lock held.
+ * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +void thread_group_sample_cputime(struct task_struct *tsk, + struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + WARN_ON_ONCE(!cputimer->running); + + sample_cputime_atomic(times, &cputimer->cputime_atomic); +} + void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -- cgit From a34360d42434bbf28c0f375444c52c154ae3e6cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:52 +0200 Subject: itimers: Use quick sample function get_itimer() locks the sighand lock and checks whether the timer is already expired. If it is not expired, then the thread group cputime accounting is already enabled. Use the sampling function, not the one which is meant for starting a timer. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.689713638@linutronix.de --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9d26fd4ba4c0..ae04bc259240 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -58,7 +58,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, struct task_cputime cputime; u64 t; - thread_group_cputimer(tsk, &cputime); + thread_group_sample_cputime(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) t = cputime.utime + cputime.stime; else -- cgit From a324956fae05d863386c682830e917f6685f1d4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:53 +0200 Subject: posix-cpu-timers: Sample directly in timer check The thread group accounting is active, otherwise the expiry function would not be running. Sample the thread group time directly. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.780348088@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index c22b6b604a95..cb736787145b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -914,16 +914,17 @@ static void check_process_timers(struct task_struct *tsk, if (!READ_ONCE(tsk->signal->cputimer.running)) return; - /* + /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ sig->cputimer.checking_timer = true; /* - * Collect the current process totals. + * Collect the current process totals. Group accounting is active + * so the sample can be taken directly. */ - thread_group_cputimer(tsk, &cputime); + sample_cputime_atomic(&cputime, &sig->cputimer.cputime_atomic); utime = cputime.utime; ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; -- cgit From c506bef424ca282f2ad357e86fee940c69018974 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:54 +0200 Subject: posix-cpu-timers: Rename thread_group_cputimer() and make it static thread_group_cputimer() is a complete misnomer. The function does two things: - For arming process-wide timers it makes sure that the atomic time storage is up to date. If no cpu timer is armed yet, then the atomic time storage is not updated by the scheduler for performance reasons.
In that case a full summing up of all threads needs to be done and the update needs to be enabled. - Samples the current time into the caller supplied storage. Rename it to thread_group_start_cputime(), make it static and fix up the callsite. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.869350319@linutronix.de --- include/linux/sched/cputime.h | 1 - kernel/time/posix-cpu-timers.c | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6de7b384d2be..2638fd0ab3f2 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -61,7 +61,6 @@ extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); void thread_group_sample_cputime(struct task_struct *tsk, struct task_cputime *times); /* diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb736787145b..def225ae069a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -253,7 +253,20 @@ void thread_group_sample_cputime(struct task_struct *tsk, sample_cputime_atomic(times, &cputimer->cputime_atomic); } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +/** + * thread_group_start_cputime - Start cputime and return a sample + * @tsk: Task for which cputime needs to be started + * @times: Storage for time samples + * + * The thread group cputime accounting is avoided when there are no posix + * CPU timers armed. Before starting a timer it's required to check whether + * the time accounting is active. If not, a full update of the atomic + * accounting store needs to be done and the accounting enabled. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +static void +thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct task_cputime sum; @@ -536,7 +549,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, { struct task_cputime cputime; - thread_group_cputimer(p, &cputime); + thread_group_start_cputime(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; -- cgit From 24ab7f5a7b2c917e89fc6a87252f18faff91d6ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:55 +0200 Subject: posix-cpu-timers: Consolidate thread group sample code cpu_clock_sample_group() and cpu_timer_sample_group() are almost the same. Before the rename one called thread_group_cputimer() and the other thread_group_cputime(). Really intuitive function names. Consolidate the functions and also avoid the thread traversal when the thread group's accounting is already active.
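The consolidated helper's decision logic can be summarised as follows (a sketch; see the diff below for the real code):

  /* One sample helper for both the clock and the timer code paths */
  if (!READ_ONCE(cputimer->running)) {
          /* Accounting is off: do the full thread traversal, and
           * optionally switch the accounting on when called for
           * arming a timer (start == true). */
          if (start)
                  thread_group_start_cputime(p, &cputime);
          else
                  thread_group_cputime(p, &cputime);
  } else {
          /* Accounting already on: cheap read of the atomic store */
          sample_cputime_atomic(&cputime, &cputimer->cputime_atomic);
  }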
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.960966884@linutronix.de --- kernel/time/posix-cpu-timers.c | 59 ++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index def225ae069a..a9003b2d614d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -294,29 +294,37 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) } /* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. + * Sample a process (thread group) clock for the given task clkid. If the + * group's cputime accounting is already enabled, read the atomic + * store. Otherwise a full update is required. Task's sighand lock must be + * held to protect the task traversal on a full update. */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - u64 *sample) + u64 *sample, bool start) { + struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; + if (!READ_ONCE(cputimer->running)) { + if (start) + thread_group_start_cputime(p, &cputime); + else + thread_group_cputime(p, &cputime); + } else { + sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); + } + switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); *sample = cputime.utime; break; case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); *sample = cputime.sum_exec_runtime; break; } @@ -336,7 +344,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) if (CPUCLOCK_PERTHREAD(clock)) cpu_clock_sample(clkid, tsk, &t); else - cpu_clock_sample_group(clkid, tsk, &t); + cpu_clock_sample_group(clkid, tsk, &t, false); put_task_struct(tsk); *tp = ns_to_timespec64(t); @@ -539,33 +547,6 @@ static void cpu_timer_fire(struct k_itimer *timer) } } -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, u64 *sample) -{ - struct task_cputime cputime; - - thread_group_start_cputime(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - *sample = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - *sample = cputime.utime; - break; - case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime; - break; - } - return 0; -} - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. 
@@ -627,7 +608,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &val); } else { - cpu_timer_sample_group(timer->it_clock, p, &val); + cpu_clock_sample_group(timer->it_clock, p, &val, true); } if (old) { @@ -755,7 +736,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_timer_sample_group(timer->it_clock, p, &now); + cpu_clock_sample_group(timer->it_clock, p, &now, false); unlock_task_sighand(p, &flags); } } @@ -1042,7 +1023,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_timer_sample_group(timer->it_clock, p, &now); + cpu_clock_sample_group(timer->it_clock, p, &now, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } @@ -1211,7 +1192,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - ret = cpu_timer_sample_group(clock_idx, tsk, &now); + ret = cpu_clock_sample_group(clock_idx, tsk, &now, true); if (oldval && ret != -EINVAL) { /* -- cgit From c7a37c6f4c651a531101c5721814333bae2804ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:56 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_set() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. That allows simplifying them once all callers are fixed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.050770464@linutronix.de --- kernel/time/posix-cpu-timers.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a9003b2d614d..12561f8ef378 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -556,10 +556,11 @@ static void cpu_timer_fire(struct k_itimer *timer) static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { - unsigned long flags; - struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; + struct task_struct *p = timer->it.cpu.task; + struct sighand_struct *sighand; + unsigned long flags; int ret; if (WARN_ON_ONCE(!p)) @@ -606,9 +607,9 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * check if it's already passed. In short, we need a sample. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); + cpu_clock_sample(clkid, p, &val); } else { - cpu_clock_sample_group(timer->it_clock, p, &val, true); + cpu_clock_sample_group(clkid, p, &val, true); } if (old) { -- cgit From 99093c5b81f58815fbfc1fe301c87206c6f5f730 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:57 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_get() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. That allows simplifying them once all callers are fixed.
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.155487201@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 12561f8ef378..8e4ce8ade88c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -699,6 +699,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct task_struct *p = timer->it.cpu.task; u64 now; @@ -717,7 +718,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + cpu_clock_sample(clkid, p, &now); } else { struct sighand_struct *sighand; unsigned long flags; @@ -737,7 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_clock_sample_group(timer->it_clock, p, &now, false); + cpu_clock_sample_group(clkid, p, &now, false); unlock_task_sighand(p, &flags); } } -- cgit From da020ce406b2a9b714b82de9123a4c5a4848647b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:58 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_rearm() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. That allows simplifying them once all callers are fixed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.245357769@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8e4ce8ade88c..aff7e3b64d84 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -987,6 +987,7 @@ static void check_process_timers(struct task_struct *tsk, */ static void posix_cpu_timer_rearm(struct k_itimer *timer) { + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct task_struct *p = timer->it.cpu.task; struct sighand_struct *sighand; unsigned long flags; @@ -999,7 +1000,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) * Fetch the current sample and update the timer's expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + cpu_clock_sample(clkid, p, &now); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) return; @@ -1025,7 +1026,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_clock_sample_group(timer->it_clock, p, &now, true); + cpu_clock_sample_group(clkid, p, &now, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } -- cgit From 5405d0051f7ca820d1781d87baf4d730ff58f208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:59 +0200 Subject: posix-cpu-timers: Remove pointless return value check set_process_cpu_timer() already checks whether the clock id is valid. No point in checking the return value of the sample function. That allows simplifying the sample function later.
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.339725769@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index aff7e3b64d84..5944b7494be7 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1190,14 +1190,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 *newval, u64 *oldval) { u64 now; - int ret; if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - ret = cpu_clock_sample_group(clock_idx, tsk, &now, true); + cpu_clock_sample_group(clock_idx, tsk, &now, true); - if (oldval && ret != -EINVAL) { + if (oldval) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update -- cgit From 2092c1d4fed9f279d10600b4c1b5167dd8486a2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:00 +0200 Subject: posix-cpu-timers: Simplify sample functions All callers hand in a validated clock id. Remove the return value, which was unchecked in most places anyway. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.430475832@linutronix.de --- kernel/time/posix-cpu-timers.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5944b7494be7..cc3d148344d3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -180,14 +180,12 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) } /* - * Sample a per-thread clock for the given task. + * Sample a per-thread clock for the given task. clkid is validated. */ -static int cpu_clock_sample(const clockid_t which_clock, - struct task_struct *p, u64 *sample) +static void cpu_clock_sample(const clockid_t clkid, struct task_struct *p, + u64 *sample) { - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; + switch (clkid) { case CPUCLOCK_PROF: *sample = prof_ticks(p); break; @@ -197,8 +195,9 @@ static int cpu_clock_sample(const clockid_t which_clock, case CPUCLOCK_SCHED: *sample = task_sched_runtime(p); break; + default: + WARN_ON_ONCE(1); } - return 0; } /* @@ -297,11 +296,11 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) * Sample a process (thread group) clock for the given task clkid. If the * group's cputime accounting is already enabled, read the atomic * store. Otherwise a full update is required. Task's sighand lock must be * held to protect the task traversal on a full update. clkid is already * validated.
*/ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - u64 *sample, bool start) +static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + u64 *sample, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; @@ -315,9 +314,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); } - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; + switch (clkid) { case CPUCLOCK_PROF: *sample = cputime.utime + cputime.stime; break; @@ -327,8 +324,9 @@ static int cpu_clock_sample_group(const clockid_t which_clock, case CPUCLOCK_SCHED: *sample = cputime.sum_exec_runtime; break; + default: + WARN_ON_ONCE(1); } - return 0; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) -- cgit From 8c2d74f03705c2c993a57673bae8fd535eabede6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:01 +0200 Subject: posix-cpu-timers: Get rid of pointer indirection Now that the sample functions have no return value anymore, the result can simply be returned instead of using pointer indirection. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.535079278@linutronix.de --- kernel/time/posix-cpu-timers.c | 50 +++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cc3d148344d3..a2007ce9322a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -182,22 +182,19 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) /* * Sample a per-thread clock for the given task. clkid is validated. */ -static void cpu_clock_sample(const clockid_t clkid, struct task_struct *p, - u64 *sample) +static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) { switch (clkid) { case CPUCLOCK_PROF: - *sample = prof_ticks(p); - break; + return prof_ticks(p); case CPUCLOCK_VIRT: - *sample = virt_ticks(p); - break; + return virt_ticks(p); case CPUCLOCK_SCHED: - *sample = task_sched_runtime(p); - break; + return task_sched_runtime(p); default: WARN_ON_ONCE(1); } + return 0; } /* @@ -299,8 +296,8 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) * held to protect the task traversal on a full update. clkid is already * validated. 
*/ -static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, - u64 *sample, bool start) +static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; @@ -316,17 +313,15 @@ static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, switch (clkid) { case CPUCLOCK_PROF: - *sample = cputime.utime + cputime.stime; - break; + return cputime.utime + cputime.stime; case CPUCLOCK_VIRT: - *sample = cputime.utime; - break; + return cputime.utime; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime; - break; + return cputime.sum_exec_runtime; default: WARN_ON_ONCE(1); } + return 0; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) @@ -340,9 +335,9 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) return -EINVAL; if (CPUCLOCK_PERTHREAD(clock)) - cpu_clock_sample(clkid, tsk, &t); + t = cpu_clock_sample(clkid, tsk); else - cpu_clock_sample_group(clkid, tsk, &t, false); + t = cpu_clock_sample_group(clkid, tsk, false); put_task_struct(tsk); *tp = ns_to_timespec64(t); @@ -604,11 +599,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * times (in arm_timer). With an absolute time, we must * check if it's already passed. In short, we need a sample. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &val); - } else { - cpu_clock_sample_group(clkid, p, &val, true); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + val = cpu_clock_sample(clkid, p); + else + val = cpu_clock_sample_group(clkid, p, true); if (old) { if (old_expires == 0) { @@ -716,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &now); + now = cpu_clock_sample(clkid, p); } else { struct sighand_struct *sighand; unsigned long flags; @@ -736,7 +730,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_clock_sample_group(clkid, p, &now, false); + now = cpu_clock_sample_group(clkid, p, false); unlock_task_sighand(p, &flags); } } @@ -998,7 +992,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) * Fetch the current sample and update the timer's expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &now); + now = cpu_clock_sample(clkid, p); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) return; @@ -1024,7 +1018,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_clock_sample_group(clkid, p, &now, true); + now = cpu_clock_sample_group(clkid, p, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } @@ -1192,7 +1186,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - cpu_clock_sample_group(clock_idx, tsk, &now, true); + now = cpu_clock_sample_group(clock_idx, tsk, true); if (oldval) { /* -- cgit From 0476ff2c151ee35bda2f938e8828b6f6113ebf55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:02 +0200 Subject: posix-cpu-timers: Sample task times once in expiry check Sampling the task times twice does not make sense. Do it once. 
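A sketch of the resulting pattern in check_thread_timers(), with the surrounding logic omitted (full context in the hunk below): a single task_cputime() call now feeds both the profiling clock (utime + stime) and the virtual clock (utime).

	u64 expires, stime, utime;

	/* Sample the task times once ... */
	task_cputime(tsk, &utime, &stime);

	/* ... and derive both clock values from that one sample */
	expires = check_timers_list(timers, firing, utime + stime);
	tsk_expires->prof_exp = expires;

	expires = check_timers_list(++timers, firing, utime);
	tsk_expires->virt_exp = expires;
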
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.639878168@linutronix.de --- kernel/time/posix-cpu-timers.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a2007ce9322a..98dab3e4be08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -785,9 +785,9 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct list_head *timers = tsk->cpu_timers; struct task_cputime *tsk_expires = &tsk->cputime_expires; - u64 expires; + struct list_head *timers = tsk->cpu_timers; + u64 expires, stime, utime; unsigned long soft; if (dl_task(tsk)) @@ -800,10 +800,12 @@ static void check_thread_timers(struct task_struct *tsk, if (task_cputime_zero(&tsk->cputime_expires)) return; - expires = check_timers_list(timers, firing, prof_ticks(tsk)); + task_cputime(tsk, &utime, &stime); + + expires = check_timers_list(timers, firing, utime + stime); tsk_expires->prof_exp = expires; - expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + expires = check_timers_list(++timers, firing, utime); tsk_expires->virt_exp = expires; tsk_expires->sched_exp = check_timers_list(++timers, firing, -- cgit From ab693c5a5e3107f035d5162e6ada9aaf7dd76a1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:03 +0200 Subject: posix-cpu-timers: Move prof/virt_ticks into caller The functions have only one caller left. No point in having them. Move the almost duplicated code into the caller and simplify it. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.729298382@linutronix.de --- kernel/time/posix-cpu-timers.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 98dab3e4be08..b1c97664a730 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -130,23 +130,6 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline u64 prof_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime + stime; -} -static inline u64 virt_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime; -} - static int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { @@ -184,13 +167,18 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) */ static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) { + u64 utime, stime; + + if (clkid == CPUCLOCK_SCHED) + return task_sched_runtime(p); + + task_cputime(p, &utime, &stime); + switch (clkid) { case CPUCLOCK_PROF: - return prof_ticks(p); + return utime + stime; case CPUCLOCK_VIRT: - return virt_ticks(p); - case CPUCLOCK_SCHED: - return task_sched_runtime(p); + return utime; default: WARN_ON_ONCE(1); } -- cgit From 2b69942f9021bf75bd1b001f53bd2578361fadf3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:04 +0200 Subject: posix-cpu-timers: Create a container struct Per task/process data of posix CPU timers is all over the place which makes the code hard to follow and requires ifdeffery. 
Create a container to hold all this information in one place, so data is consolidated and the ifdeffery can be confined to the posix timer header file and removed from places like fork. As a first step, move the cpu_timers list head array into the new struct and clean up the initializers and simplify fork. The remaining #ifdef in fork will be removed later. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.819418976@linutronix.de --- include/linux/init_task.h | 11 ----------- include/linux/posix-timers.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched.h | 4 +++- include/linux/sched/signal.h | 5 +++-- kernel/fork.c | 11 ++++------- kernel/time/posix-cpu-timers.c | 20 ++++++++++---------- 6 files changed, 54 insertions(+), 31 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6049baa5b8bc..2c620d7ac432 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -36,17 +36,6 @@ extern struct cred init_cred; #define INIT_PREV_CPUTIME(x) #endif -#ifdef CONFIG_POSIX_TIMERS -#define INIT_CPU_TIMERS(s) \ - .cpu_timers = { \ - LIST_HEAD_INIT(s.cpu_timers[0]), \ - LIST_HEAD_INIT(s.cpu_timers[1]), \ - LIST_HEAD_INIT(s.cpu_timers[2]), \ - }, -#else -#define INIT_CPU_TIMERS(s) -#endif - #define INIT_TASK_COMM "swapper" /* Attach to the init_task data structure for proper alignment */ diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 033374b99767..cdef89750b2c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -62,6 +62,40 @@ static inline int clockid_to_fd(const clockid_t clk) return ~(clk >> 3); } +#ifdef CONFIG_POSIX_TIMERS +/** + * posix_cputimers - Container for posix CPU timer related data + * @cpu_timers: List heads to queue posix CPU timers + * + * Used in task_struct and signal_struct + */ +struct posix_cputimers { + struct list_head cpu_timers[CPUCLOCK_MAX]; +}; + +static inline void posix_cputimers_init(struct posix_cputimers *pct) +{ + INIT_LIST_HEAD(&pct->cpu_timers[0]); + INIT_LIST_HEAD(&pct->cpu_timers[1]); + INIT_LIST_HEAD(&pct->cpu_timers[2]); +} + +/* Init task static initializer */ +#define INIT_CPU_TIMERLISTS(c) { \ + LIST_HEAD_INIT(c.cpu_timers[0]), \ + LIST_HEAD_INIT(c.cpu_timers[1]), \ + LIST_HEAD_INIT(c.cpu_timers[2]), \ +} + +#define INIT_CPU_TIMERS(s) \ + .posix_cputimers = { \ + .cpu_timers = INIT_CPU_TIMERLISTS(s.posix_cputimers), \ + }, +#else +struct posix_cputimers { }; +#define INIT_CPU_TIMERS(s) +#endif + #define REQUEUE_PENDING 1 /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 8dc1811487f5..fde844a3b86e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ @@ -878,8 +879,9 @@ struct task_struct { #ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; - struct list_head cpu_timers[3]; #endif + /* Empty if CONFIG_POSIX_CPUTIMERS=n */ + struct posix_cputimers posix_cputimers; /* Process credentials: */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index efd8ce7675ed..88fbb3f1c375 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -9,6 +9,7 @@ #include #include #include +#include /* * Types defining task->signal and task->sighand and APIs using them: @@ -151,9 +152,9 @@ struct signal_struct { /* Earliest-expiration cache. 
*/ struct task_cputime cputime_expires; - struct list_head cpu_timers[3]; - #endif + /* Empty if CONFIG_POSIX_TIMERS=n */ + struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..b6a135e4275b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1523,6 +1523,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); @@ -1531,10 +1532,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputimer.running = true; } - /* The timer lists. */ - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); + posix_cputimers_init(pct); } #else static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } @@ -1649,9 +1647,8 @@ static void posix_cpu_timers_init(struct task_struct *tsk) tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; - INIT_LIST_HEAD(&tsk->cpu_timers[0]); - INIT_LIST_HEAD(&tsk->cpu_timers[1]); - INIT_LIST_HEAD(&tsk->cpu_timers[2]); + + posix_cputimers_init(&tsk->posix_cputimers); } #else static inline void posix_cpu_timers_init(struct task_struct *tsk) { } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b1c97664a730..849e2045fb6e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,11 +407,11 @@ static void cleanup_timers_list(struct list_head *head) * * This must be called with the siglock held. */ -static void cleanup_timers(struct list_head *head) +static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(head); - cleanup_timers_list(++head); - cleanup_timers_list(++head); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_PROF]); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_VIRT]); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_SCHED]); } /* @@ -421,11 +421,11 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cleanup_timers(tsk->cpu_timers); + cleanup_timers(&tsk->posix_cputimers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers); + cleanup_timers(&tsk->signal->posix_cputimers); } static inline int expires_gt(u64 expires, u64 new_exp) @@ -446,10 +446,10 @@ static void arm_timer(struct k_itimer *timer) struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->cpu_timers; + head = p->posix_cputimers.cpu_timers; cputime_expires = &p->cputime_expires; } else { - head = p->signal->cpu_timers; + head = p->signal->posix_cputimers.cpu_timers; cputime_expires = &p->signal->cputime_expires; } head += CPUCLOCK_WHICH(timer->it_clock); @@ -773,8 +773,8 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { + struct list_head *timers = tsk->posix_cputimers.cpu_timers; struct task_cputime *tsk_expires = &tsk->cputime_expires; - struct list_head *timers = tsk->cpu_timers; u64 expires, stime, utime; unsigned long soft; @@ -879,9 +879,9 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; + struct list_head *timers = 
sig->posix_cputimers.cpu_timers; u64 utime, ptime, virt_expires, prof_expires; u64 sum_sched_runtime, sched_expires; - struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; unsigned long soft; -- cgit From 9eacb5c7e6607aba00a7322b21cad83fc8b101c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:05 +0200 Subject: sched: Move struct task_cputime to types.h For upcoming posix-timer changes to avoid include recursion hell. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821192920.909530418@linutronix.de --- include/linux/sched.h | 17 +---------------- include/linux/sched/types.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 16 deletions(-) create mode 100644 include/linux/sched/types.h diff --git a/include/linux/sched.h b/include/linux/sched.h index fde844a3b86e..37c39df9b186 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -245,22 +246,6 @@ struct prev_cputime { #endif }; -/** - * struct task_cputime - collected CPU time counts - * @utime: time spent in user mode, in nanoseconds - * @stime: time spent in kernel mode, in nanoseconds - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * - * This structure groups together three kinds of CPU time that are tracked for - * threads and thread groups. Most things considering CPU time want to group - * these counts together and treat all three of them in parallel. - */ -struct task_cputime { - u64 utime; - u64 stime; - unsigned long long sum_exec_runtime; -}; - /* Alternate field names when used on cache expirations: */ #define virt_exp utime #define prof_exp stime diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h new file mode 100644 index 000000000000..2c5c28ddd9b2 --- /dev/null +++ b/include/linux/sched/types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TYPES_H +#define _LINUX_SCHED_TYPES_H + +#include + +/** + * struct task_cputime - collected CPU time counts + * @utime: time spent in user mode, in nanoseconds + * @stime: time spent in kernel mode, in nanoseconds + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * + * This structure groups together three kinds of CPU time that are tracked for + * threads and thread groups. Most things considering CPU time want to group + * these counts together and treat all three of them in parallel. + */ +struct task_cputime { + u64 utime; + u64 stime; + unsigned long long sum_exec_runtime; +}; + +#endif -- cgit From 3a245c0f110e2bfcf7f2cd2248a29005c78999e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:06 +0200 Subject: posix-cpu-timers: Move expiry cache into struct posix_cputimers The expiry cache belongs into the posix_cputimers container where the other cpu timers information is. 
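For reference, a sketch of the container after this step, taken from the posix-timers.h hunk below:

	struct posix_cputimers {
		struct task_cputime	cputime_expires;	  /* Earliest-expiration cache */
		struct list_head	cpu_timers[CPUCLOCK_MAX]; /* Timer queues per clock */
	};
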
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.014444012@linutronix.de --- include/linux/posix-timers.h | 22 +++++++++++++++++++++ include/linux/sched.h | 8 -------- include/linux/sched/signal.h | 3 --- kernel/fork.c | 25 +++-------------------- kernel/sched/rt.c | 6 ++++-- kernel/time/posix-cpu-timers.c | 45 +++++++++++++++++++++++++----------------- 6 files changed, 56 insertions(+), 53 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index cdef89750b2c..a3731ba15bce 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -62,24 +62,43 @@ static inline int clockid_to_fd(const clockid_t clk) return ~(clk >> 3); } +/* + * Alternate field names for struct task_cputime when used on cache + * expirations. Will go away soon. + */ +#define virt_exp utime +#define prof_exp stime +#define sched_exp sum_exec_runtime + #ifdef CONFIG_POSIX_TIMERS /** * posix_cputimers - Container for posix CPU timer related data + * @cputime_expires: Earliest-expiration cache * @cpu_timers: List heads to queue posix CPU timers * * Used in task_struct and signal_struct */ struct posix_cputimers { + struct task_cputime cputime_expires; struct list_head cpu_timers[CPUCLOCK_MAX]; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { + memset(&pct->cputime_expires, 0, sizeof(pct->cputime_expires)); INIT_LIST_HEAD(&pct->cpu_timers[0]); INIT_LIST_HEAD(&pct->cpu_timers[1]); INIT_LIST_HEAD(&pct->cpu_timers[2]); } +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); + +static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, + u64 runtime) +{ + pct->cputime_expires.sched_exp = runtime; +} + /* Init task static initializer */ #define INIT_CPU_TIMERLISTS(c) { \ LIST_HEAD_INIT(c.cpu_timers[0]), \ @@ -94,6 +113,9 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) #else struct posix_cputimers { }; #define INIT_CPU_TIMERS(s) +static inline void posix_cputimers_init(struct posix_cputimers *pct) { } +static inline void posix_cputimers_group_init(struct posix_cputimers *pct, + u64 cpu_limit) { } #endif #define REQUEUE_PENDING 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 37c39df9b186..8cc8e323093f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,11 +246,6 @@ struct prev_cputime { #endif }; -/* Alternate field names when used on cache expirations: */ -#define virt_exp utime -#define prof_exp stime -#define sched_exp sum_exec_runtime - enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, @@ -862,9 +857,6 @@ struct task_struct { unsigned long min_flt; unsigned long maj_flt; -#ifdef CONFIG_POSIX_TIMERS - struct task_cputime cputime_expires; -#endif /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 88fbb3f1c375..729bd892ee45 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -149,9 +149,6 @@ struct signal_struct { */ struct thread_group_cputimer cputimer; - /* Earliest-expiration cache. 
*/ - struct task_cputime cputime_expires; - #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; diff --git a/kernel/fork.c b/kernel/fork.c index b6a135e4275b..52bfe7c20ff6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,12 +1527,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); - if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; + posix_cputimers_group_init(pct, cpu_limit); + if (cpu_limit != RLIM_INFINITY) sig->cputimer.running = true; - } - - posix_cputimers_init(pct); } #else static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } @@ -1638,22 +1635,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_POSIX_TIMERS -/* - * Initialize POSIX timer handling for a single task. - */ -static void posix_cpu_timers_init(struct task_struct *tsk) -{ - tsk->cputime_expires.prof_exp = 0; - tsk->cputime_expires.virt_exp = 0; - tsk->cputime_expires.sched_exp = 0; - - posix_cputimers_init(&tsk->posix_cputimers); -} -#else -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } -#endif - static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; @@ -1932,7 +1913,7 @@ static __latent_entropy struct task_struct *copy_process( task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - posix_cpu_timers_init(p); + posix_cputimers_init(&p->posix_cputimers); p->io_context = NULL; audit_set_context(p, NULL); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index da3e85e61013..d6678f773c96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2305,8 +2305,10 @@ static void watchdog(struct rq *rq, struct task_struct *p) } next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + if (p->rt.timeout > next) { + posix_cputimers_rt_watchdog(&p->posix_cputimers, + p->se.sum_exec_runtime); + } } } #else diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 849e2045fb6e..3e29d1692437 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -20,11 +20,18 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer); +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) +{ + posix_cputimers_init(pct); + if (cpu_limit != RLIM_INFINITY) + pct->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; +} + /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. + * tsk->signal->posix_cputimers.cputime_expires expiration cache if + * necessary. Needs siglock protection since other code may update + * expiration cache as well. 
*/ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { @@ -447,10 +454,10 @@ static void arm_timer(struct k_itimer *timer) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { head = p->posix_cputimers.cpu_timers; - cputime_expires = &p->cputime_expires; + cputime_expires = &p->posix_cputimers.cputime_expires; } else { head = p->signal->posix_cputimers.cpu_timers; - cputime_expires = &p->signal->cputime_expires; + cputime_expires = &p->signal->posix_cputimers.cputime_expires; } head += CPUCLOCK_WHICH(timer->it_clock); @@ -774,7 +781,7 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->posix_cputimers.cpu_timers; - struct task_cputime *tsk_expires = &tsk->cputime_expires; + struct task_cputime *tsk_expires = &tsk->posix_cputimers.cputime_expires; u64 expires, stime, utime; unsigned long soft; @@ -785,7 +792,7 @@ static void check_thread_timers(struct task_struct *tsk, * If cputime_expires is zero, then there are no active * per thread CPU timers. */ - if (task_cputime_zero(&tsk->cputime_expires)) + if (task_cputime_zero(tsk_expires)) return; task_cputime(tsk, &utime, &stime); @@ -954,10 +961,10 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; - sig->cputime_expires.sched_exp = sched_expires; - if (task_cputime_zero(&sig->cputime_expires)) + sig->posix_cputimers.cputime_expires.prof_exp = prof_expires; + sig->posix_cputimers.cputime_expires.virt_exp = virt_expires; + sig->posix_cputimers.cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1058,12 +1065,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - if (!task_cputime_zero(&tsk->cputime_expires)) { + if (!task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) { struct task_cputime task_sample; task_cputime(tsk, &task_sample.utime, &task_sample.stime); task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + if (task_cputime_expired(&task_sample, + &tsk->posix_cputimers.cputime_expires)) return 1; } @@ -1088,7 +1096,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); - if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + if (task_cputime_expired(&group_sample, + &sig->posix_cputimers.cputime_expires)) return 1; } @@ -1204,12 +1213,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ switch (clock_idx) { case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; + if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.prof_exp, *newval)) + tsk->signal->posix_cputimers.cputime_expires.prof_exp = *newval; break; case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; + if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.virt_exp, *newval)) + tsk->signal->posix_cputimers.cputime_expires.virt_exp = *newval; break; } -- cgit From 11b8462f7e1d25f639c88949a2746a9c2667a766 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:07 +0200 Subject: posix-cpu-timers: Provide array based 
access to expiry cache Using struct task_cputime for the expiry cache is a pretty odd choice and comes with magic defines to rename the fields for usage in the expiry cache. struct task_cputime is basically a u64 array with 3 members, but it has distinct members. The expiry cache content is different than the content of task_cputime because expiry[PROF] = task_cputime.stime + task_cputime.utime expiry[VIRT] = task_cputime.utime expiry[SCHED] = task_cputime.sum_exec_runtime So there is no direct mapping between task_cputime and the expiry cache and the #define based remapping is just a horrible hack. Having the expiry cache array based allows further simplification of the expiry code. To avoid an all in one cleanup which is hard to review add a temporary anonymous union into struct task_cputime which allows array based access to it. That requires to reorder the members. Add a build time sanity check to validate that the members are at the same place. The union and the build time checks will be removed after conversion. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.105793824@linutronix.de --- include/linux/posix-timers.h | 24 ++++++++++++++++++------ include/linux/sched/types.h | 4 ++-- kernel/time/posix-cpu-timers.c | 12 +++++++++++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a3731ba15bce..ed0f6ec30aa6 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -65,27 +65,39 @@ static inline int clockid_to_fd(const clockid_t clk) /* * Alternate field names for struct task_cputime when used on cache * expirations. Will go away soon. + * + * stime corresponds to CLOCKCPU_PROF + * utime corresponds to CLOCKCPU_VIRT + * sum_exex_runtime corresponds to CLOCKCPU_SCHED + * + * The ordering is currently enforced so struct task_cputime and the + * expiries array in struct posix_cputimers are equivalent. 
*/ -#define virt_exp utime #define prof_exp stime +#define virt_exp utime #define sched_exp sum_exec_runtime #ifdef CONFIG_POSIX_TIMERS /** * posix_cputimers - Container for posix CPU timer related data - * @cputime_expires: Earliest-expiration cache + * @cputime_expires: Earliest-expiration cache task_cputime based + * @expiries: Earliest-expiration cache array based * @cpu_timers: List heads to queue posix CPU timers * * Used in task_struct and signal_struct */ struct posix_cputimers { - struct task_cputime cputime_expires; - struct list_head cpu_timers[CPUCLOCK_MAX]; + /* Temporary union until all users are cleaned up */ + union { + struct task_cputime cputime_expires; + u64 expiries[CPUCLOCK_MAX]; + }; + struct list_head cpu_timers[CPUCLOCK_MAX]; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { - memset(&pct->cputime_expires, 0, sizeof(pct->cputime_expires)); + memset(&pct->expiries, 0, sizeof(pct->expiries)); INIT_LIST_HEAD(&pct->cpu_timers[0]); INIT_LIST_HEAD(&pct->cpu_timers[1]); INIT_LIST_HEAD(&pct->cpu_timers[2]); @@ -96,7 +108,7 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { - pct->cputime_expires.sched_exp = runtime; + pct->expiries[CPUCLOCK_SCHED] = runtime; } /* Init task static initializer */ diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h index 2c5c28ddd9b2..3c3e049224ae 100644 --- a/include/linux/sched/types.h +++ b/include/linux/sched/types.h @@ -6,8 +6,8 @@ /** * struct task_cputime - collected CPU time counts - * @utime: time spent in user mode, in nanoseconds * @stime: time spent in kernel mode, in nanoseconds + * @utime: time spent in user mode, in nanoseconds * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * * This structure groups together three kinds of CPU time that are tracked for @@ -15,8 +15,8 @@ * these counts together and treat all three of them in parallel. */ struct task_cputime { - u64 utime; u64 stime; + u64 utime; unsigned long long sum_exec_runtime; }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3e29d1692437..a38b6d04e8b5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -18,13 +18,23 @@ #include "posix-timers.h" +static inline void temporary_check(void) +{ + BUILD_BUG_ON(offsetof(struct task_cputime, stime) != + CPUCLOCK_PROF * sizeof(u64)); + BUILD_BUG_ON(offsetof(struct task_cputime, utime) != + CPUCLOCK_VIRT * sizeof(u64)); + BUILD_BUG_ON(offsetof(struct task_cputime, sum_exec_runtime) != + CPUCLOCK_SCHED * sizeof(u64)); +} + static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { posix_cputimers_init(pct); if (cpu_limit != RLIM_INFINITY) - pct->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; + pct->expiries[CPUCLOCK_PROF] = cpu_limit * NSEC_PER_SEC; } /* -- cgit From 3b495b22d04df3220ccae829bf7c5cadb3059ccf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:08 +0200 Subject: posix-cpu-timers: Simplify timer queueing Now that the expiry cache can be accessed as an array, the per clock checking can be reduced to just comparing the corresponding array elements. 
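A simplified sketch of the resulting arm_timer() logic, with the queue insertion elided (full context in the hunk below):

	int clkidx = CPUCLOCK_WHICH(timer->it_clock);
	u64 *cpuexp, newexp = timer->it.cpu.expires;

	/* Select the per-clock timer queue and expiry cache slot by index */
	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
		head = p->posix_cputimers.cpu_timers + clkidx;
		cpuexp = p->posix_cputimers.expiries + clkidx;
	} else {
		head = p->signal->posix_cputimers.cpu_timers + clkidx;
		cpuexp = p->signal->posix_cputimers.expiries + clkidx;
	}

	/* ... queue the timer, then update the cached earliest expiry: */
	if (expires_gt(*cpuexp, newexp))
		*cpuexp = newexp;
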
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.212129449@linutronix.de --- kernel/time/posix-cpu-timers.c | 55 ++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a38b6d04e8b5..b13241756fdd 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -456,20 +456,20 @@ static inline int expires_gt(u64 expires, u64 new_exp) */ static void arm_timer(struct k_itimer *timer) { + struct cpu_timer_list *const nt = &timer->it.cpu; + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + u64 *cpuexp, newexp = timer->it.cpu.expires; struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; - struct task_cputime *cputime_expires; - struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->posix_cputimers.cpu_timers; - cputime_expires = &p->posix_cputimers.cputime_expires; + head = p->posix_cputimers.cpu_timers + clkidx; + cpuexp = p->posix_cputimers.expiries + clkidx; } else { - head = p->signal->posix_cputimers.cpu_timers; - cputime_expires = &p->signal->posix_cputimers.cputime_expires; + head = p->signal->posix_cputimers.cpu_timers + clkidx; + cpuexp = p->signal->posix_cputimers.expiries + clkidx; } - head += CPUCLOCK_WHICH(timer->it_clock); listpos = head; list_for_each_entry(next, head, entry) { @@ -479,35 +479,22 @@ static void arm_timer(struct k_itimer *timer) } list_add(&nt->entry, listpos); - if (listpos == head) { - u64 exp = nt->expires; + if (listpos != head) + return; - /* - * We are the new earliest-expiring POSIX 1.b timer, hence - * need to update expiration cache. Take into account that - * for process timers we share expiration cache with itimers - * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. - */ + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + if (expires_gt(*cpuexp, newexp)) + *cpuexp = newexp; - switch (CPUCLOCK_WHICH(timer->it_clock)) { - case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp)) - cputime_expires->prof_exp = exp; - break; - case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp)) - cputime_expires->virt_exp = exp; - break; - case CPUCLOCK_SCHED: - if (expires_gt(cputime_expires->sched_exp, exp)) - cputime_expires->sched_exp = exp; - break; - } - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); - else - tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } /* -- cgit From 1b0dd96d0f07c482bf1d04987cc1b35e376a7518 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:09 +0200 Subject: posix-cpu-timers: Simplify set_process_cpu_timer() The expiry cache can now be accessed as an array. Replace the per clock checks with a simple comparison of the clock indexed array member. 
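Sketched, the per-clock switch statement collapses to one indexed compare-and-store (full context in the hunk below):

	u64 now, *expiry = tsk->signal->posix_cputimers.expiries + clkid;

	/*
	 * Update the cache if this is the new earliest timer. The
	 * CPUCLOCK_PROF slot is shared with RLIMIT_CPU.
	 */
	if (expires_gt(*expiry, *newval))
		*expiry = *newval;
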
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.303316423@linutronix.de --- kernel/time/posix-cpu-timers.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b13241756fdd..2c47ce6cab8c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1174,15 +1174,15 @@ void run_posix_cpu_timers(void) * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, u64 *newval, u64 *oldval) { - u64 now; + u64 now, *expiry = tsk->signal->posix_cputimers.expiries + clkid; - if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) + if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) return; - now = cpu_clock_sample_group(clock_idx, tsk, true); + now = cpu_clock_sample_group(clkid, tsk, true); if (oldval) { /* @@ -1205,19 +1205,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } /* - * Update expiration cache if we are the earliest timer, or eventually - * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. + * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF + * expiry cache is also used by RLIMIT_CPU!. */ - switch (clock_idx) { - case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.prof_exp, *newval)) - tsk->signal->posix_cputimers.cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.virt_exp, *newval)) - tsk->signal->posix_cputimers.cputime_expires.virt_exp = *newval; - break; - } + if (expires_gt(*expiry, *newval)) + *expiry = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } -- cgit From c02b078e63a6f42029cb655d0aa3c991271637ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:10 +0200 Subject: posix-cpu-timers: Switch check_*_timers() to array cache Use the array based expiry cache in check_thread_timers() and convert the store in check_process_timers() for consistency. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.408222378@linutronix.de --- kernel/time/posix-cpu-timers.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2c47ce6cab8c..220e3c7ae849 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -778,8 +778,7 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->posix_cputimers.cpu_timers; - struct task_cputime *tsk_expires = &tsk->posix_cputimers.cputime_expires; - u64 expires, stime, utime; + u64 stime, utime, *expires = tsk->posix_cputimers.expiries; unsigned long soft; if (dl_task(tsk)) @@ -789,19 +788,14 @@ static void check_thread_timers(struct task_struct *tsk, * If cputime_expires is zero, then there are no active * per thread CPU timers. 
*/ - if (task_cputime_zero(tsk_expires)) + if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) return; task_cputime(tsk, &utime, &stime); - expires = check_timers_list(timers, firing, utime + stime); - tsk_expires->prof_exp = expires; - - expires = check_timers_list(++timers, firing, utime); - tsk_expires->virt_exp = expires; - - tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk->se.sum_exec_runtime); + *expires++ = check_timers_list(timers, firing, utime + stime); + *expires++ = check_timers_list(++timers, firing, utime); + *expires = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -839,7 +833,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } - if (task_cputime_zero(tsk_expires)) + + if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -958,9 +953,10 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->posix_cputimers.cputime_expires.prof_exp = prof_expires; - sig->posix_cputimers.cputime_expires.virt_exp = virt_expires; - sig->posix_cputimers.cputime_expires.sched_exp = sched_expires; + sig->posix_cputimers.expiries[CPUCLOCK_PROF] = prof_expires; + sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; + sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; + if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) stop_process_timers(sig); -- cgit From bbc9bae1e49bee9862c7f24101a728e73cd9f589 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:11 +0200 Subject: posix-cpu-timers: Remove the odd field rename defines The last users of the odd define based renaming of struct task_cputime members are gone. Good riddance. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.499058279@linutronix.de --- include/linux/posix-timers.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index ed0f6ec30aa6..e36c6fda1af9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -62,21 +62,6 @@ static inline int clockid_to_fd(const clockid_t clk) return ~(clk >> 3); } -/* - * Alternate field names for struct task_cputime when used on cache - * expirations. Will go away soon. - * - * stime corresponds to CLOCKCPU_PROF - * utime corresponds to CLOCKCPU_VIRT - * sum_exex_runtime corresponds to CLOCKCPU_SCHED - * - * The ordering is currently enforced so struct task_cputime and the - * expiries array in struct posix_cputimers are equivalent. - */ -#define prof_exp stime -#define virt_exp utime -#define sched_exp sum_exec_runtime - #ifdef CONFIG_POSIX_TIMERS /** * posix_cputimers - Container for posix CPU timer related data -- cgit From b0d524f77956eec887b30732af1f5f98cbf62b9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:12 +0200 Subject: posix-cpu-timers: Provide array based sample functions Instead of using task_cputime and doing the addition of utime and stime at all call sites, it's way simpler to have a sample array which allows indexed based checks against the expiry cache array. 
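The central helper fills a clock-id indexed sample array, which callers can then compare element-wise against the expiry cache (quoted from the hunk below):

	static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
	{
		samples[CPUCLOCK_PROF] = stime + utime;
		samples[CPUCLOCK_VIRT] = utime;
		samples[CPUCLOCK_SCHED] = rtime;
	}
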
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.590362974@linutronix.de --- kernel/time/posix-cpu-timers.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 220e3c7ae849..11c841c6a45d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -202,6 +202,32 @@ static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) return 0; } +static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) +{ + samples[CPUCLOCK_PROF] = stime + utime; + samples[CPUCLOCK_VIRT] = utime; + samples[CPUCLOCK_SCHED] = rtime; +} + +static void task_sample_cputime(struct task_struct *p, u64 *samples) +{ + u64 stime, utime; + + task_cputime(p, &utime, &stime); + store_samples(samples, stime, utime, p->se.sum_exec_runtime); +} + +static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, + u64 *samples) +{ + u64 stime, utime, rtime; + + utime = atomic64_read(&at->utime); + stime = atomic64_read(&at->stime); + rtime = atomic64_read(&at->sum_exec_runtime); + store_samples(samples, stime, utime, rtime); +} + /* * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg * to avoid race conditions with concurrent updates to cputime. -- cgit From 001f7971433a53bb76443cd8f5827ca27b0bc8ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:13 +0200 Subject: posix-cpu-timers: Make expiry checks array based The expiry cache is an array indexed by clock ids. The new sample functions allow to retrieve a corresponding array of samples. Convert the fastpath expiry checks to make use of the new sample functions and do the comparisons on the sample and the expiry array. Make the check for the expiry array being zero array based as well. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.695481430@linutronix.de --- kernel/time/posix-cpu-timers.c | 85 ++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 11c841c6a45d..e159b039e44a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -39,7 +39,7 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->posix_cputimers.cputime_expires expiration cache if + * tsk->signal->posix_cputimers.expiries expiration cache if * necessary. Needs siglock protection since other code may update * expiration cache as well. */ @@ -132,19 +132,9 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. 
- */ -static inline int task_cputime_zero(const struct task_cputime *cputime) +static inline bool expiry_cache_is_zero(const u64 *ec) { - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; + return !(ec[CPUCLOCK_PROF] | ec[CPUCLOCK_VIRT] | ec[CPUCLOCK_SCHED]); } static int @@ -811,10 +801,10 @@ static void check_thread_timers(struct task_struct *tsk, check_dl_overrun(tsk); /* - * If cputime_expires is zero, then there are no active - * per thread CPU timers. + * If the expiry cache is zero, then there are no active per thread + * CPU timers. */ - if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) return; task_cputime(tsk, &utime, &stime); @@ -860,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -983,7 +973,7 @@ static void check_process_timers(struct task_struct *tsk, sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; - if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(sig->posix_cputimers.expiries)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1048,26 +1038,23 @@ unlock: } /** - * task_cputime_expired - Compare two task_cputime entities. + * task_cputimers_expired - Compare two task_cputime entities. * - * @sample: The task_cputime structure to be checked for expiration. - * @expires: Expiration times, against which @sample will be checked. + * @samples: Array of current samples for the CPUCLOCK clocks + * @expiries: Array of expiry values for the CPUCLOCK clocks * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set. Otherwise returns false. + * Returns true if any mmember of @samples is greater than the corresponding + * member of @expiries if that member is non zero. False otherwise */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) +static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries) { - if (expires->utime && sample->utime >= expires->utime) - return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) - return 1; - if (expires->sum_exec_runtime != 0 && - sample->sum_exec_runtime >= expires->sum_exec_runtime) - return 1; - return 0; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++) { + if (expiries[i] && sample[i] >= expiries[i]) + return true; + } + return false; } /** @@ -1080,18 +1067,17 @@ static inline int task_cputime_expired(const struct task_cputime *sample, * timers and compare them with the corresponding expiration times. Return * true if a timer has expired, else return false. 
*/ -static inline int fastpath_timer_check(struct task_struct *tsk) +static inline bool fastpath_timer_check(struct task_struct *tsk) { + u64 *expiries = tsk->posix_cputimers.expiries; struct signal_struct *sig; - if (!task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) { - struct task_cputime task_sample; + if (!expiry_cache_is_zero(expiries)) { + u64 samples[CPUCLOCK_MAX]; - task_cputime(tsk, &task_sample.utime, &task_sample.stime); - task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; - if (task_cputime_expired(&task_sample, - &tsk->posix_cputimers.cputime_expires)) - return 1; + task_sample_cputime(tsk, samples); + if (task_cputimers_expired(samples, expiries)) + return true; } sig = tsk->signal; @@ -1111,19 +1097,20 @@ static inline int fastpath_timer_check(struct task_struct *tsk) */ if (READ_ONCE(sig->cputimer.running) && !READ_ONCE(sig->cputimer.checking_timer)) { - struct task_cputime group_sample; + u64 samples[CPUCLOCK_MAX]; - sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, + samples); - if (task_cputime_expired(&group_sample, - &sig->posix_cputimers.cputime_expires)) - return 1; + if (task_cputimers_expired(samples, + sig->posix_cputimers.expiries)) + return true; } if (dl_task(tsk) && tsk->dl.dl_overrun) - return 1; + return true; - return 0; + return false; } /* -- cgit From 46b883995c88520f2c4de6a64cccc04c69d59f83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:14 +0200 Subject: posix-cpu-timers: Remove cputime_expires The last users of the magic struct cputime based expiry cache are gone. Remove the leftovers. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.790209622@linutronix.de --- include/linux/posix-timers.h | 9 ++------- kernel/time/posix-cpu-timers.c | 10 ---------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index e36c6fda1af9..fd9098467d6d 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -65,19 +65,14 @@ static inline int clockid_to_fd(const clockid_t clk) #ifdef CONFIG_POSIX_TIMERS /** * posix_cputimers - Container for posix CPU timer related data - * @cputime_expires: Earliest-expiration cache task_cputime based * @expiries: Earliest-expiration cache array based * @cpu_timers: List heads to queue posix CPU timers * * Used in task_struct and signal_struct */ struct posix_cputimers { - /* Temporary union until all users are cleaned up */ - union { - struct task_cputime cputime_expires; - u64 expiries[CPUCLOCK_MAX]; - }; - struct list_head cpu_timers[CPUCLOCK_MAX]; + u64 expiries[CPUCLOCK_MAX]; + struct list_head cpu_timers[CPUCLOCK_MAX]; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e159b039e44a..ffd49181e23d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -18,16 +18,6 @@ #include "posix-timers.h" -static inline void temporary_check(void) -{ - BUILD_BUG_ON(offsetof(struct task_cputime, stime) != - CPUCLOCK_PROF * sizeof(u64)); - BUILD_BUG_ON(offsetof(struct task_cputime, utime) != - CPUCLOCK_VIRT * sizeof(u64)); - BUILD_BUG_ON(offsetof(struct task_cputime, sum_exec_runtime) != - CPUCLOCK_SCHED * sizeof(u64)); -} - static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 
cpu_limit) -- cgit From 87dc64480fb19a6a0fedbdff1e2557be50673287 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 26 Aug 2019 20:22:24 +0200 Subject: posix-cpu-timers: Restructure expiry array Now that the abused struct task_cputime is gone, it's more natural to bundle the expiry cache and the list head of each clock into a struct and have an array of those structs. Follow the hrtimer naming convention of 'bases' and rename the expiry cache to 'nextevt' and adapt all usage sites. Generates also better code .text size shrinks by 80 bytes. Suggested-by: Ingo Molnar Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908262021140.1939@nanos.tec.linutronix.de --- include/linux/posix-timers.h | 41 ++++++++++------ kernel/time/posix-cpu-timers.c | 105 ++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 63 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index fd9098467d6d..64bd10d251fe 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -63,24 +63,33 @@ static inline int clockid_to_fd(const clockid_t clk) } #ifdef CONFIG_POSIX_TIMERS + /** - * posix_cputimers - Container for posix CPU timer related data - * @expiries: Earliest-expiration cache array based + * posix_cputimer_base - Container per posix CPU clock + * @nextevt: Earliest-expiration cache * @cpu_timers: List heads to queue posix CPU timers + */ +struct posix_cputimer_base { + u64 nextevt; + struct list_head cpu_timers; +}; + +/** + * posix_cputimers - Container for posix CPU timer related data + * @bases: Base container for posix CPU clocks * * Used in task_struct and signal_struct */ struct posix_cputimers { - u64 expiries[CPUCLOCK_MAX]; - struct list_head cpu_timers[CPUCLOCK_MAX]; + struct posix_cputimer_base bases[CPUCLOCK_MAX]; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { - memset(&pct->expiries, 0, sizeof(pct->expiries)); - INIT_LIST_HEAD(&pct->cpu_timers[0]); - INIT_LIST_HEAD(&pct->cpu_timers[1]); - INIT_LIST_HEAD(&pct->cpu_timers[2]); + memset(pct->bases, 0, sizeof(pct->bases)); + INIT_LIST_HEAD(&pct->bases[0].cpu_timers); + INIT_LIST_HEAD(&pct->bases[1].cpu_timers); + INIT_LIST_HEAD(&pct->bases[2].cpu_timers); } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); @@ -88,19 +97,23 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { - pct->expiries[CPUCLOCK_SCHED] = runtime; + pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } /* Init task static initializer */ -#define INIT_CPU_TIMERLISTS(c) { \ - LIST_HEAD_INIT(c.cpu_timers[0]), \ - LIST_HEAD_INIT(c.cpu_timers[1]), \ - LIST_HEAD_INIT(c.cpu_timers[2]), \ +#define INIT_CPU_TIMERBASE(b) { \ + .cpu_timers = LIST_HEAD_INIT(b.cpu_timers), \ +} + +#define INIT_CPU_TIMERBASES(b) { \ + INIT_CPU_TIMERBASE(b[0]), \ + INIT_CPU_TIMERBASE(b[1]), \ + INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ - .cpu_timers = INIT_CPU_TIMERLISTS(s.posix_cputimers), \ + .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct posix_cputimers { }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ffd49181e23d..9ac601abc4c4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -24,13 +24,13 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { 
posix_cputimers_init(pct); if (cpu_limit != RLIM_INFINITY) - pct->expiries[CPUCLOCK_PROF] = cpu_limit * NSEC_PER_SEC; + pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; } /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->posix_cputimers.expiries expiration cache if - * necessary. Needs siglock protection since other code may update + * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if + * necessary. Needs siglock protection since other code may update the * expiration cache as well. */ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) @@ -122,9 +122,11 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -static inline bool expiry_cache_is_zero(const u64 *ec) +static inline bool expiry_cache_is_zero(const struct posix_cputimers *pct) { - return !(ec[CPUCLOCK_PROF] | ec[CPUCLOCK_VIRT] | ec[CPUCLOCK_SCHED]); + return !(pct->bases[CPUCLOCK_PROF].nextevt | + pct->bases[CPUCLOCK_VIRT].nextevt | + pct->bases[CPUCLOCK_SCHED].nextevt); } static int @@ -432,9 +434,9 @@ static void cleanup_timers_list(struct list_head *head) */ static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_PROF]); - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_VIRT]); - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_SCHED]); + cleanup_timers_list(&pct->bases[CPUCLOCK_PROF].cpu_timers); + cleanup_timers_list(&pct->bases[CPUCLOCK_VIRT].cpu_timers); + cleanup_timers_list(&pct->bases[CPUCLOCK_SCHED].cpu_timers); } /* @@ -464,21 +466,19 @@ static void arm_timer(struct k_itimer *timer) { struct cpu_timer_list *const nt = &timer->it.cpu; int clkidx = CPUCLOCK_WHICH(timer->it_clock); - u64 *cpuexp, newexp = timer->it.cpu.expires; struct task_struct *p = timer->it.cpu.task; + u64 newexp = timer->it.cpu.expires; + struct posix_cputimer_base *base; struct list_head *head, *listpos; struct cpu_timer_list *next; - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->posix_cputimers.cpu_timers + clkidx; - cpuexp = p->posix_cputimers.expiries + clkidx; - } else { - head = p->signal->posix_cputimers.cpu_timers + clkidx; - cpuexp = p->signal->posix_cputimers.expiries + clkidx; - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + base = p->posix_cputimers.bases + clkidx; + else + base = p->signal->posix_cputimers.bases + clkidx; - listpos = head; - list_for_each_entry(next, head, entry) { + listpos = head = &base->cpu_timers; + list_for_each_entry(next,head, entry) { if (nt->expires < next->expires) break; listpos = &next->entry; @@ -494,8 +494,8 @@ static void arm_timer(struct k_itimer *timer) * for process timers we share expiration cache with itimers * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. 
*/ - if (expires_gt(*cpuexp, newexp)) - *cpuexp = newexp; + if (expires_gt(base->nextevt, newexp)) + base->nextevt = newexp; if (CPUCLOCK_PERTHREAD(timer->it_clock)) tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); @@ -783,9 +783,9 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct list_head *timers = tsk->posix_cputimers.cpu_timers; - u64 stime, utime, *expires = tsk->posix_cputimers.expiries; + struct posix_cputimer_base *base = tsk->posix_cputimers.bases; unsigned long soft; + u64 stime, utime; if (dl_task(tsk)) check_dl_overrun(tsk); @@ -794,14 +794,18 @@ static void check_thread_timers(struct task_struct *tsk, * If the expiry cache is zero, then there are no active per thread * CPU timers. */ - if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&tsk->posix_cputimers)) return; task_cputime(tsk, &utime, &stime); - *expires++ = check_timers_list(timers, firing, utime + stime); - *expires++ = check_timers_list(++timers, firing, utime); - *expires = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); + base->nextevt = check_timers_list(&base->cpu_timers, firing, + utime + stime); + base++; + base->nextevt = check_timers_list(&base->cpu_timers, firing, utime); + base++; + base->nextevt = check_timers_list(&base->cpu_timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -840,7 +844,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&tsk->posix_cputimers)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -884,7 +888,7 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - struct list_head *timers = sig->posix_cputimers.cpu_timers; + struct posix_cputimer_base *base = sig->posix_cputimers.bases; u64 utime, ptime, virt_expires, prof_expires; u64 sum_sched_runtime, sched_expires; struct task_cputime cputime; @@ -912,9 +916,12 @@ static void check_process_timers(struct task_struct *tsk, ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; - prof_expires = check_timers_list(timers, firing, ptime); - virt_expires = check_timers_list(++timers, firing, utime); - sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + prof_expires = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, + firing, ptime); + virt_expires = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, + firing, utime); + sched_expires = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, + firing, sum_sched_runtime); /* * Check for the special case process timers. @@ -959,11 +966,11 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->posix_cputimers.expiries[CPUCLOCK_PROF] = prof_expires; - sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; - sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; + base[CPUCLOCK_PROF].nextevt = prof_expires; + base[CPUCLOCK_VIRT].nextevt = virt_expires; + base[CPUCLOCK_SCHED].nextevt = sched_expires; - if (expiry_cache_is_zero(sig->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&sig->posix_cputimers)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1028,20 +1035,21 @@ unlock: } /** - * task_cputimers_expired - Compare two task_cputime entities. 
+ * task_cputimers_expired - Check whether posix CPU timers are expired * * @samples: Array of current samples for the CPUCLOCK clocks - * @expiries: Array of expiry values for the CPUCLOCK clocks + * @pct: Pointer to a posix_cputimers container * - * Returns true if any mmember of @samples is greater than the corresponding - * member of @expiries if that member is non zero. False otherwise + * Returns true if any member of @samples is greater than the corresponding + * member of @pct->bases[CLK].nextevt. False otherwise */ -static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries) +static inline bool +task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (expiries[i] && sample[i] >= expiries[i]) + if (pct->bases[i].nextevt && sample[i] >= pct->bases[i].nextevt) return true; } return false; @@ -1059,14 +1067,13 @@ static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries */ static inline bool fastpath_timer_check(struct task_struct *tsk) { - u64 *expiries = tsk->posix_cputimers.expiries; struct signal_struct *sig; - if (!expiry_cache_is_zero(expiries)) { + if (!expiry_cache_is_zero(&tsk->posix_cputimers)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); - if (task_cputimers_expired(samples, expiries)) + if (task_cputimers_expired(samples, &tsk->posix_cputimers)) return true; } @@ -1092,8 +1099,7 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - if (task_cputimers_expired(samples, - sig->posix_cputimers.expiries)) + if (task_cputimers_expired(samples, &sig->posix_cputimers)) return true; } @@ -1176,11 +1182,12 @@ void run_posix_cpu_timers(void) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, u64 *newval, u64 *oldval) { - u64 now, *expiry = tsk->signal->posix_cputimers.expiries + clkid; + u64 now, *nextevt; if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) return; + nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; now = cpu_clock_sample_group(clkid, tsk, true); if (oldval) { @@ -1207,8 +1214,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF * expiry cache is also used by RLIMIT_CPU!. */ - if (expires_gt(*expiry, *newval)) - *expiry = *newval; + if (expires_gt(*nextevt, *newval)) + *nextevt = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } -- cgit From b7be4ef1365dcb56fdffc6689e41058b23f5996d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:16 +0200 Subject: posix-cpu-timers: Switch thread group sampling to array That allows more simplifications in various places. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.988426956@linutronix.de --- include/linux/sched/cputime.h | 2 +- kernel/time/itimer.c | 11 ++--- kernel/time/posix-cpu-timers.c | 104 ++++++++++++++++++----------------------- 3 files changed, 49 insertions(+), 68 deletions(-) diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 2638fd0ab3f2..eefa5dff16b4 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -61,7 +61,7 @@ extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, * Thread group CPU time accounting. 
*/ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_sample_cputime(struct task_struct *tsk, struct task_cputime *times); +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples); /* * The following are functions that support scheduler-internal time accounting. diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ae04bc259240..77f1e5635cc1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, val = it->expires; interval = it->incr; if (val) { - struct task_cputime cputime; - u64 t; + u64 t, samples[CPUCLOCK_MAX]; - thread_group_sample_cputime(tsk, &cputime); - if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; - else - /* CPUCLOCK_VIRT */ - t = cputime.utime; + thread_group_sample_cputime(tsk, samples); + t = samples[clock_id]; if (val < t) /* about to fire */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9ac601abc4c4..e62139a89375 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -225,22 +225,14 @@ retry: } } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, + struct task_cputime *sum) { __update_gt_cputime(&cputime_atomic->utime, sum->utime); __update_gt_cputime(&cputime_atomic->stime, sum->stime); __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); } -/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ -static inline void sample_cputime_atomic(struct task_cputime *times, - struct task_cputime_atomic *atomic_times) -{ - times->utime = atomic64_read(&atomic_times->utime); - times->stime = atomic64_read(&atomic_times->stime); - times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); -} - /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started @@ -252,20 +244,19 @@ static inline void sample_cputime_atomic(struct task_cputime *times, * * Updates @times with an uptodate sample of the thread group cputimes. */ -void thread_group_sample_cputime(struct task_struct *tsk, - struct task_cputime *times) +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; WARN_ON_ONCE(!cputimer->running); - sample_cputime_atomic(times, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } /** * thread_group_start_cputime - Start cputime and return a sample * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * The thread group cputime accouting is avoided when there are no posix * CPU timers armed. Before starting a timer it's required to check whether @@ -274,14 +265,14 @@ void thread_group_sample_cputime(struct task_struct *tsk, * * Updates @times with an uptodate sample of the thread group cputimes. */ -static void -thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) +static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; /* Check if cputimer isn't running. This is accessed without locking. 
*/ if (!READ_ONCE(cputimer->running)) { + struct task_cputime sum; + /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have @@ -299,7 +290,15 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) */ WRITE_ONCE(cputimer->running, true); } - sample_cputime_atomic(times, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); +} + +static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) +{ + struct task_cputime ct; + + thread_group_cputime(tsk, &ct); + store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime); } /* @@ -313,28 +312,18 @@ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; - struct task_cputime cputime; + u64 samples[CPUCLOCK_MAX]; if (!READ_ONCE(cputimer->running)) { if (start) - thread_group_start_cputime(p, &cputime); + thread_group_start_cputime(p, samples); else - thread_group_cputime(p, &cputime); + __thread_group_cputime(p, samples); } else { - sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } - switch (clkid) { - case CPUCLOCK_PROF: - return cputime.utime + cputime.stime; - case CPUCLOCK_VIRT: - return cputime.utime; - case CPUCLOCK_SCHED: - return cputime.sum_exec_runtime; - default: - WARN_ON_ONCE(1); - } - return 0; + return samples[clkid]; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) @@ -889,9 +878,7 @@ static void check_process_timers(struct task_struct *tsk, { struct signal_struct *const sig = tsk->signal; struct posix_cputimer_base *base = sig->posix_cputimers.bases; - u64 utime, ptime, virt_expires, prof_expires; - u64 sum_sched_runtime, sched_expires; - struct task_cputime cputime; + u64 virt_exp, prof_exp, sched_exp, samples[CPUCLOCK_MAX]; unsigned long soft; /* @@ -911,30 +898,29 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. Group accounting is active * so the sample can be taken directly. */ - sample_cputime_atomic(&cputime, &sig->cputimer.cputime_atomic); - utime = cputime.utime; - ptime = utime + cputime.stime; - sum_sched_runtime = cputime.sum_exec_runtime; - - prof_expires = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, - firing, ptime); - virt_expires = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, - firing, utime); - sched_expires = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, - firing, sum_sched_runtime); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); + + prof_exp = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, + firing, samples[CPUCLOCK_PROF]); + virt_exp = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, + firing, samples[CPUCLOCK_VIRT]); + sched_exp = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, + firing, samples[CPUCLOCK_SCHED]); /* * Check for the special case process timers. 
*/ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, - SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, - SIGVTALRM); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_exp, + samples[CPUCLOCK_PROF], SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_exp, + samples[CPUCLOCK_PROF], SIGVTALRM); + soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { - unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + u64 softns, ptime = samples[CPUCLOCK_PROF]; unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); - u64 x; + unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + if (psecs >= hard) { /* * At the hard limit, we just die. @@ -961,14 +947,14 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = soft * NSEC_PER_SEC; - if (!prof_expires || x < prof_expires) - prof_expires = x; + softns = soft * NSEC_PER_SEC; + if (!prof_exp || softns < prof_exp) + prof_exp = softns; } - base[CPUCLOCK_PROF].nextevt = prof_expires; - base[CPUCLOCK_VIRT].nextevt = virt_expires; - base[CPUCLOCK_SCHED].nextevt = sched_expires; + base[CPUCLOCK_PROF].nextevt = prof_exp; + base[CPUCLOCK_VIRT].nextevt = virt_exp; + base[CPUCLOCK_SCHED].nextevt = sched_exp; if (expiry_cache_is_zero(&sig->posix_cputimers)) stop_process_timers(sig); -- cgit From fe0517f893d36636de20d0a809fc0c788ca0cade Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:17 +0200 Subject: posix-cpu-timers: Respect INFINITY for hard RTTIME limit The RTTIME limit expiry code does not check the hard RTTIME limit for INFINITY, i.e. being disabled. Add it. This could be considered an ABI breakage if something depended on the old behaviour, though it's highly unlikely to have an effect because RLIM_INFINITY is at minimum INT_MAX and the RTTIME limit is in seconds, so the timer would fire after ~68 years. Adding this obviously correct limit check also allows further consolidation of that code and is a prerequisite for cleaning up the 0 based checks and the rlimit setter code. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.078293002@linutronix.de --- kernel/time/posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e62139a89375..a738d7659915 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -921,7 +921,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); - if (psecs >= hard) { + if (hard != RLIM_INFINITY && psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. -- cgit From 24db4dd90dd53ad6e3331b6f01cb985e466cface Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:18 +0200 Subject: rlimit: Rewrite non-sensical RLIMIT_CPU comment The comment above the function which arms RLIMIT_CPU in the posix CPU timer code makes no sense at all. It claims that the kernel does not return an error code when it rejected the attempt to set RLIMIT_CPU. That's clearly bogus as the code does an error check and the rlimit is only set and activated when the permission checks are ok. In case of a rejection an appropriate error code is returned.
This is a historical and outdated comment which got dragged along even when the rlimit handling code was rewritten. Replace it with an explanation of why the setup function is not called when the rlimit value is RLIM_INFINITY and how the 'disarming' is handled. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.185511287@linutronix.de --- kernel/sys.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..c578b75d7923 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1576,10 +1576,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, task_unlock(tsk->group_leader); /* - * RLIMIT_CPU handling. Note that the kernel fails to return an error - * code if it rejected the user's attempt to set RLIMIT_CPU. This is a - * very long-standing error, and fixing it now risks breakage of - * applications, so we live with it + * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not + * infinite. In case of RLIM_INFINITY the posix CPU timer code + * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && -- cgit From 2bbdbdae05167c688b6d3499a7dab74208b80a22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:19 +0200 Subject: posix-cpu-timers: Get rid of zero checks Deactivation of the expiry cache is done by setting all clock caches to 0. That requires a check for zero in all places which update the expiry cache: if (cache == 0 || new < cache) cache = new; Use U64_MAX as the deactivated value, which allows removing the zero checks when updating the cache and reduces it to the obvious check: if (new < cache) cache = new; This also removes the weird workaround in do_prlimit() which was required to convert a RLIMIT_CPU value of 0 (immediate expiry) to 1 because handing in 0 to the posix CPU timer code would have effectively disarmed it.
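To make that concrete, a minimal sketch (illustration only, not part of the patch; the helper names are made up for the example) of the cache update before and after the conversion:

	/* 0 as the 'disarmed' value: every update needs a zero check */
	static inline void update_expiry_cache_zero(u64 *cache, u64 new)
	{
		if (*cache == 0 || new < *cache)
			*cache = new;
	}

	/* U64_MAX as the 'disarmed' value: a plain comparison suffices,
	 * because any real expiry time is smaller than U64_MAX.
	 */
	static inline void update_expiry_cache_u64max(u64 *cache, u64 new)
	{
		if (new < *cache)
			*cache = new;
	}

The inactive check correspondingly becomes 'all entries are U64_MAX', which is what expiry_cache_is_inactive() below tests.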
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.275086128@linutronix.de --- include/linux/posix-timers.h | 7 +++++-- kernel/sys.c | 9 --------- kernel/time/posix-cpu-timers.c | 38 +++++++++++++++----------------------- 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 64bd10d251fe..3ea920e8fe7f 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -86,7 +86,9 @@ struct posix_cputimers { static inline void posix_cputimers_init(struct posix_cputimers *pct) { - memset(pct->bases, 0, sizeof(pct->bases)); + pct->bases[0].nextevt = U64_MAX; + pct->bases[1].nextevt = U64_MAX; + pct->bases[2].nextevt = U64_MAX; INIT_LIST_HEAD(&pct->bases[0].cpu_timers); INIT_LIST_HEAD(&pct->bases[1].cpu_timers); INIT_LIST_HEAD(&pct->bases[2].cpu_timers); @@ -102,7 +104,8 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ - .cpu_timers = LIST_HEAD_INIT(b.cpu_timers), \ + .nextevt = U64_MAX, \ + .cpu_timers = LIST_HEAD_INIT(b.cpu_timers), \ } #define INIT_CPU_TIMERBASES(b) { \ diff --git a/kernel/sys.c b/kernel/sys.c index c578b75d7923..2462aa84247f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1557,15 +1557,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); - if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim->rlim_cur = 1; - } } if (!retval) { if (old_rlim) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a738d7659915..cf85292575c5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -122,11 +122,12 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -static inline bool expiry_cache_is_zero(const struct posix_cputimers *pct) +/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */ +static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct) { - return !(pct->bases[CPUCLOCK_PROF].nextevt | - pct->bases[CPUCLOCK_VIRT].nextevt | - pct->bases[CPUCLOCK_SCHED].nextevt); + return !(~pct->bases[CPUCLOCK_PROF].nextevt | + ~pct->bases[CPUCLOCK_VIRT].nextevt | + ~pct->bases[CPUCLOCK_SCHED].nextevt); } static int @@ -442,11 +443,6 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(&tsk->signal->posix_cputimers); } -static inline int expires_gt(u64 expires, u64 new_exp) -{ - return expires == 0 || expires > new_exp; -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the sighand lock held. @@ -483,7 +479,7 @@ static void arm_timer(struct k_itimer *timer) * for process timers we share expiration cache with itimers * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. 
*/ - if (expires_gt(base->nextevt, newexp)) + if (newexp < base->nextevt) base->nextevt = newexp; if (CPUCLOCK_PERTHREAD(timer->it_clock)) @@ -753,7 +749,7 @@ check_timers_list(struct list_head *timers, list_move_tail(&t->entry, firing); } - return 0; + return U64_MAX; } static inline void check_dl_overrun(struct task_struct *tsk) @@ -779,11 +775,7 @@ static void check_thread_timers(struct task_struct *tsk, if (dl_task(tsk)) check_dl_overrun(tsk); - /* - * If the expiry cache is zero, then there are no active per thread - * CPU timers. - */ - if (expiry_cache_is_zero(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(&tsk->posix_cputimers)) return; task_cputime(tsk, &utime, &stime); @@ -833,7 +825,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_zero(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(&tsk->posix_cputimers)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -864,7 +856,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) + if (it->expires && it->expires < *expires) *expires = it->expires; } @@ -948,7 +940,7 @@ static void check_process_timers(struct task_struct *tsk, } } softns = soft * NSEC_PER_SEC; - if (!prof_exp || softns < prof_exp) + if (softns < prof_exp) prof_exp = softns; } @@ -956,7 +948,7 @@ static void check_process_timers(struct task_struct *tsk, base[CPUCLOCK_VIRT].nextevt = virt_exp; base[CPUCLOCK_SCHED].nextevt = sched_exp; - if (expiry_cache_is_zero(&sig->posix_cputimers)) + if (expiry_cache_is_inactive(&sig->posix_cputimers)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1035,7 +1027,7 @@ task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (pct->bases[i].nextevt && sample[i] >= pct->bases[i].nextevt) + if (sample[i] >= pct->bases[i].nextevt) return true; } return false; @@ -1055,7 +1047,7 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - if (!expiry_cache_is_zero(&tsk->posix_cputimers)) { + if (!expiry_cache_is_inactive(&tsk->posix_cputimers)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); @@ -1200,7 +1192,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF * expiry cache is also used by RLIMIT_CPU!. */ - if (expires_gt(*nextevt, *newval)) + if (*newval < *nextevt) *nextevt = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); -- cgit From 1cd07c0b94f2c320270d76edb7dd49bceb09c1df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:20 +0200 Subject: posix-cpu-timers: Consolidate timer expiry further With the array based samples and expiry cache, the expiry function can use a loop to collect timers from the clock specific lists. 
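The loop relies on samples[] and pct->bases[] being indexed by the same clock id. A sketch (illustration only; fill_samples() is a made-up name, the real helper in this series is store_samples()) of that invariant, with the clock ids as defined in posix-timers.h:

	static void fill_samples(u64 samples[CPUCLOCK_MAX],
				 u64 stime, u64 utime, u64 runtime)
	{
		samples[CPUCLOCK_PROF]	= stime + utime;
		samples[CPUCLOCK_VIRT]	= utime;
		samples[CPUCLOCK_SCHED]	= runtime;
	}

With that in place each base's timer list can be compared against samples[i] uniformly, which is exactly what collect_posix_cputimers() below does.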
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.365469982@linutronix.de --- kernel/time/posix-cpu-timers.c | 63 ++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cf85292575c5..caafdfdd6f0f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -752,6 +752,18 @@ check_timers_list(struct list_head *timers, return U64_MAX; } +static void collect_posix_cputimers(struct posix_cputimers *pct, + u64 *samples, struct list_head *firing) +{ + struct posix_cputimer_base *base = pct->bases; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++, base++) { + base->nextevt = check_timers_list(&base->cpu_timers, firing, + samples[i]); + } +} + static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { @@ -768,25 +780,18 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct posix_cputimer_base *base = tsk->posix_cputimers.bases; + struct posix_cputimers *pct = &tsk->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; - u64 stime, utime; if (dl_task(tsk)) check_dl_overrun(tsk); - if (expiry_cache_is_inactive(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) return; - task_cputime(tsk, &utime, &stime); - - base->nextevt = check_timers_list(&base->cpu_timers, firing, - utime + stime); - base++; - base->nextevt = check_timers_list(&base->cpu_timers, firing, utime); - base++; - base->nextevt = check_timers_list(&base->cpu_timers, firing, - tsk->se.sum_exec_runtime); + task_sample_cputime(tsk, samples); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case thread timers. @@ -825,7 +830,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_inactive(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -869,15 +874,15 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - struct posix_cputimer_base *base = sig->posix_cputimers.bases; - u64 virt_exp, prof_exp, sched_exp, samples[CPUCLOCK_MAX]; + struct posix_cputimers *pct = &sig->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). */ - if (!READ_ONCE(tsk->signal->cputimer.running)) + if (!READ_ONCE(sig->cputimer.running)) return; /* @@ -891,21 +896,17 @@ static void check_process_timers(struct task_struct *tsk, * so the sample can be taken directly. */ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - - prof_exp = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, - firing, samples[CPUCLOCK_PROF]); - virt_exp = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, - firing, samples[CPUCLOCK_VIRT]); - sched_exp = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, - firing, samples[CPUCLOCK_SCHED]); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case process timers. 
*/ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_exp, + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], + &pct->bases[CPUCLOCK_PROF].nextevt, samples[CPUCLOCK_PROF], SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_exp, - samples[CPUCLOCK_PROF], SIGVTALRM); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], + &pct->bases[CPUCLOCK_VIRT].nextevt, + samples[CPUCLOCK_VIRT], SIGVTALRM); soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { @@ -940,15 +941,11 @@ static void check_process_timers(struct task_struct *tsk, } } softns = soft * NSEC_PER_SEC; - if (softns < prof_exp) - prof_exp = softns; + if (softns < pct->bases[CPUCLOCK_PROF].nextevt) + pct->bases[CPUCLOCK_PROF].nextevt = softns; } - base[CPUCLOCK_PROF].nextevt = prof_exp; - base[CPUCLOCK_VIRT].nextevt = virt_exp; - base[CPUCLOCK_SCHED].nextevt = sched_exp; - - if (expiry_cache_is_inactive(&sig->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) stop_process_timers(sig); sig->cputimer.checking_timer = false; -- cgit From 8ea1de90a5eccdc18c8f05f8596bae8660a3ff9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:21 +0200 Subject: posix-cpu-timers: Get rid of 64bit divisions Instead of dividing A to match the units of B it's more efficient to multiply B to match the units of A. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.458286860@linutronix.de --- kernel/time/posix-cpu-timers.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index caafdfdd6f0f..dcdf9c8241b1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -798,10 +798,11 @@ static void check_thread_timers(struct task_struct *tsk, */ soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ + unsigned long rtim = tsk->rt.timeout * (USEC_PER_SEC / HZ); unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + if (hard != RLIM_INFINITY && rtim >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -813,7 +814,7 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (rtim >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ @@ -910,11 +911,13 @@ static void check_process_timers(struct task_struct *tsk, soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { - u64 softns, ptime = samples[CPUCLOCK_PROF]; + /* RLIMIT_CPU is in seconds. Samples are nanoseconds */ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); - unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + u64 ptime = samples[CPUCLOCK_PROF]; + u64 softns = (u64)soft * NSEC_PER_SEC; + u64 hardns = (u64)hard * NSEC_PER_SEC; - if (hard != RLIM_INFINITY && psecs >= hard) { + if (hard != RLIM_INFINITY && ptime >= hardns) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -926,7 +929,7 @@ static void check_process_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (psecs >= soft) { + if (ptime >= softns) { /* * At the soft limit, send a SIGXCPU every second. 
*/ @@ -936,11 +939,12 @@ static void check_process_timers(struct task_struct *tsk, } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { - soft++; - sig->rlim[RLIMIT_CPU].rlim_cur = soft; + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; } } - softns = soft * NSEC_PER_SEC; + + /* Update the expiry cache */ if (softns < pct->bases[CPUCLOCK_PROF].nextevt) pct->bases[CPUCLOCK_PROF].nextevt = softns; } -- cgit From dd6702241337bcd0bae01d2644b7bae1a496d937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:22 +0200 Subject: posix-cpu-timers: Remove pointless comparisons The soft RLIMIT expiry code checks whether the soft limit is greater than the hard limit. That's pointless because if the soft RLIMIT is greater than the hard RLIMIT then that code cannot be reached as the hard RLIMIT check is before that and already killed the process. Remove it. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.548747613@linutronix.de --- kernel/time/posix-cpu-timers.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index dcdf9c8241b1..115c8dfa4d46 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -814,15 +814,14 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } + if (rtim >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ - if (soft < hard) { - soft += USEC_PER_SEC; - tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = - soft; - } + soft += USEC_PER_SEC; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; + if (print_fatal_signals) { pr_info("RT Watchdog Timeout (soft): %s[%d]\n", tsk->comm, task_pid_nr(tsk)); @@ -938,10 +937,9 @@ static void check_process_timers(struct task_struct *tsk, tsk->comm, task_pid_nr(tsk)); } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (soft < hard) { - sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; - softns += NSEC_PER_SEC; - } + + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; } /* Update the expiry cache */ -- cgit From 8991afe2640d05a805eba01277856e8549cdc838 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:23 +0200 Subject: posix-cpu-timers: Deduplicate rlimit handling Both thread and process expiry functions have the same functionality for sending signals for soft and hard RLIMITs duplicated in 4 different ways. Split it out into a common function and cleanup the callsites. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.653276779@linutronix.de --- kernel/time/posix-cpu-timers.c | 67 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 115c8dfa4d46..ef39a7a4a95c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -772,6 +772,20 @@ static inline void check_dl_overrun(struct task_struct *tsk) } } +static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) +{ + if (time < limit) + return false; + + if (print_fatal_signals) { + pr_info("%s Watchdog Timeout (%s): %s[%d]\n", + rt ? "RT" : "CPU", hard ? 
"hard" : "soft", + current->comm, task_pid_nr(current)); + } + __group_send_sig_info(signo, SEND_SIG_PRIV, current); + return true; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -799,34 +813,18 @@ static void check_thread_timers(struct task_struct *tsk, soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { /* Task RT timeout is accounted in jiffies. RTTIME is usec */ - unsigned long rtim = tsk->rt.timeout * (USEC_PER_SEC / HZ); + unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - if (hard != RLIM_INFINITY && rtim >= hard) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(rttime, hard, SIGKILL, true, true)) return; - } - if (rtim >= soft) { - /* - * At the soft limit, send a SIGXCPU every second. - */ + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { soft += USEC_PER_SEC; tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; - - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -916,28 +914,13 @@ static void check_process_timers(struct task_struct *tsk, u64 softns = (u64)soft * NSEC_PER_SEC; u64 hardns = (u64)hard * NSEC_PER_SEC; - if (hard != RLIM_INFINITY && ptime >= hardns) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(ptime, hardns, SIGKILL, false, true)) return; - } - if (ptime >= softns) { - /* - * At the soft limit, send a SIGXCPU every second. - */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; softns += NSEC_PER_SEC; } -- cgit From 244d49e30653658d4e7e9b2b8427777cbbc5affe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:24 +0200 Subject: posix-cpu-timers: Move state tracking to struct posix_cputimers Put it where it belongs and clean up the ifdeffery in fork completely. 
Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821192922.743229404@linutronix.de --- include/linux/posix-timers.h | 8 +++++ include/linux/sched/cputime.h | 9 ++++-- include/linux/sched/signal.h | 6 ---- init/init_task.c | 2 -- kernel/fork.c | 6 ---- kernel/time/posix-cpu-timers.c | 73 +++++++++++++++++++++++------------------- 6 files changed, 54 insertions(+), 50 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 3ea920e8fe7f..a9e3f69d2db4 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -77,15 +77,23 @@ struct posix_cputimer_base { /** * posix_cputimers - Container for posix CPU timer related data * @bases: Base container for posix CPU clocks + * @timers_active: Timers are queued. + * @expiry_active: Timer expiry is active. Used for + * process wide timers to avoid multiple + * tasks trying to handle expiry concurrently * * Used in task_struct and signal_struct */ struct posix_cputimers { struct posix_cputimer_base bases[CPUCLOCK_MAX]; + unsigned int timers_active; + unsigned int expiry_active; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { + pct->timers_active = 0; + pct->expiry_active = 0; pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index eefa5dff16b4..6c9f19a33865 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -70,7 +70,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples); */ /** - * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running + * get_running_cputimer - return &tsk->signal->cputimer if cputimers are active * * @tsk: Pointer to target task. */ @@ -80,8 +80,11 @@ struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) + /* + * Check whether posix CPU timers are active. If not the thread + * group accounting is not active either. Lockless check. + */ + if (!READ_ONCE(tsk->signal->posix_cputimers.timers_active)) return NULL; /* diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 729bd892ee45..88050259c466 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -57,18 +57,12 @@ struct task_cputime_atomic { /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. - * @running: true when there are timers running and - * @cputime_atomic receives updates. - * @checking_timer: true when a thread in the group is in the - * process of checking for thread group timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations.
*/ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; - bool running; - bool checking_timer; }; struct multiprocess_signals { diff --git a/init/init_task.c b/init/init_task.c index 7ab773b9b3cd..d49692a0ec51 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -30,8 +30,6 @@ static struct signal_struct init_signals = { .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { .cputime_atomic = INIT_CPUTIME_ATOMIC, - .running = false, - .checking_timer = false, }, #endif INIT_CPU_TIMERS(init_signals) diff --git a/kernel/fork.c b/kernel/fork.c index 52bfe7c20ff6..f1228d9f0b11 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,7 +1517,6 @@ void __cleanup_sighand(struct sighand_struct *sighand) } } -#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1528,12 +1527,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); - if (cpu_limit != RLIM_INFINITY) - sig->cputimer.running = true; } -#else -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } -#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ef39a7a4a95c..52f4c99c1d60 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -23,8 +23,10 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { posix_cputimers_init(pct); - if (cpu_limit != RLIM_INFINITY) + if (cpu_limit != RLIM_INFINITY) { pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; + pct->timers_active = true; + } } /* @@ -248,8 +250,9 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; - WARN_ON_ONCE(!cputimer->running); + WARN_ON_ONCE(!pct->timers_active); proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } @@ -269,9 +272,10 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) { + if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; /* @@ -283,13 +287,13 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) update_gt_cputime(&cputimer->cputime_atomic, &sum); /* - * We're setting cputimer->running without a lock. Ensure - * this only gets written to in one operation. We set - * running after update_gt_cputime() as a small optimization, - * but barriers are not required because update_gt_cputime() + * We're setting timers_active without a lock. Ensure this + * only gets written to in one operation. We set it after + * update_gt_cputime() as a small optimization, but + * barriers are not required because update_gt_cputime() * can handle concurrent updates. 
*/ - WRITE_ONCE(cputimer->running, true); + WRITE_ONCE(pct->timers_active, true); } proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } @@ -313,9 +317,10 @@ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; + struct posix_cputimers *pct = &p->signal->posix_cputimers; u64 samples[CPUCLOCK_MAX]; - if (!READ_ONCE(cputimer->running)) { + if (!READ_ONCE(pct->timers_active)) { if (start) thread_group_start_cputime(p, samples); else @@ -834,10 +839,10 @@ static void check_thread_timers(struct task_struct *tsk, static inline void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &sig->cputimer; + struct posix_cputimers *pct = &sig->posix_cputimers; - /* Turn off cputimer->running. This is done without locking. */ - WRITE_ONCE(cputimer->running, false); + /* Turn off the active flag. This is done without locking. */ + WRITE_ONCE(pct->timers_active, false); tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } @@ -877,17 +882,17 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* - * If cputimer is not running, then there are no active - * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + * If there are no active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) nothing to check. */ - if (!READ_ONCE(sig->cputimer.running)) + if (!READ_ONCE(pct->timers_active)) return; /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ - sig->cputimer.checking_timer = true; + pct->expiry_active = true; /* * Collect the current process totals. Group accounting is active @@ -933,7 +938,7 @@ static void check_process_timers(struct task_struct *tsk, if (expiry_cache_is_inactive(pct)) stop_process_timers(sig); - sig->cputimer.checking_timer = false; + pct->expiry_active = false; } /* @@ -1027,39 +1032,41 @@ task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) */ static inline bool fastpath_timer_check(struct task_struct *tsk) { + struct posix_cputimers *pct = &tsk->posix_cputimers; struct signal_struct *sig; - if (!expiry_cache_is_inactive(&tsk->posix_cputimers)) { + if (!expiry_cache_is_inactive(pct)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); - if (task_cputimers_expired(samples, &tsk->posix_cputimers)) + if (task_cputimers_expired(samples, pct)) return true; } sig = tsk->signal; + pct = &sig->posix_cputimers; /* - * Check if thread group timers expired when the cputimer is - * running and no other thread in the group is already checking - * for thread group cputimers. These fields are read without the - * sighand lock. However, this is fine because this is meant to - * be a fastpath heuristic to determine whether we should try to - * acquire the sighand lock to check/handle timers. + * Check if thread group timers expired when timers are active and + * no other thread in the group is already handling expiry for + * thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to be + * a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to handle timer expiry. * - * In the worst case scenario, if 'running' or 'checking_timer' gets - * set but the current thread doesn't see the change yet, we'll wait - * until the next thread in the group gets a scheduler interrupt to - * handle the timer.
This isn't an issue in practice because these - * types of delays with signals actually getting sent are expected. + * In the worst case scenario, if concurrently timers_active is set + * or expiry_active is cleared, but the current thread doesn't see + * the change yet, the timer checks are delayed until the next + * thread in the group gets a scheduler interrupt to handle the + * timer. This isn't an issue in practice because these types of + * delays with signals actually getting sent are expected. */ - if (READ_ONCE(sig->cputimer.running) && - !READ_ONCE(sig->cputimer.checking_timer)) { + if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { u64 samples[CPUCLOCK_MAX]; proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - if (task_cputimers_expired(samples, &sig->posix_cputimers)) + if (task_cputimers_expired(samples, pct)) return true; } -- cgit From 60bda037f1dd8151e0c9ee5b09f0c091a0f643cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Aug 2019 21:31:02 +0200 Subject: posix-cpu-timers: Utilize timerqueue for storage Using a linear O(N) search for timer insertion affects execution time and D-cache footprint badly with a larger number of timers. Switch the storage to a timerqueue which is already used for hrtimers and alarmtimers. It does not affect the size of struct k_itimer as it.alarm is still larger. The extra list head for the expiry list will go away later once the expiry is moved into task work context. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908272129220.1939@nanos.tec.linutronix.de --- include/linux/posix-timers.h | 65 ++++++++++---- include/linux/timerqueue.h | 10 +++ kernel/time/posix-cpu-timers.c | 189 +++++++++++++++++++++-------------------- 3 files changed, 155 insertions(+), 109 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a9e3f69d2db4..f9fbb4c620c1 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -5,17 +5,11 @@ #include #include #include +#include struct kernel_siginfo; struct task_struct; -struct cpu_timer_list { - struct list_head entry; - u64 expires; - struct task_struct *task; - int firing; -}; - /* * Bit fields within a clockid: * @@ -64,14 +58,58 @@ static inline int clockid_to_fd(const clockid_t clk) #ifdef CONFIG_POSIX_TIMERS +/** + * cpu_timer - Posix CPU timer representation for k_itimer + * @node: timerqueue node to queue in the task/sig + * @head: timerqueue head on which this timer is queued + * @task: Pointer to target task + * @elist: List head for the expiry list + * @firing: Timer is currently firing + */ +struct cpu_timer { + struct timerqueue_node node; + struct timerqueue_head *head; + struct task_struct *task; + struct list_head elist; + int firing; +}; + +static inline bool cpu_timer_requeue(struct cpu_timer *ctmr) +{ + return timerqueue_add(ctmr->head, &ctmr->node); +} + +static inline bool cpu_timer_enqueue(struct timerqueue_head *head, + struct cpu_timer *ctmr) +{ + ctmr->head = head; + return timerqueue_add(head, &ctmr->node); +} + +static inline void cpu_timer_dequeue(struct cpu_timer *ctmr) +{ + if (!RB_EMPTY_NODE(&ctmr->node.node)) + timerqueue_del(ctmr->head, &ctmr->node); +} + +static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) +{ + return ctmr->node.expires; +} + +static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) +{ + ctmr->node.expires = exp; +} + /** * posix_cputimer_base - Container per posix CPU clock * 
@nextevt: Earliest-expiration cache - * @cpu_timers: List heads to queue posix CPU timers + * @tqhead: timerqueue head for cpu_timers */ struct posix_cputimer_base { u64 nextevt; - struct list_head cpu_timers; + struct timerqueue_head tqhead; }; /** @@ -92,14 +130,10 @@ struct posix_cputimers { static inline void posix_cputimers_init(struct posix_cputimers *pct) { - pct->timers_active = 0; - pct->expiry_active = 0; + memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; - INIT_LIST_HEAD(&pct->bases[0].cpu_timers); - INIT_LIST_HEAD(&pct->bases[1].cpu_timers); - INIT_LIST_HEAD(&pct->bases[2].cpu_timers); } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); @@ -113,7 +147,6 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ - .cpu_timers = LIST_HEAD_INIT(b.cpu_timers), \ } #define INIT_CPU_TIMERBASES(b) { \ @@ -182,7 +215,7 @@ struct k_itimer { struct { struct hrtimer timer; } real; - struct cpu_timer_list cpu; + struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index aff122f1062a..93884086f392 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -43,6 +43,16 @@ static inline void timerqueue_init(struct timerqueue_node *node) RB_CLEAR_NODE(&node->node); } +static inline bool timerqueue_node_queued(struct timerqueue_node *node) +{ + return !RB_EMPTY_NODE(&node->node); +} + +static inline bool timerqueue_node_expires(struct timerqueue_node *node) +{ + return node->expires; +} + static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 52f4c99c1d60..73c492ce404b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -96,19 +96,19 @@ static inline int validate_clock_permissions(const clockid_t clock) * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static void bump_cpu_timer(struct k_itimer *timer, u64 now) +static u64 bump_cpu_timer(struct k_itimer *timer, u64 now) { + u64 delta, incr, expires = timer->it.cpu.node.expires; int i; - u64 delta, incr; if (!timer->it_interval) - return; + return expires; - if (now < timer->it.cpu.expires) - return; + if (now < expires) + return expires; incr = timer->it_interval; - delta = now + incr - timer->it.cpu.expires; + delta = now + incr - expires; /* Don't use (incr*2 < delta), incr*2 might overflow. */ for (i = 0; incr < delta - incr; i++) @@ -118,10 +118,11 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) if (delta < incr) continue; - timer->it.cpu.expires += incr; + timer->it.cpu.node.expires += incr; timer->it_overrun += 1LL << i; delta -= incr; } + return timer->it.cpu.node.expires; } /* Check whether all cache entries contain U64_MAX, i.e. 
eternal expiry time */ @@ -365,7 +366,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; new_timer->kclock = &clock_posix_cpu; - INIT_LIST_HEAD(&new_timer->it.cpu.entry); + timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.task = p; return 0; } @@ -378,10 +379,11 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - int ret = 0; - unsigned long flags; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + unsigned long flags; + int ret = 0; if (WARN_ON_ONCE(!p)) return -EINVAL; @@ -393,15 +395,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. + * This raced with the reaping of the task. The exit cleanup + * should have removed this timer from the timer queue. */ - WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { if (timer->it.cpu.firing) ret = TIMER_RETRY; else - list_del(&timer->it.cpu.entry); + cpu_timer_dequeue(ctmr); unlock_task_sighand(p, &flags); } @@ -412,12 +414,16 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } -static void cleanup_timers_list(struct list_head *head) +static void cleanup_timerqueue(struct timerqueue_head *head) { - struct cpu_timer_list *timer, *next; + struct timerqueue_node *node; + struct cpu_timer *ctmr; - list_for_each_entry_safe(timer, next, head, entry) - list_del_init(&timer->entry); + while ((node = timerqueue_getnext(head))) { + timerqueue_del(head, node); + ctmr = container_of(node, struct cpu_timer, node); + ctmr->head = NULL; + } } /* @@ -429,9 +435,9 @@ static void cleanup_timers_list(struct list_head *head) */ static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(&pct->bases[CPUCLOCK_PROF].cpu_timers); - cleanup_timers_list(&pct->bases[CPUCLOCK_VIRT].cpu_timers); - cleanup_timers_list(&pct->bases[CPUCLOCK_SCHED].cpu_timers); + cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead); } /* @@ -454,28 +460,18 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer) { - struct cpu_timer_list *const nt = &timer->it.cpu; int clkidx = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p = timer->it.cpu.task; - u64 newexp = timer->it.cpu.expires; + struct cpu_timer *ctmr = &timer->it.cpu; + u64 newexp = cpu_timer_getexpires(ctmr); + struct task_struct *p = ctmr->task; struct posix_cputimer_base *base; - struct list_head *head, *listpos; - struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) base = p->posix_cputimers.bases + clkidx; else base = p->signal->posix_cputimers.bases + clkidx; - listpos = head = &base->cpu_timers; - list_for_each_entry(next,head, entry) { - if (nt->expires < next->expires) - break; - listpos = &next->entry; - } - list_add(&nt->entry, listpos); - - if (listpos != head) + if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; /* @@ -498,24 +494,26 @@ static void arm_timer(struct k_itimer *timer) */ static void cpu_timer_fire(struct k_itimer *timer) { + struct cpu_timer *ctmr = &timer->it.cpu; + if ((timer->it_sigev_notify & 
 		/*
 		 * User don't want any signal.
 		 */
-		timer->it.cpu.expires = 0;
+		cpu_timer_setexpires(ctmr, 0);
 	} else if (unlikely(timer->sigq == NULL)) {
 		/*
 		 * This a special case for clock_nanosleep,
 		 * not a normal timer from sys_timer_create.
 		 */
 		wake_up_process(timer->it_process);
-		timer->it.cpu.expires = 0;
+		cpu_timer_setexpires(ctmr, 0);
 	} else if (!timer->it_interval) {
 		/*
 		 * One-shot timer. Clear it as soon as it's fired.
 		 */
 		posix_timer_event(timer, 0);
-		timer->it.cpu.expires = 0;
+		cpu_timer_setexpires(ctmr, 0);
 	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
 		/*
 		 * The signal did not get queued because the signal
@@ -539,10 +537,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 {
 	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
 	u64 old_expires, new_expires, old_incr, val;
-	struct task_struct *p = timer->it.cpu.task;
+	struct cpu_timer *ctmr = &timer->it.cpu;
+	struct task_struct *p = ctmr->task;
 	struct sighand_struct *sighand;
 	unsigned long flags;
-	int ret;
+	int ret = 0;
 
 	if (WARN_ON_ONCE(!p))
 		return -EINVAL;
@@ -562,22 +561,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	 * If p has just been reaped, we can no
 	 * longer get any information about it at all.
 	 */
-	if (unlikely(sighand == NULL)) {
+	if (unlikely(sighand == NULL))
 		return -ESRCH;
-	}
 
 	/*
 	 * Disarm any old timer after extracting its expiry time.
 	 */
-
-	ret = 0;
 	old_incr = timer->it_interval;
-	old_expires = timer->it.cpu.expires;
+	old_expires = cpu_timer_getexpires(ctmr);
+
 	if (unlikely(timer->it.cpu.firing)) {
 		timer->it.cpu.firing = -1;
 		ret = TIMER_RETRY;
-	} else
-		list_del_init(&timer->it.cpu.entry);
+	} else {
+		cpu_timer_dequeue(ctmr);
+	}
 
 	/*
 	 * We need to sample the current value to convert the new
@@ -598,18 +596,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 			old->it_value.tv_nsec = 0;
 		} else {
 			/*
-			 * Update the timer in case it has
-			 * overrun already. If it has,
-			 * we'll report it as having overrun
-			 * and with the next reloaded timer
-			 * already ticking, though we are
-			 * swallowing that pending
-			 * notification here to install the
-			 * new setting.
+			 * Update the timer in case it has overrun already.
+			 * If it has, we'll report it as having overrun and
+			 * with the next reloaded timer already ticking,
+			 * though we are swallowing that pending
+			 * notification here to install the new setting.
 			 */
-			bump_cpu_timer(timer, val);
-			if (val < timer->it.cpu.expires) {
-				old_expires = timer->it.cpu.expires - val;
+			u64 exp = bump_cpu_timer(timer, val);
+
+			if (val < exp) {
+				old_expires = exp - val;
 				old->it_value = ns_to_timespec64(old_expires);
 			} else {
 				old->it_value.tv_nsec = 1;
@@ -638,7 +634,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	 * For a timer with no notification action, we don't actually
 	 * arm the timer (we'll just fake it for timer_gettime).
 	 */
-	timer->it.cpu.expires = new_expires;
+	cpu_timer_setexpires(ctmr, new_expires);
 	if (new_expires != 0 && val < new_expires) {
 		arm_timer(timer);
 	}
@@ -680,8 +676,9 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
 {
 	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
-	struct task_struct *p = timer->it.cpu.task;
-	u64 now;
+	struct cpu_timer *ctmr = &timer->it.cpu;
+	u64 now, expires = cpu_timer_getexpires(ctmr);
+	struct task_struct *p = ctmr->task;
 
 	if (WARN_ON_ONCE(!p))
 		return;
@@ -691,7 +688,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
 	 */
 	itp->it_interval = ktime_to_timespec64(timer->it_interval);
 
-	if (!timer->it.cpu.expires)
+	if (!expires)
 		return;
 
 	/*
@@ -713,9 +710,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
 			/*
 			 * The process has been reaped.
 			 * We can't even collect a sample any more.
-			 * Call the timer disarmed, nothing else to do.
+			 * Disarm the timer, nothing else to do.
 			 */
-			timer->it.cpu.expires = 0;
+			cpu_timer_setexpires(ctmr, 0);
 			return;
 		} else {
 			now = cpu_clock_sample_group(clkid, p, false);
 		}
 	}
 
-	if (now < timer->it.cpu.expires) {
-		itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
+	if (now < expires) {
+		itp->it_value = ns_to_timespec64(expires - now);
 	} else {
 		/*
 		 * The timer should have expired already, but the firing
@@ -735,37 +732,41 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
 	}
 }
 
-static unsigned long long
-check_timers_list(struct list_head *timers,
-		  struct list_head *firing,
-		  unsigned long long curr)
-{
-	int maxfire = 20;
-
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t;
-
-		t = list_first_entry(timers, struct cpu_timer_list, entry);
+#define MAX_COLLECTED	20
 
-		if (!--maxfire || curr < t->expires)
-			return t->expires;
-
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
+static u64 collect_timerqueue(struct timerqueue_head *head,
+			      struct list_head *firing, u64 now)
+{
+	struct timerqueue_node *next;
+	int i = 0;
+
+	while ((next = timerqueue_getnext(head))) {
+		struct cpu_timer *ctmr;
+		u64 expires;
+
+		ctmr = container_of(next, struct cpu_timer, node);
+		expires = cpu_timer_getexpires(ctmr);
+		/* Limit the number of timers to expire at once */
+		if (++i == MAX_COLLECTED || now < expires)
+			return expires;
+
+		ctmr->firing = 1;
+		cpu_timer_dequeue(ctmr);
+		list_add_tail(&ctmr->elist, firing);
 	}
 
 	return U64_MAX;
 }
 
-static void collect_posix_cputimers(struct posix_cputimers *pct,
-				    u64 *samples, struct list_head *firing)
+static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
+				    struct list_head *firing)
 {
 	struct posix_cputimer_base *base = pct->bases;
 	int i;
 
 	for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
-		base->nextevt = check_timers_list(&base->cpu_timers, firing,
-						  samples[i]);
+		base->nextevt = collect_timerqueue(&base->tqhead, firing,
+						   samples[i]);
 	}
 }
 
@@ -948,7 +949,8 @@ static void check_process_timers(struct task_struct *tsk,
 static void posix_cpu_timer_rearm(struct k_itimer *timer)
 {
 	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
-	struct task_struct *p = timer->it.cpu.task;
+	struct cpu_timer *ctmr = &timer->it.cpu;
+	struct task_struct *p = ctmr->task;
 	struct sighand_struct *sighand;
 	unsigned long flags;
 	u64 now;
 
@@ -980,7 +982,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
 			 * The process has been reaped.
 			 * We can't even collect a sample any more.
 			 */
-			timer->it.cpu.expires = 0;
+			cpu_timer_setexpires(ctmr, 0);
 			return;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
 			/* If the process is dying, no need to rearm */
@@ -1124,11 +1126,11 @@ void run_posix_cpu_timers(void)
 	 * each timer's lock before clearing its firing flag, so no
 	 * timer call will interfere.
 	 */
-	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
 		int cpu_firing;
 
 		spin_lock(&timer->it_lock);
-		list_del_init(&timer->it.cpu.entry);
+		list_del_init(&timer->it.cpu.elist);
 		cpu_firing = timer->it.cpu.firing;
 		timer->it.cpu.firing = 0;
 		/*
@@ -1204,6 +1206,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	timer.it_overrun = -1;
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
+
 	if (!error) {
 		static struct itimerspec64 zero_it;
 		struct restart_block *restart;
@@ -1219,7 +1222,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		}
 
 		while (!signal_pending(current)) {
-			if (timer.it.cpu.expires == 0) {
+			if (!cpu_timer_getexpires(&timer.it.cpu)) {
 				/*
 				 * Our timer fired and was reset, below
 				 * deletion can not fail.
@@ -1241,7 +1244,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		/*
 		 * We were interrupted by a signal.
 		 */
-		expires = timer.it.cpu.expires;
+		expires = cpu_timer_getexpires(&timer.it.cpu);
 		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
 		if (!error) {
 			/*
-- cgit
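[ Illustration, not part of the patch series: the conversion above relies on the contract visible in arm_timer() — cpu_timer_enqueue() reports whether the freshly queued timer became the earliest one, so the caller only updates the cached next-expiry in that case. The sketch below is a minimal userspace stand-in for that contract, not kernel code; struct qhead, toy_enqueue() and the sorted array that replaces the rbtree are invented names. ]

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct qhead {
            uint64_t expires[16];   /* sorted stand-in for the rbtree */
            int nr;
    };

    /* Mirrors the cpu_timer_enqueue() contract: true iff new earliest */
    static bool toy_enqueue(struct qhead *q, uint64_t expires)
    {
            int i = q->nr++;

            /* insertion sort keeps the earliest expiry at index 0 */
            while (i > 0 && q->expires[i - 1] > expires) {
                    q->expires[i] = q->expires[i - 1];
                    i--;
            }
            q->expires[i] = expires;
            return i == 0;
    }

    int main(void)
    {
            struct qhead q = { .nr = 0 };
            uint64_t nextevt = UINT64_MAX;  /* mirrors base->nextevt */
            uint64_t expiries[] = { 300, 100, 200 };

            for (int i = 0; i < 3; i++) {
                    /* like arm_timer(): only a new earliest timer
                     * requires updating the cached expiry */
                    if (toy_enqueue(&q, expiries[i]))
                            nextevt = expiries[i];
            }
            printf("cached next expiry: %llu\n",
                   (unsigned long long)nextevt);
            return 0;
    }

Compiled with any C compiler this prints 100, the earliest of the three queued expiries.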
From 41cfe2a2a7f4fad5647031ad3a1da166452b5437 Mon Sep 17 00:00:00 2001
From: Tianyu Lan
Date: Wed, 28 Aug 2019 16:07:47 +0800
Subject: x86/hyperv: Hide pv_ops access for CONFIG_PARAVIRT=n

hv_setup_sched_clock() references pv_ops, which is only available when
CONFIG_PARAVIRT=y. Wrap it into an #ifdef.

Signed-off-by: Tianyu Lan
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190828080747.204419-1-Tianyu.Lan@microsoft.com
---
 arch/x86/kernel/cpu/mshyperv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 53afd33990eb..267daad8c036 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -346,7 +346,9 @@ static void __init ms_hyperv_init_platform(void)
 
 void hv_setup_sched_clock(void *sched_clock)
 {
+#ifdef CONFIG_PARAVIRT
 	pv_ops.time.sched_clock = sched_clock;
+#endif
 }
 
 const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
-- cgit

From a67e408241783575716fcf3f79d0878f6cef0273 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 23 Aug 2019 13:38:44 +0200
Subject: hrtimer: Add kernel doc annotation for HRTIMER_MODE_HARD

Add kernel doc annotation for HRTIMER_MODE_HARD.

Fixes: ae6683d815895 ("hrtimer: Introduce HARD expiry mode")
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190823113845.12125-2-bigeasy@linutronix.de
---
 include/linux/hrtimer.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5df4bcff96d5..1b9a51a1bccb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -32,6 +32,8 @@ struct hrtimer_cpu_base;
  *				  when starting the timer)
  * HRTIMER_MODE_SOFT		- Timer callback function will be executed in
  *				  soft irq context
+ * HRTIMER_MODE_HARD		- Timer callback function will be executed in
+ *				  hard irq context even on PREEMPT_RT.
  */
 enum hrtimer_mode {
 	HRTIMER_MODE_ABS	= 0x00,
-- cgit

From 71fed982d63cb2bb88db6f36059e3b14a7913846 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 23 Aug 2019 13:38:45 +0200
Subject: tick: Mark sched_timer to expire in hard interrupt context

sched_timer must be initialized with the _HARD mode suffix to ensure
expiry in hard interrupt context on RT. The previous conversion to HARD
expiry mode missed one instance in tick_nohz_switch_to_nohz(). Fix it
up.

Fixes: 902a9f9c50905 ("tick: Mark tick related hrtimers to expiry in hard interrupt context")
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190823113845.12125-3-bigeasy@linutronix.de
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 01ff32a02af2..955851748dc3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1233,7 +1233,7 @@ static void tick_nohz_switch_to_nohz(void)
 	 * Recycle the hrtimer in ts, so we can share the
 	 * hrtimer_forward with the highres code.
 	 */
-	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 
 	/* Get the next period */
 	next = tick_init_jiffy_update();
-- cgit

From 8f2edb4a78f7f5fa35c025849152b1d2dfaee4eb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 29 Aug 2019 08:19:40 +0200
Subject: posix-timers: Unbreak CONFIG_POSIX_TIMERS=n build

The rework of the posix-cpu-timers patch series dropped the empty
declaration of struct cpu_timer for the CONFIG_POSIX_TIMERS=n case,
which causes the build to fail:

  ./include/linux/posix-timers.h:218:20: error: field 'cpu' has incomplete type

Add it back.

Fixes: 60bda037f1dd ("posix-cpu-timers: Utilize timerqueue for storage")
Reported-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 include/linux/posix-timers.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index f9fbb4c620c1..e685916551bf 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -161,6 +161,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 	},
 #else
 struct posix_cputimers { };
+struct cpu_timer { };
 #define INIT_CPU_TIMERS(s)
 static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
 static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
-- cgit
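[ Illustration, not part of the patch series: the one-line fix above uses a standard kernel pattern for config-off builds — feature types are kept as empty structs so that structures embedding them still compile at zero size (a GNU C extension the kernel builds with), while the accessors collapse to no-op inlines. FEATURE_X, struct feature_state, feature_init() and struct owner below are all invented for this sketch, not kernel names. ]

    #include <stdio.h>

    #ifdef FEATURE_X
    struct feature_state {
            int counter;
    };
    static inline void feature_init(struct feature_state *st)
    {
            st->counter = 0;
    }
    #else
    /* empty type: fields of this type below stay valid, at zero size */
    struct feature_state { };
    static inline void feature_init(struct feature_state *st)
    {
            (void)st;       /* no-op stub for the config-off build */
    }
    #endif

    struct owner {
            struct feature_state state;  /* compiles either way */
    };

    int main(void)
    {
            struct owner o;

            feature_init(&o.state);
            printf("sizeof(struct feature_state) = %zu\n",
                   sizeof(o.state));
            return 0;
    }

Dropping the empty struct cpu_timer declaration is exactly what produced the "field 'cpu' has incomplete type" error quoted in the changelog.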
From a2ed4fd685cd23e98922f933d5dbccfbe82a4f08 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 29 Aug 2019 12:52:28 +0200
Subject: posix-cpu-timers: Make expiry_active check actually work correctly

The state tracking changes broke the expiry_active check by not writing
to it and instead setting timers_active, which is already set. That's not
a big issue as the actual expiry is protected by sighand lock, so
concurrent handling is not possible. It does mean that a second task
which invokes that function executes the expiry code for nothing.

Write to the proper flag.

Also add a check whether the flag is set into check_process_timers().
That check had already been missing before the rework: the check for
another task handling the expiry of process wide timers was only done in
the fastpath check. If the fastpath check returns true because a per task
timer expired, then the process wide timers were checked in parallel,
which is, as explained above, just a waste of cycles.

Fixes: 244d49e30653 ("posix-cpu-timers: Move state tracking to struct posix_cputimers")
Signed-off-by: Thomas Gleixner
Cc: Frederic Weisbecker
---
 kernel/time/posix-cpu-timers.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 73c492ce404b..c3a95b122209 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -884,16 +884,17 @@ static void check_process_timers(struct task_struct *tsk,
 
 	/*
 	 * If there are no active process wide timers (POSIX 1.b, itimers,
-	 * RLIMIT_CPU) nothing to check.
+	 * RLIMIT_CPU) nothing to check. Also skip the process wide timer
+	 * processing when there is already another task handling them.
 	 */
-	if (!READ_ONCE(pct->timers_active))
+	if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
 		return;
 
-	/*
+	/*
 	 * Signify that a thread is checking for process timers.
 	 * Write access to this field is protected by the sighand lock.
 	 */
-	pct->timers_active = true;
+	pct->expiry_active = true;
 
 	/*
 	 * Collect the current process totals. Group accounting is active
-- cgit
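[ Illustration, not part of the patch series: the repaired logic is a two-flag guard — timers_active says process wide timers may exist at all, expiry_active says some task is already running the expiry code, and the slowpath returns early on either condition. The model below is illustrative only; the two flag names mirror the kernel's, everything else is invented, and the kernel serializes the writers with the sighand lock rather than running single-threaded as this sketch does. ]

    #include <stdbool.h>
    #include <stdio.h>

    struct cputimers_state {
            bool timers_active;     /* any process wide timers queued */
            bool expiry_active;     /* expiry currently being processed */
    };

    static void check_process_timers_model(struct cputimers_state *st)
    {
            /* nothing to do, or another task already handles expiry */
            if (!st->timers_active || st->expiry_active)
                    return;

            st->expiry_active = true;  /* the bug set timers_active here */
            /* ... collect and fire expired timers ... */
            st->expiry_active = false;
    }

    int main(void)
    {
            struct cputimers_state st = { .timers_active = true };

            check_process_timers_model(&st);
            printf("expiry_active after run: %d\n", st.expiry_active);
            return 0;
    }

Writing timers_active instead of expiry_active left the "already handled" signal permanently false, so every task entering the slowpath redid the work.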
From 5d2295f3a93b04986d069ebeaf5b07725f9096c1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 4 Sep 2019 16:55:27 +0200
Subject: hrtimer: Add a missing bracket and hide `migration_base' on !SMP

The recent change to avoid taking the expiry lock when a timer is
currently migrated failed to add a bracket at the end of the if
statement, leading to compile errors. Since that commit the variable
`migration_base' is always used, but it is only available on SMP
configurations, thus leading to another compile error.

The changelog says "The timer base and base->cpu_base cannot be NULL in
the code path", so it is safe to limit this check to SMP configurations
only.

Add the missing bracket to the if statement and hide `migration_base'
behind CONFIG_SMP bars.

[ tglx: Mark the functions inline ... ]

Fixes: 68b2c8c1e4210 ("hrtimer: Don't take expiry_lock when timer is currently migrated")
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190904145527.eah7z56ntwobqm6j@linutronix.de
---
 kernel/time/hrtimer.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ebbd0fb3512b..0d4dc241c0fb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = {
 
 #define migration_base	migration_cpu_base.clock_base[0]
 
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+	return base == &migration_base;
+}
+
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
@@ -264,6 +269,11 @@ again:
 
 #else /* CONFIG_SMP */
 
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+	return false;
+}
+
 static inline struct hrtimer_clock_base *
 lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
@@ -1221,7 +1231,7 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
 	 * Just relax if the timer expires in hard interrupt context or if
 	 * it is currently on the migration base.
 	 */
-	if (!timer->is_soft || base == &migration_base)
+	if (!timer->is_soft || is_migration_base(base)) {
 		cpu_relax();
 		return;
 	}
-- cgit

From 00d9e47f8ec2a293db9ebed86aab0583d9a49533 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 5 Sep 2019 14:03:40 +0200
Subject: posix-cpu-timers: Always clear head pointer on dequeue

The head pointer in struct cpu_timer is checked for NULL in
posix_cpu_timer_del() when the delete raced with the exit cleanup. This
works correctly as long as the timer is actually dequeued via
posix_cpu_timers_exit*(). But if the timer was dequeued due to expiry,
the head pointer is still set and triggers the warning.

In fact, keeping the head pointer around after any dequeue is pointless,
as it has no meaning at all after that.

Clear the head pointer always on dequeue and remove the unused requeue
function while at it.

Fixes: 60bda037f1dd ("posix-cpu-timers: Utilize timerqueue for storage")
Reported-by: syzbot+55acd54b57bb4b3840a4@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lkml.kernel.org/r/20190905120539.707986830@linutronix.de
---
 include/linux/posix-timers.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index e685916551bf..3d10c84a97a9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -74,11 +74,6 @@ struct cpu_timer {
 	int			firing;
 };
 
-static inline bool cpu_timer_requeue(struct cpu_timer *ctmr)
-{
-	return timerqueue_add(ctmr->head, &ctmr->node);
-}
-
 static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
 				     struct cpu_timer *ctmr)
 {
@@ -88,8 +83,10 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
 
 static inline void cpu_timer_dequeue(struct cpu_timer *ctmr)
 {
-	if (!RB_EMPTY_NODE(&ctmr->node.node))
+	if (ctmr->head) {
 		timerqueue_del(ctmr->head, &ctmr->node);
+		ctmr->head = NULL;
+	}
 }
 
 static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr)
-- cgit
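[ Illustration, not part of the patch series: the invariant this fix establishes is that a node's back pointer to its queue (head) is non-NULL exactly while the node is queued, every dequeue path clears it, and a racing second dequeue therefore degrades to a safe no-op. A minimal model under those assumptions, using a plain singly linked list instead of the timerqueue; struct queue, struct node, enqueue() and dequeue() are invented names. ]

    #include <assert.h>
    #include <stddef.h>

    struct node;

    struct queue {
            struct node *first;
    };

    struct node {
            struct node *next;
            struct queue *head;     /* non-NULL iff currently queued */
    };

    static void enqueue(struct queue *q, struct node *n)
    {
            n->next = q->first;
            q->first = n;
            n->head = q;            /* record ownership */
    }

    static void dequeue(struct node *n)
    {
            struct node **pp;

            if (!n->head)           /* already dequeued: nothing to do */
                    return;
            /* unlink n from its owning queue */
            for (pp = &n->head->first; *pp != n; pp = &(*pp)->next)
                    ;
            *pp = n->next;
            n->head = NULL;         /* always clear ownership on dequeue */
    }

    int main(void)
    {
            struct queue q = { NULL };
            struct node n = { NULL, NULL };

            enqueue(&q, &n);
            dequeue(&n);            /* e.g. the expiry path */
            dequeue(&n);            /* e.g. a racing timer delete: no-op */
            assert(q.first == NULL && n.head == NULL);
            return 0;
    }

Before the fix, the expiry-side dequeue left head set, so the NULL check in the delete path misfired and triggered the warning.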
From 77b4b5420422fc037d00b8f3f0e89b2262e4ae29 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 5 Sep 2019 23:15:08 +0200
Subject: posix-cpu-timers: Fix permission check regression

The recent consolidation of the three permission checks introduced a
subtle regression. For timer_create() with a process wide timer, the
check returns the current task whenever the lookup through the PID
encoded into the clockid results in current. That's broken because it
does not validate whether the current task is the group leader.

That was caused by the two different variants of permission checks:

  - posix_cpu_timer_get() allowed access to the process wide clock when
    the looked up task is current. That's not an issue because the
    process wide clock is in the shared sighand.

  - posix_cpu_timer_create() made sure that the looked up task is the
    group leader.

Restore the previous state.

Note, that these permission checks are more than questionable, but that's
subject to follow up changes.

Fixes: 6ae40e3fdcd3 ("posix-cpu-timers: Provide task validation functions")
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1909052314110.1902@nanos.tec.linutronix.de
---
 kernel/time/posix-cpu-timers.c | 44 +++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index c3a95b122209..92a431981b1c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -47,25 +47,46 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
 /*
  * Functions for validating access to tasks.
  */
-static struct task_struct *lookup_task(const pid_t pid, bool thread)
+static struct task_struct *lookup_task(const pid_t pid, bool thread,
+				       bool gettime)
 {
 	struct task_struct *p;
 
+	/*
+	 * If the encoded PID is 0, then the timer is targeted at current
+	 * or the process to which current belongs.
+	 */
 	if (!pid)
 		return thread ? current : current->group_leader;
 
 	p = find_task_by_vpid(pid);
-	if (!p || p == current)
+	if (!p)
 		return p;
+
 	if (thread)
 		return same_thread_group(p, current) ? p : NULL;
-	if (p == current)
-		return p;
+
+	if (gettime) {
+		/*
+		 * For clock_gettime(PROCESS) the task does not need to be
+		 * the actual group leader. tsk->sighand gives
+		 * access to the group's clock.
+		 *
+		 * Timers need the group leader because they take a
+		 * reference on it and store the task pointer until the
+		 * timer is destroyed.
+		 */
+		return (p == current || thread_group_leader(p)) ? p : NULL;
+	}
+
+	/*
+	 * For processes require that p is group leader.
+	 */
 	return has_group_leader_pid(p) ? p : NULL;
 }
 
 static struct task_struct *__get_task_for_clock(const clockid_t clock,
-						bool getref)
+						bool getref, bool gettime)
 {
 	const bool thread = !!CPUCLOCK_PERTHREAD(clock);
 	const pid_t pid = CPUCLOCK_PID(clock);
@@ -75,7 +96,7 @@ static struct task_struct *__get_task_for_clock(const clockid_t clock,
 		return NULL;
 
 	rcu_read_lock();
-	p = lookup_task(pid, thread);
+	p = lookup_task(pid, thread, gettime);
 	if (p && getref)
 		get_task_struct(p);
 	rcu_read_unlock();
@@ -84,12 +105,17 @@ static struct task_struct *__get_task_for_clock(const clockid_t clock,
 
 static inline struct task_struct *get_task_for_clock(const clockid_t clock)
 {
-	return __get_task_for_clock(clock, true);
+	return __get_task_for_clock(clock, true, false);
+}
+
+static inline struct task_struct *get_task_for_clock_get(const clockid_t clock)
+{
+	return __get_task_for_clock(clock, true, true);
 }
 
 static inline int validate_clock_permissions(const clockid_t clock)
 {
-	return __get_task_for_clock(clock, false) ? 0 : -EINVAL;
+	return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;
 }
 
 /*
@@ -339,7 +365,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
 	struct task_struct *tsk;
 	u64 t;
 
-	tsk = get_task_for_clock(clock);
+	tsk = get_task_for_clock_get(clock);
 	if (!tsk)
 		return -EINVAL;
-- cgit
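[ Illustration, not part of the patch series: the permission checks above are exercised directly from userspace. The program below obtains the caller's process wide CPU clock through the standard libc helper clock_getcpuclockid(), which encodes the PID into a clockid that the CPUCLOCK_PID()/CPUCLOCK_WHICH() macros later decode, then reads it with clock_gettime(); that syscall lands in posix_cpu_clock_get() and thus in the relaxed gettime lookup restored by this fix. This is a usage sketch of the POSIX interfaces, not a reproducer for the regression. ]

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
            clockid_t clk;
            struct timespec ts;
            int err;

            /* encodes getpid() into a process wide CPUCLOCK clockid */
            err = clock_getcpuclockid(getpid(), &clk);
            if (err) {
                    fprintf(stderr, "clock_getcpuclockid: %d\n", err);
                    return 1;
            }
            if (clock_gettime(clk, &ts) != 0) {
                    perror("clock_gettime");
                    return 1;
            }
            printf("process CPU time: %ld.%09ld s\n",
                   (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }

With a PID of 0 (or the caller's own PID, as here) the clock refers to the caller's own process; reading another process's clock, or creating a timer on it, is what the restored group-leader validation polices.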