summaryrefslogtreecommitdiff
path: root/kernel/rcu/tree_exp.h
diff options
context:
space:
mode:
authorFrederic Weisbecker <frederic@kernel.org>2024-01-12 16:46:21 +0100
committerBoqun Feng <boqun.feng@gmail.com>2024-02-14 07:51:36 -0800
commit23da2ad64dbe9f3fab10af90484fe41e144337b1 (patch)
treedbdfc8b44dbb2d048b0d35b005e3c5e500f136ff /kernel/rcu/tree_exp.h
parentb67cffcbbf9dc759d95d330a5af5d1480af2b1f1 (diff)
rcu/exp: Remove rcu_par_gp_wq
TREE04 running on short iterations can produce writer stalls of the following kind: ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0 task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule+0x2de/0x850 ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0 schedule+0x4f/0x90 synchronize_rcu_expedited+0x430/0x670 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_synchronize_rcu_expedited+0x10/0x10 do_rtws_sync.constprop.0+0xde/0x230 rcu_torture_writer+0x4b4/0xcd0 ? __pfx_rcu_torture_writer+0x10/0x10 kthread+0xc7/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 </TASK> Waiting for an expedited grace period and polling for an expedited grace period both are operations that internally rely on the same workqueue performing necessary asynchronous work. However, a dependency chain is involved between those two operations, as depicted below: ====== CPU 0 ======= ====== CPU 1 ======= synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); start_poll_synchronize_rcu_expedited queue_work(rcu_gp_wq, &rnp->exp_poll_wq); synchronize_rcu_expedited_queue_work() queue_work(rcu_gp_wq, &rew->rew_work); wait_event() // A, wait for &rew->rew_work completion mutex_unlock() // B //======> switch to kworker sync_rcu_do_polled_gp() { synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); // C, wait B .... } // D Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time because A then doesn't depend on D. But in case of memory stress, a single kworker may end up handling alone all the works in a serialized way. In that case the above layout becomes a problem because A then waits for D, closing a circular dependency: A -> D -> C -> B -> A This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on rcu_gp_wq workqueue, breaking the above circular dependency chain. Fix this with making expedited grace period to always rely on kthread worker. The workqueue based implementation is essentially a duplicate anyway now that the per-node initialization is performed by per-node kthread workers. Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers. Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de> Reported-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Joel Fernandes <joel@joelfernandes.org> Suggested-by: Paul E. McKenney <paulmck@kernel.org> Suggested-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com> Signed-off-by: Frederic Weisbecker <frederic@kernel.org> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Diffstat (limited to 'kernel/rcu/tree_exp.h')
-rw-r--r--kernel/rcu/tree_exp.h73
1 files changed, 1 insertions, 72 deletions
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0318a8a062d5..6b83537480b1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -418,7 +418,6 @@ retry_ipi:
static void rcu_exp_sel_wait_wake(unsigned long s);
-#ifdef CONFIG_RCU_EXP_KTHREAD
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
struct rcu_exp_work *rewp =
@@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
}
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
-
- __sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_exp_worker_started(void)
-{
- return !!READ_ONCE(rcu_gp_wq);
-}
-
-static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
-{
- return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
- int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
- flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
- INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
- destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
@@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- bool use_worker;
unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
@@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void)
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
- use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
- rcu_exp_worker_started();
-
/* Is the state is such that the call is a grace period? */
if (rcu_blocking_is_gp()) {
// Note well that this code runs with !PREEMPT && !SMP.
@@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(!use_worker)) {
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
-
- if (likely(use_worker))
- synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);