From 1b04fa9900263b4e217ca2509fd778b32c2b4eb2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:31 +0100 Subject: rcu-tasks: Move RCU-tasks initialization to before early_initcall() PowerPC testing encountered boot failures due to RCU Tasks not being fully initialized until core_initcall() time. This commit therefore initializes RCU Tasks (along with Rude RCU and RCU Tasks Trace) just before early_initcall() time, thus allowing waiting on RCU Tasks grace periods from early_initcall() handlers. Link: https://lore.kernel.org/rcu/87eekfh80a.fsf@dja-thinkpad.axtens.net/ Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall") Tested-by: Daniel Axtens Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..73bbe792fe1e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +/* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; @@ -569,7 +569,6 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -core_initcall(rcu_spawn_tasks_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) @@ -697,7 +696,6 @@ static int __init rcu_spawn_tasks_rude_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } -core_initcall(rcu_spawn_tasks_rude_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) @@ -975,6 +973,11 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { + // During early boot when there is only the one boot CPU, there + // is no idle task for the other CPUs. Just return. + if (unlikely(t == NULL)) + return; + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; @@ -1200,7 +1203,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); return 0; } -core_initcall(rcu_spawn_tasks_trace_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) @@ -1229,6 +1231,21 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +void __init rcu_init_tasks_generic(void) +{ +#ifdef CONFIG_TASKS_RCU + rcu_spawn_tasks_kthread(); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + rcu_spawn_tasks_rude_kthread(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); +#endif +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} void show_rcu_tasks_gp_kthreads(void) {} -- cgit From 6bc335828056f3b301a3deadda782de4e8f0db08 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 3 Nov 2020 09:25:57 -0500 Subject: rcu/tree: Make rcu_do_batch count how many callbacks were executed The rcu_do_batch() function extracts the ready-to-invoke callbacks from the rcu_segcblist located in the ->cblist field of the current CPU's rcu_data structure. These callbacks are first moved to a local (unsegmented) rcu_cblist. The rcu_do_batch() function then uses this rcu_cblist's ->len field to count how many CBs it has invoked, but it does so by counting that field down from zero. Finally, this function negates the value in this ->len field (resulting in a positive number) and subtracts the result from the ->len field of the current CPU's ->cblist field. Except that it is sometimes necessary for rcu_do_batch() to stop invoking callbacks mid-stream, despite there being more ready to invoke, for example, if a high-priority task wakes up. In this case the remaining not-yet-invoked callbacks are requeued back onto the CPU's ->cblist, but remain in the ready-to-invoke segment of that list. As above, the negative of the local rcu_cblist's ->len field is still subtracted from the ->len field of the current CPU's ->cblist field. The design of counting down from 0 is confusing and error-prone, plus use of a positive count will make it easier to provide a uniform and consistent API to deal with the per-segment counts that are added later in this series. For example, rcu_segcblist_extract_done_cbs() can unconditionally populate the resulting unsegmented list's ->len field during extraction. This commit therefore explicitly counts how many callbacks were executed in rcu_do_batch() itself, counting up from zero, and then uses that to update the per-CPU segcb list's ->len field, without relying on the downcounting of rcl->len from zero. Reviewed-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 2 +- kernel/rcu/rcu_segcblist.h | 1 + kernel/rcu/tree.c | 11 +++++------ 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2d2a6b6b9dfb..bb246d8c6ef1 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -95,7 +95,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * This increase is fully ordered with respect to the callers accesses * both before and after. */ -static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU smp_mb__before_atomic(); /* Up to the caller! */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 492262bcb591..1d2d61406463 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -76,6 +76,7 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) } void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); void rcu_segcblist_offload(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..cc6f379c7ddf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2434,7 +2434,7 @@ static void rcu_do_batch(struct rcu_data *rdp) const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count; + long bl, count = 0; long pending, tlimit = 0; /* If no callbacks are ready, just return. */ @@ -2479,6 +2479,7 @@ static void rcu_do_batch(struct rcu_data *rdp) for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; + count++; debug_rcu_head_unqueue(rhp); rcu_lock_acquire(&rcu_callback_map); @@ -2492,15 +2493,14 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. - * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && !offloaded && + if (count >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ - if (likely((-rcl.len & 31) || local_clock() < tlimit)) + if (likely((count & 31) || local_clock() < tlimit)) continue; /* Exceeded the time limit, so leave. */ break; @@ -2517,7 +2517,6 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); - count = -rcl.len; rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2525,7 +2524,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rcu_segcblist_insert_count(&rdp->cblist, &rcl); + rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); -- cgit From 84109ab58590dc6c4e7eb36329fdc7ec121ed5a5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 20 Nov 2020 06:53:11 -0800 Subject: rcu: Record kvfree_call_rcu() call stack for KASAN This commit adds a call to kasan_record_aux_stack() in kvfree_call_rcu() in order to record the call stack of the code that caused the object to be freed. Please note that this function does not update the allocated/freed state, which is important because RCU readers might still be referencing this object. Acked-by: Dmitry Vyukov Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..2db736cbe342 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3498,6 +3498,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } + kasan_record_aux_stack(ptr); success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { run_page_cache_worker(krcp); -- cgit From 2341bc4a0311e4319ced6c2828bb19309dee74fd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Dec 2020 15:16:45 +0100 Subject: rcu: Make RCU_BOOST default on CONFIG_PREEMPT_RT On PREEMPT_RT kernels, RCU callbacks are deferred to the `rcuc' kthread. This can stall RCU grace periods due to lengthy preemption not only of RCU readers but also of 'rcuc' kthreads, either of which prevent grace periods from completing, which can in turn result in OOM. Because PREEMPT_RT kernels have more kthreads that can block grace periods, it is more important for such kernels to enable RCU_BOOST. This commit therefore makes RCU_BOOST the default on PREEMPT_RT. RCU_BOOST can still be manually disabled if need be. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cdc57b4f6d48..aa8cc8c977e7 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -188,8 +188,8 @@ config RCU_FAST_NO_HZ config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n + depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT + default y if PREEMPT_RT help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. -- cgit From 8b9a0ecc7ef5e1ed3afbc926de17399a37128c82 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 15 Dec 2020 15:16:46 +0100 Subject: rcu: Unconditionally use rcuc threads on PREEMPT_RT PREEMPT_RT systems have long used the rcutree.use_softirq kernel boot parameter to avoid use of RCU_SOFTIRQ handlers, which can disrupt real-time applications by invoking callbacks during return from interrupts that arrived while executing time-critical code. This kernel boot parameter instead runs RCU core processing in an 'rcuc' kthread, thus allowing the scheduler to do its job of avoiding disrupting time-critical code. This commit therefore disables the rcutree.use_softirq kernel boot parameter on PREEMPT_RT systems, thus forcing such systems to do RCU core processing in 'rcuc' kthreads. This approach has long been in use by users of the -rt patchset, and there have been no complaints. There is therefore no way for the system administrator to override this choice, at least without modifying and rebuilding the kernel. Signed-off-by: Scott Wood [bigeasy: Reword commit message] Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Update kernel-parameters.txt accordingly. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..d60903581300 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -100,8 +100,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = true; +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); -- cgit From 36221e109eb20ac111bc3bf3e8d5639aa457c7e0 Mon Sep 17 00:00:00 2001 From: Julia Cartwright Date: Tue, 15 Dec 2020 15:16:47 +0100 Subject: rcu: Enable rcu_normal_after_boot unconditionally for RT Expedited RCU grace periods send IPIs to all non-idle CPUs, and thus can disrupt time-critical code in real-time applications. However, there is a portion of boot-time processing (presumably before any real-time applications have started) where expedited RCU grace periods are the only option. And so it is that experience with the -rt patchset indicates that PREEMPT_RT systems should always set the rcupdate.rcu_normal_after_boot kernel boot parameter. This commit therefore makes the post-boot application environment safe for real-time applications by making PREEMPT_RT systems disable the rcupdate.rcu_normal_after_boot kernel boot parameter and acting as if this parameter had been set. This means that post-boot calls to synchronize_rcu_expedited() will be treated as if they were instead calls to synchronize_rcu(), thus preventing the IPIs, and thus avoiding disrupting real-time applications. Suggested-by: Luiz Capitulino Acked-by: Paul E. McKenney Signed-off-by: Julia Cartwright Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Update kernel-parameters.txt accordingly. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 39334d2d2b37..b95ae86c40a7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -56,8 +56,10 @@ #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -static int rcu_normal_after_boot; +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit From 74612a07b83fc46c2b2e6f71a541d55b024ebefc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Nov 2020 16:34:09 -0800 Subject: srcu: Make Tiny SRCU use multi-bit grace-period counter There is a need for a polling interface for SRCU grace periods. This polling needs to distinguish between an SRCU instance being idle on the one hand or in the middle of a grace period on the other. This commit therefore converts the Tiny SRCU srcu_struct structure's srcu_idx from a defacto boolean to a free-running counter, using the bottom bit to indicate that a grace period is in progress. The second-from-bottom bit is thus used as the index returned by srcu_read_lock(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Fix ->srcu_lock_nesting[] indexing per Neeraj Upadhyay. ] Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6208c1dae5c9..5598cf6f1edc 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -124,11 +124,12 @@ void srcu_drive_gp(struct work_struct *wp) ssp->srcu_cb_head = NULL; ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = ssp->srcu_idx; - WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + idx = (ssp->srcu_idx & 0x2) / 2; + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* Invoke the callbacks we removed above. */ while (lh) { -- cgit From 1a893c711a600ab57526619b56e6f6b7be00956e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 09:37:39 -0800 Subject: srcu: Provide internal interface to start a Tiny SRCU grace period There is a need for a polling interface for SRCU grace periods. This polling needs to initiate an SRCU grace period without having to queue (and manage) a callback. This commit therefore splits the Tiny SRCU call_srcu() function into callback-queuing and start-grace-period portions, with the latter in a new function named srcu_gp_start_if_needed(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5598cf6f1edc..3bac1db85a85 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -152,6 +152,16 @@ void srcu_drive_gp(struct work_struct *wp) } EXPORT_SYMBOL_GPL(srcu_drive_gp); +static void srcu_gp_start_if_needed(struct srcu_struct *ssp) +{ + if (!READ_ONCE(ssp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); + } +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -167,12 +177,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(ssp->srcu_gp_running)) { - if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); - else if (list_empty(&ssp->srcu_work.entry)) - list_add(&ssp->srcu_work.entry, &srcu_boot_list); - } + srcu_gp_start_if_needed(ssp); } EXPORT_SYMBOL_GPL(call_srcu); -- cgit From 29d2bb94a8a126ce80ffbb433b648b32fdea524e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 10:08:09 -0800 Subject: srcu: Provide internal interface to start a Tree SRCU grace period There is a need for a polling interface for SRCU grace periods. This polling needs to initiate an SRCU grace period without having to queue (and manage) a callback. This commit therefore splits the Tree SRCU __call_srcu() function into callback-initialization and queuing/start-grace-period portions, with the latter in a new function named srcu_gp_start_if_needed(). This function may be passed a NULL callback pointer, in which case it will refrain from queuing anything. Why have the new function mess with queuing? Locking considerations, of course! Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 66 +++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..9a7b6505fdc8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -807,6 +807,42 @@ static void srcu_leak_callback(struct rcu_head *rhp) { } +/* + * Start an SRCU grace period, and also queue the callback if non-NULL. + */ +static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rhp, bool do_norm) +{ + unsigned long flags; + int idx; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + idx = srcu_read_lock(ssp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + if (needgp) + srcu_funnel_gp_start(ssp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); +} + /* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating @@ -838,13 +874,6 @@ static void srcu_leak_callback(struct rcu_head *rhp) static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { - unsigned long flags; - int idx; - bool needexp = false; - bool needgp = false; - unsigned long s; - struct srcu_data *sdp; - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ @@ -853,28 +882,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); - if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { - sdp->srcu_gp_seq_needed = s; - needgp = true; - } - if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { - sdp->srcu_gp_seq_needed_exp = s; - needexp = true; - } - spin_unlock_irqrestore_rcu_node(sdp, flags); - if (needgp) - srcu_funnel_gp_start(ssp, sdp, s, do_norm); - else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); - srcu_read_unlock(ssp, idx); + srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** -- cgit From 8b5bd67cf6422b63ee100d76d8de8960ca2df7f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 12:54:48 -0800 Subject: srcu: Provide polling interfaces for Tiny SRCU grace periods There is a need for a polling interface for SRCU grace periods, so this commit supplies get_state_synchronize_srcu(), start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this purpose. The first can be used if future grace periods are inevitable (perhaps due to a later call_srcu() invocation), the second if future grace periods might not otherwise happen, and the third to check if a grace period has elapsed since the corresponding call to either of the first two. As with get_state_synchronize_rcu() and cond_synchronize_rcu(), the return value from either get_state_synchronize_srcu() or start_poll_synchronize_srcu() must be passed in to a later call to poll_state_synchronize_srcu(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ] [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/ Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3bac1db85a85..26344dc6483b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_gp_running = false; ssp->srcu_gp_waiting = false; ssp->srcu_idx = 0; + ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; @@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); + WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max); + WARN_ON(ssp->srcu_idx & 0x1); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) + if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -147,13 +150,19 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (READ_ONCE(ssp->srcu_cb_head)) + if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { + unsigned short cookie; + + cookie = get_state_synchronize_srcu(ssp); + if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + return; + WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) schedule_work(&ssp->srcu_work); @@ -196,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + barrier(); + ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; + barrier(); + return ret & USHRT_MAX; +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/* + * start_poll_synchronize_srcu - Provide cookie and start grace period + * + * The difference between this and get_state_synchronize_srcu() is that + * this function ensures that the poll_state_synchronize_srcu() will + * eventually return the value true. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret = get_state_synchronize_srcu(ssp); + + srcu_gp_start_if_needed(ssp); + return ret; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/* + * poll_state_synchronize_srcu - Has cookie's grace period ended? + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + + barrier(); + return ret; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { -- cgit From 5358c9fa54b09b5d3d7811b033aa0838c1bbaaf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 17:31:55 -0800 Subject: srcu: Provide polling interfaces for Tree SRCU grace periods There is a need for a polling interface for SRCU grace periods, so this commit supplies get_state_synchronize_srcu(), start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this purpose. The first can be used if future grace periods are inevitable (perhaps due to a later call_srcu() invocation), the second if future grace periods might not otherwise happen, and the third to check if a grace period has elapsed since the corresponding call to either of the first two. As with get_state_synchronize_rcu() and cond_synchronize_rcu(), the return value from either get_state_synchronize_srcu() or start_poll_synchronize_srcu() must be passed in to a later call to poll_state_synchronize_srcu(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ] [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/ Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9a7b6505fdc8..c5d0c036fab5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -810,7 +810,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) /* * Start an SRCU grace period, and also queue the callback if non-NULL. */ -static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rhp, bool do_norm) +static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, + struct rcu_head *rhp, bool do_norm) { unsigned long flags; int idx; @@ -819,10 +820,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh unsigned long s; struct srcu_data *sdp; + check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + if (rhp) + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -841,6 +844,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh else if (needexp) srcu_funnel_exp_start(ssp, sdp->mynode, s); srcu_read_unlock(ssp, idx); + return s; } /* @@ -874,7 +878,6 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -882,7 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - srcu_gp_start_if_needed(ssp, rhp, do_norm); + (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** @@ -1011,6 +1014,62 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. It is the caller's responsibility + * to make sure that grace period happens, for example, by invoking + * call_srcu() after return from get_state_synchronize_srcu(). + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + // Any prior manipulation of SRCU-protected data must happen + // before the load from ->srcu_gp_seq. + smp_mb(); + return rcu_seq_snap(&ssp->srcu_gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/** + * start_poll_synchronize_srcu - Provide cookie and start grace period + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), + * this function also ensures that any needed SRCU grace period will be + * started. This convenience does come at a cost in terms of CPU overhead. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + return srcu_gp_start_if_needed(ssp, NULL, true); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/** + * poll_state_synchronize_srcu - Has cookie's grace period ended? + * @ssp: srcu_struct to provide cookie for. + * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu(). + * + * This function takes the cookie that was returned from either + * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and + * returns @true if an SRCU grace period elapsed since the time that the + * cookie was created. + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + return false; + // Ensure that the end of the SRCU grace period happens before + // any subsequent code that the caller might execute. + smp_mb(); // ^^^ + return true; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* * Callback function for srcu_barrier() use. */ -- cgit From 4e7ccfae52b39aeee93ed39d4184d50ea201fbef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 20:33:38 -0800 Subject: srcu: Add comment explaining cookie overflow/wrap This commit adds to the poll_state_synchronize_srcu() header comment describing the issues surrounding SRCU cookie overflow/wrap for the different kernel configurations. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c5d0c036fab5..119938d381e2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1058,6 +1058,21 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and * returns @true if an SRCU grace period elapsed since the time that the * cookie was created. + * + * Because cookies are finite in size, wrapping/overflow is possible. + * This is more pronounced on 32-bit systems where cookies are 32 bits, + * where in theory wrapping could happen in about 14 hours assuming + * 25-microsecond expedited SRCU grace periods. However, a more likely + * overflow lower bound is on the order of 24 days in the case of + * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit + * system requires geologic timespans, as in more than seven million years + * even for expedited SRCU grace periods. + * + * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems + * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses + * a 16-bit cookie, which rcutorture routinely wraps in a matter of a + * few minutes. If this proves to be a problem, this counter will be + * expanded to the same size as for Tree SRCU. */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { -- cgit From fd56f64b4e3b9c53fbb12ef74c6f1f5fde4cc1c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 20:14:27 -0800 Subject: rcutorture: Prepare for ->start_gp_poll and ->poll_gp_state The new get_state_synchronize_srcu(), start_poll_synchronize_srcu() and poll_state_synchronize_srcu() functions need to be tested, and so this commit prepares by renaming the rcu_torture_ops field ->get_state to ->get_gp_state in order to be consistent with the upcoming ->start_gp_poll and ->poll_gp_state fields. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 528ed10b78fd..bcea23ceceb6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -311,7 +311,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); - unsigned long (*get_state)(void); + unsigned long (*get_gp_state)(void); void (*cond_sync)(unsigned long oldstate); call_rcu_func_t call; void (*cb_barrier)(void); @@ -461,7 +461,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, - .get_state = get_state_synchronize_rcu, + .get_gp_state = get_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, @@ -1050,10 +1050,10 @@ rcu_torture_writer(void *arg) /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { + if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); - } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } if (gp_exp1 && cur_ops->exp_sync) { @@ -1119,7 +1119,7 @@ rcu_torture_writer(void *arg) break; case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; - gp_snap = cur_ops->get_state(); + gp_snap = cur_ops->get_gp_state(); i = torture_random(&rand) % 16; if (i != 0) schedule_timeout_interruptible(i); -- cgit From 0fd0548db13346bfb3bb23860ab270a32d6e385a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 20:43:59 -0800 Subject: rcutorture: Add writer-side tests of polling grace-period API This commit adds writer-side testing of the polling grace-period API. One test verifies that the polling API sees a grace period caused by some other mechanism. Another test verifies that using the polling API to wait for a grace period does not result in too-short grace periods. A third test verifies that the polling API does not report completion within a read-side critical section. A fourth and final test verifies that the polling API does report completion given an intervening grace period. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 79 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcea23ceceb6..78ba95dfe05d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -85,6 +85,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -183,9 +184,11 @@ static int rcu_torture_writer_state; #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 #define RTWS_COND_SYNC 6 -#define RTWS_SYNC 7 -#define RTWS_STUTTER 8 -#define RTWS_STOPPING 9 +#define RTWS_POLL_GET 7 +#define RTWS_POLL_WAIT 8 +#define RTWS_SYNC 9 +#define RTWS_STUTTER 10 +#define RTWS_STOPPING 11 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -194,6 +197,8 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_EXP_SYNC", "RTWS_COND_GET", "RTWS_COND_SYNC", + "RTWS_POLL_GET", + "RTWS_POLL_WAIT", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -312,6 +317,8 @@ struct rcu_torture_ops { void (*sync)(void); void (*exp_sync)(void); unsigned long (*get_gp_state)(void); + unsigned long (*start_gp_poll)(void); + bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); call_rcu_func_t call; void (*cb_barrier)(void); @@ -570,6 +577,21 @@ static void srcu_torture_synchronize(void) synchronize_srcu(srcu_ctlp); } +static unsigned long srcu_torture_get_gp_state(void) +{ + return get_state_synchronize_srcu(srcu_ctlp); +} + +static unsigned long srcu_torture_start_gp_poll(void) +{ + return start_poll_synchronize_srcu(srcu_ctlp); +} + +static bool srcu_torture_poll_gp_state(unsigned long oldstate) +{ + return poll_state_synchronize_srcu(srcu_ctlp, oldstate); +} + static void srcu_torture_call(struct rcu_head *head, rcu_callback_t func) { @@ -601,6 +623,9 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1027,18 +1052,20 @@ static int rcu_torture_writer(void *arg) { bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_sync1 = gp_sync; + bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; int i; + int idx; int oldnice = task_nice(current); struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); bool stutter_waited; int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_SYNC }; + RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); @@ -1048,8 +1075,8 @@ rcu_torture_writer(void *arg) torture_type, cur_ops->name); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); @@ -1068,6 +1095,12 @@ rcu_torture_writer(void *arg) } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); } + if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) { + synctype[nsynctypes++] = RTWS_POLL_GET; + pr_info("%s: Testing polling GPs.\n", __func__); + } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { + pr_alert("%s: gp_poll without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1107,6 +1140,18 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { + idx = cur_ops->readlock(); + cookie = cur_ops->get_gp_state(); + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + cur_ops->poll_gp_state(cookie), + "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1128,6 +1173,18 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET: + rcu_torture_writer_state = RTWS_POLL_GET; + gp_snap = cur_ops->start_gp_poll(); + rcu_torture_writer_state = RTWS_POLL_WAIT; + while (!cur_ops->poll_gp_state(gp_snap)) { + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + } + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; cur_ops->sync(); @@ -1137,6 +1194,14 @@ rcu_torture_writer(void *arg) WARN_ON_ONCE(1); break; } + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); } WRITE_ONCE(rcu_torture_current_version, rcu_torture_current_version + 1); -- cgit From bc480a6354ef2e15c26c3bdbd0db647026e788a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 12:45:57 -0800 Subject: rcutorture: Add reader-side tests of polling grace-period API This commit adds reader-side testing of the polling grace-period API. This testing verifies that a cookie obtained in an SRCU read-side critical section does not get a true return from poll_state_synchronize_srcu() within that same critical section. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 78ba95dfe05d..96d55f05e344 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1429,6 +1429,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { + unsigned long cookie; int i; unsigned long started; unsigned long completed; @@ -1444,6 +1445,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1480,6 +1483,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); // This next splat is expected behavior if leakpointer, especially -- cgit From 00504537f44422a99d97f615f2b3ee17cfba194d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Oct 2020 15:08:57 -0700 Subject: rcutorture: Add testing for RCU's global memory ordering RCU guarantees that anything seen by a given reader will also be seen after any grace period that must wait on that reader. This is very likely to hold based on inspection, but the advantage of having rcutorture do the inspecting is that rcutorture doesn't mind inspecting frequently and often. This commit therefore adds code to test RCU's global memory ordering. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 96d55f05e344..338e1182b4b5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -143,11 +143,22 @@ static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 +// Mailbox-like structure to check RCU global memory ordering. +struct rcu_torture_reader_check { + unsigned long rtc_myloops; + int rtc_chkrdr; + unsigned long rtc_chkloops; + int rtc_ready; + struct rcu_torture_reader_check *rtc_assigner; +} ____cacheline_internodealigned_in_smp; + +// Update-side data structure used to check RCU readers. struct rcu_torture { struct rcu_head rtort_rcu; int rtort_pipe_count; struct list_head rtort_free; int rtort_mbtest; + struct rcu_torture_reader_check *rtort_chkp; }; static LIST_HEAD(rcu_torture_freelist); @@ -158,10 +169,13 @@ static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static struct rcu_torture_reader_check *rcu_torture_reader_mbchk; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_mbchk_fail; +static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; @@ -393,7 +407,12 @@ static bool rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; + struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp); + if (rtrcp) { + WRITE_ONCE(rp->rtort_chkp, NULL); + smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). + } i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -1292,6 +1311,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +// Set up and carry out testing of RCU's global memory ordering +static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, + struct torture_random_state *trsp) +{ + unsigned long loops; + int noc = num_online_cpus(); + int rdrchked; + int rdrchker; + struct rcu_torture_reader_check *rtrcp; // Me. + struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking. + struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked. + struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me. + + if (myid < 0) + return; // Don't try this from timer handlers. + + // Increment my counter. + rtrcp = &rcu_torture_reader_mbchk[myid]; + WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1); + + // Attempt to assign someone else some checking work. + rdrchked = torture_random(trsp) % nrealreaders; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + rdrchker = torture_random(trsp) % nrealreaders; + rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker]; + if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker && + smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below. + !READ_ONCE(rtp->rtort_chkp) && + !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below. + rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops); + WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0); + rtrcp->rtc_chkrdr = rdrchked; + WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends. + if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) || + cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp)) + (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out. + } + + // If assigned some completed work, do it! + rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner); + if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready)) + return; // No work or work not yet ready. + rdrchked = rtrcp_assigner->rtc_chkrdr; + if (WARN_ON_ONCE(rdrchked < 0)) + return; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + loops = READ_ONCE(rtrcp_chked->rtc_myloops); + atomic_inc(&n_rcu_torture_mbchk_tries); + if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops)) + atomic_inc(&n_rcu_torture_mbchk_fail); + rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2; + rtrcp_assigner->rtc_ready = 0; + smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work. + smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1427,7 +1502,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, * no data to read. Can be invoked both from process context and * from a timer handler. */ -static bool rcu_torture_one_read(struct torture_random_state *trsp) +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { unsigned long cookie; int i; @@ -1462,6 +1537,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); + rcu_torture_reader_do_mbchk(myid, p, trsp); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); @@ -1518,7 +1594,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); static void rcu_torture_timer(struct timer_list *unused) { atomic_long_inc(&n_rcu_torture_timers); - (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { @@ -1554,7 +1630,7 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand) && !torture_must_stop()) + if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); if (time_after(jiffies, lastsleep) && !torture_must_stop()) { schedule_timeout_interruptible(1); @@ -1614,8 +1690,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); @@ -1632,12 +1709,14 @@ rcu_torture_stats_print(void) pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || + atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio @@ -2467,7 +2546,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Minimize time between reading and exiting. while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); - (void)rcu_torture_one_read(trsp); + (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -2582,6 +2661,8 @@ rcu_torture_cleanup(void) kfree(reader_tasks); reader_tasks = NULL; } + kfree(rcu_torture_reader_mbchk); + rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) @@ -2785,6 +2866,8 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_alloc_fail, 0); atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_mbchk_fail, 0); + atomic_set(&n_rcu_torture_mbchk_tries, 0); atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; @@ -2826,12 +2909,15 @@ rcu_torture_init(void) } reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); - if (reader_tasks == NULL) { + rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), + GFP_KERNEL); + if (!reader_tasks || !rcu_torture_reader_mbchk) { VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { + rcu_torture_reader_mbchk[i].rtc_chkrdr = -1; firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); if (firsterr) -- cgit From f3ea978b712f768a02137e867aced5bfdcea670e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Nov 2020 10:12:05 -0800 Subject: scftorture: Add debug output for wrong-CPU warning This commit adds the desired CPU, the actual CPU, and nr_cpu_ids to the wrong-CPU warning in scftorture_invoker(), the better to help with debugging. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d55a9f8cda3d..2377cbb32474 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -398,6 +398,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra static int scftorture_invoker(void *arg) { int cpu; + int curcpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; bool was_offline = false; @@ -412,7 +413,10 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + curcpu = smp_processor_id(); + WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, + "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", + __func__, scfp->cpu, curcpu, nr_cpu_ids); if (!atomic_dec_return(&n_started)) while (atomic_read_acquire(&n_started)) { -- cgit From b08ea1de6a8f8929c7dafd6f708799365fa90c11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Nov 2020 13:52:31 -0800 Subject: rcu: Mark obtuse portion of stall warning as internal debug There is a rather obtuse string that can be printed as part of an expedited RCU CPU stall-warning message that starts with "blocking rcu_node structures". Under normal conditions, most of this message is just repeating the list of CPUs blocking the current expedited grace period, but in a manner that is rather difficult to read. This commit therefore marks this message as "(internal RCU debug)" in an effort to give people the option of avoiding wasting time attempting to extract nonexistent additional meaning from this portion of the message. Reported-by: Jonathan Lemon Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8760b6ead770..6c6ff06d4ae6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -545,7 +545,7 @@ static void synchronize_rcu_expedited_wait(void) data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { - pr_err("blocking rcu_node structures:"); + pr_err("blocking rcu_node structures (internal RCU debug):"); rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ -- cgit From 243027a3c80564bf96e40437ffac46efb9f5f2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Nov 2020 16:08:01 -0800 Subject: rcu: For RCU grace-period kthread starvation, dump last CPU it ran on When the RCU CPU stall-warning code detects that the RCU grace-period kthread is being starved, it dumps that kthread's stack. This can sometimes be useful, but it is also useful to know what is running on the CPU that this kthread is attempting to run on. This commit therefore adds a stack trace of this CPU in order to help track down whatever it is that might be preventing RCU's grace-period kthread from running. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 70d48c52fabc..35c1355d44e7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -449,20 +449,27 @@ static void print_cpu_stall_info(int cpu) /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { + int cpu; struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; if (rcu_is_gp_kthread_starving(&j)) { + cpu = gpk ? task_cpu(gpk) : -1; pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + gpk ? gpk->state : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); + if (cpu >= 0) { + pr_err("Stack dump where RCU grace-period kthread last ran:\n"); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + } wake_up_process(gpk); } } -- cgit From 725969ac11d7fa50aa701321daa600ce421fc21b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Nov 2020 12:19:47 -0800 Subject: rcu: Do not NMI offline CPUs Currently, RCU CPU stall warning messages will NMI whatever CPU looks like it is blocking either the current grace period or the grace-period kthread. This can produce confusing output if the target CPU is offline. This commit therefore checks for offline CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 35c1355d44e7..29cf096b5d20 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -333,9 +333,12 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + if (cpu_is_offline(cpu)) + pr_err("Offline CPU %d blocking current GP.\n", cpu); + else if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -466,9 +469,13 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); if (cpu >= 0) { - pr_err("Stack dump where RCU grace-period kthread last ran:\n"); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + } } wake_up_process(gpk); } -- cgit From 1b7af295541d75535374325fd617944534853919 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 10:22:24 -0700 Subject: sched/core: Allow try_invoke_on_locked_down_task() with irqs disabled The try_invoke_on_locked_down_task() function currently requires that interrupts be enabled, but it is called with interrupts disabled from rcu_print_task_stall(), resulting in an "IRQs not enabled as expected" diagnostic. This commit therefore updates try_invoke_on_locked_down_task() to use raw_spin_lock_irqsave() instead of raw_spin_lock_irq(), thus allowing use from either context. Link: https://lore.kernel.org/lkml/000000000000903d5805ab908fc4@google.com/ Link: https://lore.kernel.org/lkml/20200928075729.GC2611@hirez.programming.kicks-ass.net/ Reported-by: syzbot+cb3b69ae80afd6535b0e@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7e453492cff..f768bb0df74b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2989,7 +2989,7 @@ out: /** * try_invoke_on_locked_down_task - Invoke a function on task in fixed state - * @p: Process for which the function is to be invoked. + * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. * @@ -3007,12 +3007,11 @@ out: */ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) { - bool ret = false; struct rq_flags rf; + bool ret = false; struct rq *rq; - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (p->on_rq) { rq = __task_rq_lock(p, &rf); if (task_rq(p) == rq) @@ -3029,7 +3028,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t ret = func(p, arg); } } - raw_spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; } -- cgit From c5586e32dfe258925c5dbb599bea3eadf34e79c1 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 16:24:03 +0800 Subject: locking: Remove duplicate include of percpu-rwsem.h This commit removes an unnecessary #include. Signed-off-by: Wang Qing Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 62d215b2e39f..af99e9ca285a 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,7 +27,6 @@ #include #include #include -#include #include MODULE_LICENSE("GPL"); -- cgit From a649d25dcc671a33b9cc3176411920fdc5fbd98e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Nov 2020 10:13:06 -0800 Subject: rcu: Add lockdep_assert_irqs_disabled() to rcu_sched_clock_irq() and callees This commit adds a number of lockdep_assert_irqs_disabled() calls to rcu_sched_clock_irq() and a number of the functions that it calls. The point of this is to help track down a situation where lockdep appears to be insisting that interrupts are enabled within these functions, which should only ever be invoked from the scheduling-clock interrupt handler. Link: https://lore.kernel.org/lkml/20201111133813.GA81547@elver.google.com/ Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ kernel/rcu/tree_plugin.h | 1 + kernel/rcu/tree_stall.h | 8 ++++++++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd04b09b84b3..f70634f7c3aa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2553,6 +2553,7 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); + lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { @@ -2566,6 +2567,7 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -3690,6 +3692,8 @@ static int rcu_pending(int user) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + lockdep_assert_irqs_disabled(); + /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fd8a52e9a887..cb76e70da8ca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; + lockdep_assert_irqs_disabled(); if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ca21d28a0f98..4024dcc78aac 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -260,6 +260,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *t; struct task_struct *ts[8]; + lockdep_assert_irqs_disabled(); if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -284,6 +285,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + lockdep_assert_irqs_disabled(); put_task_struct(t); ndetected++; } @@ -472,6 +474,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) struct rcu_node *rnp; long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -493,6 +497,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) } } ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. + lockdep_assert_irqs_disabled(); } for_each_possible_cpu(cpu) @@ -538,6 +543,8 @@ static void print_cpu_stall(unsigned long gps) struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -592,6 +599,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; + lockdep_assert_irqs_disabled(); if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; -- cgit From 7dffe01765d9309b8bd5505503933ec0ec53d192 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Nov 2020 13:30:33 -0800 Subject: rcu: Add lockdep_assert_irqs_disabled() to raw_spin_unlock_rcu_node() macros This commit adds a lockdep_assert_irqs_disabled() call to the helper macros that release the rcu_node structure's ->lock, namely to raw_spin_unlock_rcu_node(), raw_spin_unlock_irq_rcu_node() and raw_spin_unlock_irqrestore_rcu_node(). The point of this is to help track down a situation where lockdep appears to be insisting that interrupts are enabled while holding an rcu_node structure's ->lock. Link: https://lore.kernel.org/lkml/20201111133813.GA81547@elver.google.com/ Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e01cba5e4b52..839f5be65244 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -378,7 +378,11 @@ do { \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) +#define raw_spin_unlock_rcu_node(p) \ +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irq_rcu_node(p) \ do { \ @@ -387,7 +391,10 @@ do { \ } while (0) #define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ @@ -396,7 +403,10 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \ +} while (0) #define raw_spin_trylock_rcu_node(p) \ ({ \ -- cgit From bfba7ed084f8ab0269a5a1d2f51b07865456c334 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:32 +0100 Subject: rcu-tasks: Add RCU-tasks self tests This commit adds self tests for early-boot use of RCU-tasks grace periods. It tests all three variants (Rude, Tasks, and Tasks Trace) and covers both synchronous (e.g., synchronize_rcu_tasks()) and asynchronous (e.g., call_rcu_tasks()) grace-period APIs. Self-tests are run only in kernels built with CONFIG_PROVE_RCU=y. Signed-off-by: Uladzislau Rezki (Sony) [ paulmck: Handle CONFIG_PROVE_RCU=n and identify test cases' callbacks. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 73bbe792fe1e..74767d365752 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1231,6 +1231,82 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +#ifdef CONFIG_PROVE_RCU +struct rcu_tasks_test_desc { + struct rcu_head rh; + const char *name; + bool notrun; +}; + +static struct rcu_tasks_test_desc tests[] = { + { + .name = "call_rcu_tasks()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + }, + { + .name = "call_rcu_tasks_rude()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + }, + { + .name = "call_rcu_tasks_trace()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + } +}; + +static void test_rcu_tasks_callback(struct rcu_head *rhp) +{ + struct rcu_tasks_test_desc *rttd = + container_of(rhp, struct rcu_tasks_test_desc, rh); + + pr_info("Callback from %s invoked.\n", rttd->name); + + rttd->notrun = true; +} + +static void rcu_tasks_initiate_self_tests(void) +{ + pr_info("Running RCU-tasks wait API self tests\n"); +#ifdef CONFIG_TASKS_RCU + synchronize_rcu_tasks(); + call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + synchronize_rcu_tasks_rude(); + call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + synchronize_rcu_tasks_trace(); + call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); +#endif +} + +static int rcu_tasks_verify_self_tests(void) +{ + int ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (!tests[i].notrun) { // still hanging. + pr_err("%s has been failed.\n", tests[i].name); + ret = -1; + } + } + + if (ret) + WARN_ON(1); + + return ret; +} +late_initcall(rcu_tasks_verify_self_tests); +#else /* #ifdef CONFIG_PROVE_RCU */ +static void rcu_tasks_initiate_self_tests(void) { } +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + void __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU @@ -1244,6 +1320,9 @@ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif + + // Run the self-tests. + rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -- cgit From c26165efac41bce0c7764262b21f5897e771f34f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Dec 2020 21:00:18 -0800 Subject: rcu: Make TASKS_TRACE_RCU select IRQ_WORK Tasks Trace RCU uses irq_work_queue() to safely awaken its grace-period kthread, so this commit therefore causes the TASKS_TRACE_RCU Kconfig option select the IRQ_WORK Kconfig option. Reported-by: kernel test robot Acked-by: Randy Dunlap # build-tested Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b71e21f73c40..84dfa8dae1b2 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -95,6 +95,7 @@ config TASKS_RUDE_RCU config TASKS_TRACE_RCU def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses explicit rcu_read_lock_trace() read-side markers, and allows -- cgit From c2e13112e830c06825339cbadf0b3bc2bdb9a716 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 3 Nov 2020 09:26:03 -0500 Subject: rcu/segcblist: Add additional comments to explain smp_mb() One counter-intuitive property of RCU is the fact that full memory barriers are needed both before and after updates to the full (non-segmented) length. This patch therefore helps to assist the reader's intuition by adding appropriate comments. [ paulmck: Wordsmithing. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 68 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index bb246d8c6ef1..3cff8003f707 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -94,17 +94,77 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * field to disagree with the actual number of callbacks on the structure. * This increase is fully ordered with respect to the callers accesses * both before and after. + * + * So why on earth is a memory barrier required both before and after + * the update to the ->len field??? + * + * The reason is that rcu_barrier() locklessly samples each CPU's ->len + * field, and if a given CPU's field is zero, avoids IPIing that CPU. + * This can of course race with both queuing and invoking of callbacks. + * Failing to correctly handle either of these races could result in + * rcu_barrier() failing to IPI a CPU that actually had callbacks queued + * which rcu_barrier() was obligated to wait on. And if rcu_barrier() + * failed to wait on such a callback, unloading certain kernel modules + * would result in calls to functions whose code was no longer present in + * the kernel, for but one example. + * + * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully + * ordered with respect with both list modifications and the rcu_barrier(). + * + * The queuing case is CASE 1 and the invoking case is CASE 2. + * + * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes + * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field + * will transition from 0->1, which is one of the transitions that must + * be handled carefully. Without the full memory barriers after the ->len + * update and at the beginning of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * call_rcu(). + * rcu_barrier() sees ->len as 0. + * set ->len = 1. + * rcu_barrier() does nothing. + * module is unloaded. + * callback invokes unloaded function! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 will + * have unambiguously preceded the return from the racing call_rcu(), which + * means that this call_rcu() invocation is OK to not wait on. After all, + * you are supposed to make sure that any problematic call_rcu() invocations + * happen before the rcu_barrier(). + * + * + * CASE 2: Suppose that CPU 0 is invoking its last callback just as + * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from + * 1->0, which is one of the transitions that must be handled carefully. + * Without the full memory barriers before the ->len update and at the + * end of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * start invoking last callback + * set ->len = 0 (reordered) + * rcu_barrier() sees ->len as 0 + * rcu_barrier() does nothing. + * module is unloaded + * callback executing after unloaded! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 + * will be fully ordered after the completion of the callback function, + * so that the module unloading operation is completely safe. + * */ void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU - smp_mb__before_atomic(); /* Up to the caller! */ + smp_mb__before_atomic(); // Read header comment above. atomic_long_add(v, &rsclp->len); - smp_mb__after_atomic(); /* Up to the caller! */ + smp_mb__after_atomic(); // Read header comment above. #else - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. WRITE_ONCE(rsclp->len, rsclp->len + v); - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. #endif } -- cgit From ae5c2341ed3987bd434ed495bd4f3d8b2bc3e623 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 23 Sep 2020 11:22:09 -0400 Subject: rcu/segcblist: Add counters to segcblist datastructure Add counting of segment lengths of segmented callback list. This will be useful for a number of things such as knowing how big the ready-to-execute segment have gotten. The immediate benefit is ability to trace how the callbacks in the segmented callback list change. Also this patch remove hacks related to using donecbs's ->len field as a temporary variable to save the segmented callback list's length. This cannot be done anymore and is not needed. Also fix SRCU: The negative counting of the unsegmented list cannot be used to adjust the segmented one. To fix this, sample the unsegmented length in advance, and use it after CB execution to adjust the segmented list's length. Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 120 +++++++++++++++++++++++++++++---------------- kernel/rcu/rcu_segcblist.h | 2 - kernel/rcu/srcutree.c | 5 +- 3 files changed, 81 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 3cff8003f707..777780447887 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -7,10 +7,10 @@ * Authors: Paul E. McKenney */ -#include -#include +#include #include -#include +#include +#include #include "rcu_segcblist.h" @@ -88,6 +88,46 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) #endif } +/* Get the length of a segment of the rcu_segcblist structure. */ +static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +{ + return READ_ONCE(rsclp->seglen[seg]); +} + +/* Set the length of a segment of the rcu_segcblist structure. */ +static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], v); +} + +/* Increase the numeric length of a segment by a specified amount. */ +static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); +} + +/* Move from's segment length to to's segment. */ +static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to) +{ + long len; + + if (from == to) + return; + + len = rcu_segcblist_get_seglen(rsclp, from); + if (!len) + return; + + rcu_segcblist_add_seglen(rsclp, to, len); + rcu_segcblist_set_seglen(rsclp, from, 0); +} + +/* Increment segment's length. */ +static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg) +{ + rcu_segcblist_add_seglen(rsclp, seg, 1); +} + /* * Increase the numeric length of an rcu_segcblist structure by the * specified amount, which can be negative. This can cause the ->len @@ -179,26 +219,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) rcu_segcblist_add_len(rsclp, 1); } -/* - * Exchange the numeric length of the specified rcu_segcblist structure - * with the specified value. This can cause the ->len field to disagree - * with the actual number of callbacks on the structure. This exchange is - * fully ordered with respect to the callers accesses both before and after. - */ -static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) -{ -#ifdef CONFIG_RCU_NOCB_CPU - return atomic_long_xchg(&rsclp->len, v); -#else - long ret = rsclp->len; - - smp_mb(); /* Up to the caller! */ - WRITE_ONCE(rsclp->len, v); - smp_mb(); /* Up to the caller! */ - return ret; -#endif -} - /* * Initialize an rcu_segcblist structure. */ @@ -209,8 +229,10 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { rsclp->tails[i] = &rsclp->head; + rcu_segcblist_set_seglen(rsclp, i, 0); + } rcu_segcblist_set_len(rsclp, 0); rsclp->enabled = 1; } @@ -306,6 +328,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, { rcu_segcblist_inc_len(rsclp); smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); @@ -334,27 +357,13 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; + rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len = rcu_segcblist_xchg_len(rsclp, 0); -} - /* * Extract only those callbacks ready to be invoked from the specified * rcu_segcblist structure and place them in the specified rcu_cblist @@ -367,6 +376,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); @@ -374,6 +384,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) WRITE_ONCE(rsclp->tails[i], &rsclp->head); + rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0); } /* @@ -390,11 +401,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); + rcu_segcblist_set_seglen(rsclp, i, 0); + } } /* @@ -405,7 +420,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len = 0; } /* @@ -419,6 +433,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ + rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) @@ -439,6 +454,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ + + rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len); WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); } @@ -463,6 +480,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL); } /* If no callbacks moved, nothing more need be done. */ @@ -483,6 +501,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, j); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -504,7 +523,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) */ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) { - int i; + int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) @@ -547,6 +566,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; + /* Accounting: everything below i is about to get merged into i. */ + for (j = i + 1; j <= RCU_NEXT_TAIL; j++) + rcu_segcblist_move_seglen(rsclp, j, i); + /* * Merge all later callbacks, including newly arrived callbacks, * into the segment located by the for-loop above. Assign "seq" @@ -574,13 +597,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_cblist donecbs; struct rcu_cblist pendcbs; + lockdep_assert_cpus_held(); + rcu_cblist_init(&donecbs); rcu_cblist_init(&pendcbs); - rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + + /* + * No need smp_mb() before setting length to 0, because CPU hotplug + * lock excludes rcu_barrier. + */ + rcu_segcblist_set_len(src_rsclp, 0); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_count(dst_rsclp, &pendcbs); rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 1d2d61406463..cd35c9faaf51 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,8 +89,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, struct rcu_head *rhp); -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..79b7081143a7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1160,6 +1160,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ static void srcu_invoke_callbacks(struct work_struct *work) { + long len; bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; @@ -1182,6 +1183,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + len = ready_cbs.len; spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { @@ -1190,13 +1192,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp->func(rhp); local_bh_enable(); } + WARN_ON_ONCE(ready_cbs.len); /* * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ spin_lock_irq_rcu_node(sdp); - rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; -- cgit From 68804cf1c905ce227e4e1d0bc252c216811c59fd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 14 Oct 2020 18:21:53 -0400 Subject: rcu/tree: segcblist: Remove redundant smp_mb()s The full memory barriers in rcu_segcblist_enqueue() and in rcu_do_batch() are not needed because rcu_segcblist_add_len(), and thus also rcu_segcblist_inc_len(), already includes a memory barrier *before* and *after* the length of the list is updated. This commit therefore removes these redundant smp_mb() invocations. Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 1 - kernel/rcu/tree.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 777780447887..1e80a0a9036a 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -327,7 +327,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cc6f379c7ddf..b0fb654cba80 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2523,7 +2523,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); - smp_mb(); /* List handling before counting for rcu_barrier(). */ rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ -- cgit From 3afe7fa535491ecd0382c3968dc2349602bff8a2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 14 Nov 2020 14:31:32 -0500 Subject: rcu/trace: Add tracing for how segcb list changes This commit adds tracing to track how the segcb list changes before/after acceleration, during queuing and during dequeuing. This tracing helped discover an optimization that avoided needless GP requests when no callbacks were accelerated. The tracing overhead is minimal as each segment's length is now stored in the respective segment. Reviewed-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0fb654cba80..6bf269c91393 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1495,6 +1495,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc")); + /* * Callbacks are often registered with incomplete grace-period * information. Something about the fact that getting exact @@ -1515,6 +1517,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) else trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc")); + return ret; } @@ -2471,11 +2475,14 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; @@ -2987,6 +2994,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); + /* Go handle any RCU core processing required. */ if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ -- cgit From b4e6039e8af8c20dfbbdfcaebfcbd7c9d9ffe713 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 18 Nov 2020 11:15:41 -0500 Subject: rcu/segcblist: Add debug checks for segment lengths This commit adds debug checks near the end of rcu_do_batch() that emit warnings if an empty rcu_segcblist structure has non-zero segment counts, or, conversely, if a non-empty structure has all-zero segment counts. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix queue/segment-length checks. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 12 ++++++++++++ kernel/rcu/rcu_segcblist.h | 3 +++ kernel/rcu/tree.c | 8 ++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1e80a0a9036a..89e0dff890bf 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) return READ_ONCE(rsclp->seglen[seg]); } +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp) +{ + long len = 0; + int i; + + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + len += rcu_segcblist_get_seglen(rsclp, i); + + return len; +} + /* Set the length of a segment of the rcu_segcblist structure. */ static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) { diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index cd35c9faaf51..18e101de8747 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); + void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6bf269c91393..8086c0467c15 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2434,6 +2434,7 @@ int rcutree_dead_cpu(unsigned int cpu) static void rcu_do_batch(struct rcu_data *rdp) { int div; + bool __maybe_unused empty; unsigned long flags; const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; @@ -2548,9 +2549,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + empty = rcu_segcblist_empty(&rdp->cblist); + WARN_ON_ONCE(count == 0 && !empty); WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - count != 0 && rcu_segcblist_empty(&rdp->cblist)); + count != 0 && empty); + WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0); + WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0); rcu_nocb_unlock_irqrestore(rdp, flags); -- cgit From 65e560327fe68153a9ad7452d5fd3171a1927d33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:16 +0100 Subject: rcu/nocb: Turn enabled/offload states into a common flag This commit gathers the rcu_segcblist ->enabled and ->offloaded property field into a single ->flags bitmask to avoid further proliferation of individual u8 fields in the structure. This change prepares for the state formerly known as ->offloaded state to be modified at runtime. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 6 +++--- kernel/rcu/rcu_segcblist.h | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 89e0dff890bf..934945d72ca5 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -246,7 +246,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rcu_segcblist_set_seglen(rsclp, i, 0); } rcu_segcblist_set_len(rsclp, 0); - rsclp->enabled = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -257,7 +257,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - rsclp->enabled = 0; + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -266,7 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { - rsclp->offloaded = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 18e101de8747..ff372db09f8b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -53,19 +53,38 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } +static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags |= flags; +} + +static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags &= ~flags; +} + +static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, + int flags) +{ + return READ_ONCE(rsclp->flags) & flags; +} + /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return rsclp->enabled; + return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); } /* -- cgit From 8d346d438f93b5344e99d429727ec9c2f392d4ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:17 +0100 Subject: rcu/nocb: Provide basic callback offloading state machine bits Offloading and de-offloading RCU callback processes must be done carefully. There must never be a time at which callback processing is disabled because the task driving the offloading or de-offloading might be preempted or otherwise stalled at that point in time, which would result in OOM due to calbacks piling up indefinitely. This implies that there will be times during which a given CPU's callbacks might be concurrently invoked by both that CPU's RCU_SOFTIRQ handler (or, equivalently, that CPU's rcuc kthread) and by that CPU's rcuo kthread. This situation could fatally confuse both rcu_barrier() and the CPU-hotplug offlining process, so these must be excluded during any concurrent-callback-invocation period. In addition, during times of concurrent callback invocation, changes to ->cblist must be protected both as needed for RCU_SOFTIRQ and as needed for the rcuo kthread. This commit therefore defines and documents the states for a state machine that coordinates offloading and deoffloading. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 1 + kernel/rcu/rcu_segcblist.h | 12 ++++++++++-- kernel/rcu/tree.c | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 934945d72ca5..7fc6362625b2 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -266,6 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index ff372db09f8b..e05952ab9b87 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -83,8 +83,16 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { + /* + * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY + * is set. + */ + if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; + } + + return false; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8086c0467c15..7cfc2e8a2500 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +#ifdef CONFIG_RCU_NOCB_CPU + .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, +#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, -- cgit From 126d9d49528dae792859e5f11f3b447ce8a9a9b4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:18 +0100 Subject: rcu/nocb: Always init segcblist on CPU up How the rdp->cblist enabled state is treated at CPU-hotplug time depends on whether or not that ->cblist is offloaded. 1) Not offloaded: The ->cblist is disabled when the CPU goes down. All its callbacks are migrated and none can to enqueued until after some later CPU-hotplug operation brings the CPU back up. 2) Offloaded: The ->cblist is not disabled on CPU down because the CB/GP kthreads must finish invoking the remaining callbacks. There is thus no need to re-enable it on CPU up. Since the ->cblist offloaded state is set in stone at boot, it cannot change between CPU down and CPU up. So 1) and 2) are symmetrical. However, given runtime toggling of the offloaded state, there are two additional asymmetrical scenarios: 3) The ->cblist is not offloaded when the CPU goes down. The ->cblist is later toggled to offloaded and then the CPU comes back up. 4) The ->cblist is offloaded when the CPU goes down. The ->cblist is later toggled to no longer be offloaded and then the CPU comes back up. Scenario 4) is currently handled correctly. The ->cblist remains enabled on CPU down and gets re-initialized on CPU up. The toggling operation will wait until ->cblist is empty, so ->cblist will remain empty until CPU-up time. The scenario 3) would run into trouble though, as the rdp is disabled on CPU down and not re-initialized/re-enabled on CPU up. Except that in this case, ->cblist is guaranteed to be empty because all its callbacks were migrated away at CPU-down time. And the CPU-up code already initializes and enables any empty ->cblist structures in order to handle the possibility of early-boot invocations of call_rcu() in the case where such invocations don't occur. So all that need be done is to adjust the locking. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7cfc2e8a2500..83362f6f1119 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4015,12 +4015,18 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; - if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !rcu_segcblist_is_offloaded(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* + * Lock in case the CB/GP kthreads are still around handling + * old callbacks (longer term we should flush all callbacks + * before completing CPU offline) + */ + rcu_nocb_lock(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ + rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed -- cgit From d97b078182406c0bd0aacd36fc0a693e118e608f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:19 +0100 Subject: rcu/nocb: De-offloading CB kthread To de-offload callback processing back onto a CPU, it is necessary to clear SEGCBLIST_OFFLOAD and notify the nocb CB kthread, which will then clear its own bit flag and go to sleep to stop handling callbacks. This commit makes that change. It will also be necessary to notify the nocb GP kthread in this same way, which is the subject of a follow-on commit. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Add export per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 10 ++-- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 130 ++++++++++++++++++++++++++++++++++++++------- 4 files changed, 121 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7fc6362625b2..7f181c9675f7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -264,10 +264,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) * Mark the specified rcu_segcblist structure as offloaded. This * structure must be empty. */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + if (offload) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + } else { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); + } } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e05952ab9b87..28c9a5225afc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -109,7 +109,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..e0deb4829847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -201,6 +201,7 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ atomic_t nocb_lock_contended; /* Contention experienced. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..1b870d0d2445 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2081,16 +2081,29 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + /* * Invoke any ready callbacks from the corresponding no-CBs CPU, * then, if there are no more, wait for more to appear. */ static void nocb_cb_wait(struct rcu_data *rdp) { + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_node *rnp = rdp->mynode; + bool needwake_state = false; + bool needwake_gp = false; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_gp = false; - struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -2100,32 +2113,50 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - return; - } - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); WRITE_ONCE(rdp->nocb_cb_sleep, true); + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (rcu_segcblist_ready_cbs(cblist)) + WRITE_ONCE(rdp->nocb_cb_sleep, false); + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - !READ_ONCE(rdp->nocb_cb_sleep)); - if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + /* ^^^ Ensure CB invocation follows _sleep test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); } /* @@ -2187,6 +2218,67 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + bool wake_cb = false; + unsigned long flags; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_offload(cblist, false); + + if (rdp->nocb_cb_sleep) { + rdp->nocb_cb_sleep = false; + wake_cb = true; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + + if (wake_cb) + swake_up_one(&rdp->nocb_cb_wq); + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + + return 0; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_deoffload(rdp); +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + if (rdp == rdp->nocb_gp_rdp) { + pr_info("Can't deoffload an rdp GP leader (yet)\n"); + return -EINVAL; + } + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + } else { + ret = __rcu_nocb_rdp_deoffload(rdp); + } + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + void __init rcu_init_nohz(void) { int cpu; @@ -2229,7 +2321,8 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); } rcu_organize_nocb_kthreads(); } @@ -2239,6 +2332,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); raw_spin_lock_init(&rdp->nocb_lock); raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); -- cgit From ef005345e6e49859e225f549c88c985e79477bb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:20 +0100 Subject: rcu/nocb: Don't deoffload an offline CPU with pending work Offloaded CPUs do not migrate their callbacks, instead relying on their rcuo kthread to invoke them. But if the CPU is offline, it will be running neither its RCU_SOFTIRQ handler nor its rcuc kthread. This means that de-offloading an offline CPU that still has pending callbacks will strand those callbacks. This commit therefore refuses to toggle offline CPUs having pending callbacks. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Suggested-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b870d0d2445..b70cc91a7831 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2227,6 +2227,15 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) printk("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + rcu_segcblist_offload(cblist, false); if (rdp->nocb_cb_sleep) { -- cgit From 5bb39dc956f3d4f1bb75b5962b503426c45340ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:21 +0100 Subject: rcu/nocb: De-offloading GP kthread To de-offload callback processing back onto a CPU, it is necessary to clear SEGCBLIST_OFFLOAD and notify the nocb GP kthread, which will then clear its own bit flag and ignore this CPU until further notice. Whichever of the nocb CB and nocb GP kthreads is last to clear its own bit notifies the de-offloading worker kthread. Once notified, this worker kthread can proceed safe in the knowledge that the nocb CB and GP kthreads will no longer be manipulating this CPU's RCU callback list. This commit makes this change. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 54 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b70cc91a7831..fe46e7008667 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1928,6 +1928,33 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) __call_rcu_nocb_wake(rdp, true, flags); } +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + return true; + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return false; + } +} + + /* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. @@ -1956,8 +1983,17 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + if (!nocb_gp_enabled_cb(rdp)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); + if (!nocb_gp_update_state(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -2221,7 +2257,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; - bool wake_cb = false; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool wake_cb = false, wake_gp = false; unsigned long flags; printk("De-offloading %d\n", rdp->cpu); @@ -2247,9 +2284,19 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) if (wake_cb) swake_up_one(&rdp->nocb_cb_wq); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (rdp_gp->nocb_gp_sleep) { + rdp_gp->nocb_gp_sleep = false; + wake_gp = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | + SEGCBLIST_KTHREAD_GP)); return 0; } @@ -2332,6 +2379,7 @@ void __init rcu_init_nohz(void) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); } rcu_organize_nocb_kthreads(); } -- cgit From 254e11efde66ca0a0ce0c99a62c377314b5984ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:22 +0100 Subject: rcu/nocb: Re-offload support To re-offload the callback processing off of a CPU, it is necessary to clear SEGCBLIST_SOFTIRQ_ONLY, set SEGCBLIST_OFFLOADED, and then notify both the CB and GP kthreads so that they both set their own bit flag and start processing the callbacks remotely. The re-offloading worker is then notified that it can stop the RCU_SOFTIRQ handler (or rcuc kthread, as the case may be) from processing the callbacks locally. Ordering must be carefully enforced so that the callbacks that used to be processed locally without locking will have the same ordering properties when they are invoked by the nocb CB and GP kthreads. This commit makes this change. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Export rcu_nocb_cpu_offload(). ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 158 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe46e7008667..03ae1ce4790f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1928,6 +1928,20 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) __call_rcu_nocb_wake(rdp, true, flags); } +/* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) { u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; @@ -1940,6 +1954,11 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta struct rcu_segcblist *cblist = &rdp->cblist; if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } return true; } else { /* @@ -2003,6 +2022,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -2054,6 +2075,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -2159,6 +2182,11 @@ static void nocb_cb_wait(struct rcu_data *rdp) WRITE_ONCE(rdp->nocb_cb_sleep, true); if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } if (rcu_segcblist_ready_cbs(cblist)) WRITE_ONCE(rdp->nocb_cb_sleep, false); } else { @@ -2254,35 +2282,25 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } -static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_cb = false, wake_gp = false; - unsigned long flags; - - printk("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * If there are still pending work offloaded, the offline - * CPU won't help much handling them. - */ - if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - return -EBUSY; - } + bool wake_gp = false; - rcu_segcblist_offload(cblist, false); + rcu_segcblist_offload(cblist, offload); - if (rdp->nocb_cb_sleep) { + if (rdp->nocb_cb_sleep) rdp->nocb_cb_sleep = false; - wake_cb = true; - } rcu_nocb_unlock_irqrestore(rdp, flags); - if (wake_cb) - swake_up_one(&rdp->nocb_cb_wq); + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (rdp_gp->nocb_gp_sleep) { @@ -2294,10 +2312,32 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + return 0; +} + +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + + ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - return 0; + return ret; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -2335,6 +2375,80 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + printk("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. + * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_offload(rdp); +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + } else { + ret = __rcu_nocb_rdp_offload(rdp); + } + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + void __init rcu_init_nohz(void) { int cpu; -- cgit From 69cdea873cde261586a2cae2440178df1a313bbe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:23 +0100 Subject: rcu/nocb: Shutdown nocb timer on de-offloading This commit ensures that the nocb timer is shut down before reaching the final de-offloaded state. The key goal is to prevent the timer handler from manipulating the callbacks without the protection of the nocb locks. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e0deb4829847..5d359b9f9fec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,7 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOCB_WAKE_OFF -1 #define RCU_NOCB_WAKE_NOT 0 #define RCU_NOCB_WAKE 1 #define RCU_NOCB_WAKE_FORCE 2 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 03ae1ce4790f..c88ad628595b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1665,6 +1665,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, const char *reason) { + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF) + return; if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); if (rdp->nocb_defer_wakeup < waketype) @@ -2243,7 +2245,7 @@ static int rcu_nocb_cb_kthread(void *arg) /* Is a deferred wakeup of rcu_nocb_kthread() required? */ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { - return READ_ONCE(rdp->nocb_defer_wakeup); + return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT; } /* Do a deferred wakeup of rcu_nocb_kthread(). */ @@ -2337,6 +2339,12 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + /* Make sure nocb timer won't stay around */ + rcu_nocb_lock_irqsave(rdp, flags); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); + rcu_nocb_unlock_irqrestore(rdp, flags); + del_timer_sync(&rdp->nocb_timer); + return ret; } @@ -2394,6 +2402,8 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) * SEGCBLIST_SOFTIRQ_ONLY mode. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* Re-enable nocb timer */ + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); /* * We didn't take the nocb lock while working on the * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. -- cgit From 314202f84ddd61e4d7576ef62570ad2e2d9db06b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:24 +0100 Subject: rcu/nocb: Flush bypass before setting SEGCBLIST_SOFTIRQ_ONLY This commit flushes the bypass queue and sets state to avoid its being refilled before switching to the final de-offloaded state. To avoid refilling, this commit sets SEGCBLIST_SOFTIRQ_ONLY before re-enabling IRQs. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c88ad628595b..35dc9b31e9a4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2339,12 +2339,21 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - /* Make sure nocb timer won't stay around */ rcu_nocb_lock_irqsave(rdp, flags); + /* Make sure nocb timer won't stay around */ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); rcu_nocb_unlock_irqrestore(rdp, flags); del_timer_sync(&rdp->nocb_timer); + /* + * Flush bypass. While IRQs are disabled and once we set + * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be + * enqueued on bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_nocb_flush_bypass(rdp, NULL, jiffies); + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; } -- cgit From b9ced9e1ab51ed6057ac8198fd1eeb404a32a867 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:25 +0100 Subject: rcu/nocb: Set SEGCBLIST_SOFTIRQ_ONLY at the very last stage of de-offloading This commit sets SEGCBLIST_SOFTIRQ_ONLY once toggling is otherwise fully complete, allowing further RCU callback manipulation to be carried out locklessly and locally. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 35dc9b31e9a4..8641b72f6d0d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2352,7 +2352,14 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) */ rcu_nocb_lock_irqsave(rdp, flags); rcu_nocb_flush_bypass(rdp, NULL, jiffies); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + /* + * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * rcu_nocb_unlock_irqrestore() anymore. Theoretically we + * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs + * disabled now, but let's be paranoid. + */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return ret; } -- cgit From e3abe959fbd57aa751bc533677a35c411cee9b16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:26 +0100 Subject: rcu/nocb: Only cond_resched() from actual offloaded batch processing During a toggle operations, rcu_do_batch() may be invoked concurrently by softirqs and offloaded processing for a given CPU's callbacks. This commit therefore makes sure cond_resched() is invoked only from the offloaded context. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83362f6f1119..4ef59a5416a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2516,8 +2516,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Exceeded the time limit, so leave. */ break; } - if (offloaded) { - WARN_ON_ONCE(in_serving_softirq()); + if (!in_serving_softirq()) { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); -- cgit From 32aa2f4170d22f0b9fcb75ab05679ab122fae373 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:27 +0100 Subject: rcu/nocb: Process batch locally as long as offloading isn't complete This commit makes sure to process the callbacks locally (via either RCU_SOFTIRQ or the rcuc kthread) whenever the segcblist isn't entirely offloaded. This ensures that callbacks are invoked one way or another while a CPU is in the middle of a toggle operation. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 12 ++++++++++++ kernel/rcu/tree.c | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 28c9a5225afc..afad6fc6311c 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -95,6 +95,18 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) return false; } +static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) +{ + int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { + if ((rsclp->flags & flags) == flags) + return true; + } + + return false; +} + /* * Are all segments following the specified segment of the specified * rcu_segcblist structure empty of callbacks? (The specified diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ef59a5416a3..ec14c017c0e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2700,6 +2700,7 @@ static __latent_entropy void rcu_core(void) struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2729,7 +2730,7 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); -- cgit From 634954c2dbf88e67aa267798f60af6b9a476cf4b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:28 +0100 Subject: rcu/nocb: Locally accelerate callbacks as long as offloading isn't complete The local callbacks processing checks if any callbacks need acceleration. This commit carries out this checking under nocb lock protection in the middle of toggle operations, during which time rcu_core() executes concurrently with GP/CB kthreads. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec14c017c0e3..03810a58fdd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2699,7 +2699,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) @@ -2720,11 +2719,11 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { - local_irq_save(flags); + rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); -- cgit From 43759fe5a137389e94ed6d4680c3c63c17273158 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Nov 2020 23:53:13 +0100 Subject: cpu/hotplug: Add lockdep_is_cpus_held() This commit adds a lockdep_is_cpus_held() function to verify that the proper locks are held and that various operations are running in the correct context. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 4e11e91010e1..1b6302ecbabe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +#ifdef CONFIG_LOCKDEP +int lockdep_is_cpus_held(void) +{ + return percpu_rwsem_is_held(&cpu_hotplug_lock); +} +#endif + static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); -- cgit From dcd42591ebb8a25895b551a5297ea9c24414ba54 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:33 +0100 Subject: timer: Add timer_curr_running() This commit adds a timer_curr_running() function that verifies that the current code is running in the context of the specified timer's handler. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/time/timer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8dbc008f8942..f9b2096456e5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,6 +1237,19 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); +bool timer_curr_running(struct timer_list *timer) +{ + int i; + + for (i = 0; i < NR_BASES; i++) { + struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + if (base->running_timer == timer) + return true; + } + + return false; +} + #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { -- cgit From 2c4319bd1d14d01f5b6654a90c2b6362f3a407d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2020 17:39:46 -0700 Subject: rcutorture: Test runtime toggling of CPUs' callback offloading Frederic Weisbecker is adding the ability to change the rcu_nocbs state of CPUs at runtime, that is, to offload and deoffload their RCU callback processing without the need to reboot. As the old saying goes, "if it ain't tested, it don't work", so this commit therefore adds prototype rcutorture testing for this capability. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 528ed10b78fd..22735bc3eacc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -97,6 +97,8 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); +torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, @@ -127,10 +129,12 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); +static int nrealnocbers; static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -174,6 +178,8 @@ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; static unsigned long start_gp_seq; +static atomic_long_t n_nocb_offload; +static atomic_long_t n_nocb_deoffload; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -1498,6 +1504,53 @@ rcu_torture_reader(void *arg) return 0; } +/* + * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to + * increase race probabilities and fuzzes the interval between toggling. + */ +static int rcu_nocb_toggle(void *arg) +{ + int cpu; + int maxcpu = -1; + int oldnice = task_nice(current); + long r; + DEFINE_TORTURE_RANDOM(rand); + ktime_t toggle_delay; + unsigned long toggle_fuzz; + ktime_t toggle_interval = ms_to_ktime(nocbs_toggle); + + VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (toggle_interval > ULONG_MAX) + toggle_fuzz = ULONG_MAX >> 3; + else + toggle_fuzz = toggle_interval >> 3; + if (toggle_fuzz <= 0) + toggle_fuzz = NSEC_PER_USEC; + do { + r = torture_random(&rand); + cpu = (r >> 4) % (maxcpu + 1); + if (r & 0x1) { + rcu_nocb_cpu_offload(cpu); + atomic_long_inc(&n_nocb_offload); + } else { + rcu_nocb_cpu_deoffload(cpu); + atomic_long_inc(&n_nocb_deoffload); + } + toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL); + if (stutter_wait("rcu_nocb_toggle")) + sched_set_normal(current, oldnice); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_nocb_toggle"); + return 0; +} + /* * Print torture statistics. Caller must ensure that there is only * one call to this function at a given time!!! This is normally @@ -1553,7 +1606,9 @@ rcu_torture_stats_print(void) data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); - pr_cont("read-exits: %ld\n", data_race(n_read_exits)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); + pr_cont("nocb-toggles: %ld:%ld\n", + atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1647,7 +1702,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " - "read_exit_delay=%d read_exit_burst=%d\n", + "read_exit_delay=%d read_exit_burst=%d " + "nocbs_nthreads=%d nocbs_toggle=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1657,7 +1713,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff, - read_exit_delay, read_exit_burst); + read_exit_delay, read_exit_burst, + nocbs_nthreads, nocbs_toggle); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2500,6 +2557,13 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); + if (nocb_tasks) { + for (i = 0; i < nrealnocbers; i++) + torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]); + kfree(nocb_tasks); + nocb_tasks = NULL; + } + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -2762,6 +2826,26 @@ rcu_torture_init(void) if (firsterr) goto unwind; } + nrealnocbers = nocbs_nthreads; + if (WARN_ON(nrealnocbers < 0)) + nrealnocbers = 1; + if (WARN_ON(nocbs_toggle < 0)) + nocbs_toggle = HZ; + if (nrealnocbers > 0) { + nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); + if (nocb_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } else { + nocb_tasks = NULL; + } + for (i = 0; i < nrealnocbers; i++) { + firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]); + if (firsterr) + goto unwind; + } if (stat_interval > 0) { firsterr = torture_create_kthread(rcu_torture_stats, NULL, stats_task); -- cgit From 341690611f8d488859f42a761f5d7cbac6ba2940 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Dec 2020 10:20:34 -0800 Subject: rcu/nocb: Add grace period and task state to show_rcu_nocb_state() output This commit improves debuggability by indicating which grace period each batch of nocb callbacks is waiting on and by showing the task state and last CPU for reach nocb kthread. [ paulmck: Handle !SMP CB offloading per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 11 +++++++++++ kernel/rcu/tree_plugin.h | 39 ++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index afad6fc6311c..311060279f1a 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -117,6 +117,17 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8641b72f6d0d..5ee1113fbf39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2672,6 +2672,19 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + /* * Dump out nocb grace-period kthread state for the specified rcu_data * structure. @@ -2680,7 +2693,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", + pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", rdp->cpu, "kK"[!!rdp->nocb_gp_kthread], "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], @@ -2694,12 +2707,17 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) ".B"[!!rdp->nocb_gp_bypass], ".G"[!!rdp->nocb_gp_gp], (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { + char bufw[20]; + char bufr[20]; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wastimer; @@ -2708,7 +2726,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); - pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], @@ -2720,11 +2740,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], - ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], - ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ if (rdp->nocb_gp_rdp == rdp) -- cgit From 3d0cef50f32e2bc69f60909584c18623bba9a6c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Dec 2020 13:17:37 -0800 Subject: rcu/nocb: Add nocb CB kthread list to show_rcu_nocb_state() output This commit improves debuggability by indicating laying out the order in which rcuoc kthreads appear in the ->nocb_next_cb_rdp list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5ee1113fbf39..bc63a6b9d532 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2730,6 +2730,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], -- cgit From f759081e8f5ac640df1c7125540759bbcb4eb0e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Dec 2020 11:17:16 -0800 Subject: rcu/nocb: Code-style nits in callback-offloading toggling This commit addresses a few code-style nits in callback-offloading toggling, including one that predates this toggling. Cc: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 19 ++++++------------- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree_plugin.h | 45 ++++++++++++++++++++++----------------------- kernel/time/timer.c | 1 + 4 files changed, 30 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 311060279f1a..9a19328ff251 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -80,17 +80,12 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded? */ +/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { - /* - * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY - * is set. - */ - if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) - return true; - } + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; return false; } @@ -99,10 +94,8 @@ static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rscl { int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { - if ((rsclp->flags & flags) == flags) - return true; - } + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + return true; return false; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22735bc3eacc..b9dd63c166b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1606,7 +1606,7 @@ rcu_torture_stats_print(void) data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); - pr_cont("read-exits: %ld ", data_race(n_read_exits)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. pr_cont("nocb-toggles: %ld:%ld\n", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bc63a6b9d532..6f56f9e51e67 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1962,17 +1962,17 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta *needwake_state = true; } return true; - } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return false; } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return false; } @@ -2005,6 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { bool needwake_state = false; + if (!nocb_gp_enabled_cb(rdp)) continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); @@ -2160,11 +2161,11 @@ static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) static void nocb_cb_wait(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; - struct rcu_node *rnp = rdp->mynode; - bool needwake_state = false; - bool needwake_gp = false; unsigned long cur_gp_seq; unsigned long flags; + bool needwake_state = false; + bool needwake_gp = false; + struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -2217,8 +2218,8 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); - /* ^^^ Ensure CB invocation follows _sleep test. */ - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } @@ -2323,7 +2324,7 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) unsigned long flags; int ret; - printk("De-offloading %d\n", rdp->cpu); + pr_info("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); /* @@ -2384,11 +2385,10 @@ int rcu_nocb_cpu_deoffload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) { + if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - } else { + else ret = __rcu_nocb_rdp_deoffload(rdp); - } if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } @@ -2412,7 +2412,7 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - printk("Offloading %d\n", rdp->cpu); + pr_info("Offloading %d\n", rdp->cpu); /* * Can't use rcu_nocb_lock_irqsave() while we are in * SEGCBLIST_SOFTIRQ_ONLY mode. @@ -2460,11 +2460,10 @@ int rcu_nocb_cpu_offload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) { + if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - } else { + else ret = __rcu_nocb_rdp_offload(rdp); - } if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f9b2096456e5..f475f1a027c8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1243,6 +1243,7 @@ bool timer_curr_running(struct timer_list *timer) for (i = 0; i < NR_BASES; i++) { struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + if (base->running_timer == timer) return true; } -- cgit From 147c6852d34563b87ff0e67383c2bf675e8248f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Dec 2020 16:49:11 -0800 Subject: rcu: Do any deferred nocb wakeups at CPU offline time Because the need to wake a nocb GP kthread ("rcuog") is sometimes detected when wakeups cannot be done, these wakeups can be deferred. The wakeups are then carried out by calls to do_nocb_deferred_wakeup() at various safe points in the code, including RCU's idle hooks. However, when a CPU goes offline, it invokes arch_cpu_idle_dead() without invoking any of RCU's idle hooks. This commit therefore adds a call to do_nocb_deferred_wakeup() in rcu_report_dead() in order to handle any deferred wakeups that have been requested by the outgoing CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 03810a58fdd0..e6dee714efe0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4178,6 +4178,9 @@ void rcu_report_dead(unsigned int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + // Do any dangling deferred wakeups. + do_nocb_deferred_wakeup(rdp); + /* QS for any half-done expedited grace period. */ preempt_disable(); rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); -- cgit From 683954e55c981467bfd4688417e914bafc40959f Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 16 Nov 2020 21:36:00 +0530 Subject: rcu: Check and report missed fqs timer wakeup on RCU stall For a new grace period request, the RCU GP kthread transitions through following states: a. [RCU_GP_WAIT_GPS] -> [RCU_GP_DONE_GPS] The RCU_GP_WAIT_GPS state is where the GP kthread waits for a request for a new GP. Once it receives a request (for example, when a new RCU callback is queued), the GP kthread transitions to RCU_GP_DONE_GPS. b. [RCU_GP_DONE_GPS] -> [RCU_GP_ONOFF] Grace period initialization starts in rcu_gp_init(), which records the start of new GP in rcu_state.gp_seq and transitions to RCU_GP_ONOFF. c. [RCU_GP_ONOFF] -> [RCU_GP_INIT] The purpose of the RCU_GP_ONOFF state is to apply the online/offline information that was buffered for any CPUs that recently came online or went offline. This state is maintained in per-leaf rcu_node bitmasks, with the buffered state in ->qsmaskinitnext and the state for the upcoming GP in ->qsmaskinit. At the end of this RCU_GP_ONOFF state, each bit in ->qsmaskinit will correspond to a CPU that must pass through a quiescent state before the upcoming grace period is allowed to complete. However, a leaf rcu_node structure with an all-zeroes ->qsmaskinit cannot necessarily be ignored. In preemptible RCU, there might well be tasks still in RCU read-side critical sections that were first preempted while running on one of the CPUs managed by this structure. Such tasks will be queued on this structure's ->blkd_tasks list. Only after this list fully drains can this leaf rcu_node structure be ignored, and even then only if none of its CPUs have come back online in the meantime. Once that happens, the ->qsmaskinit masks further up the tree will be updated to exclude this leaf rcu_node structure. Once the ->qsmaskinitnext and ->qsmaskinit fields have been updated as needed, the GP kthread transitions to RCU_GP_INIT. d. [RCU_GP_INIT] -> [RCU_GP_WAIT_FQS] The purpose of the RCU_GP_INIT state is to copy each ->qsmaskinit to the ->qsmask field within each rcu_node structure. This copying is done breadth-first from the root to the leaves. Why not just copy directly from ->qsmaskinitnext to ->qsmask? Because the ->qsmaskinitnext masks can change in the meantime as additional CPUs come online or go offline. Such changes would result in inconsistencies in the ->qsmask fields up and down the tree, which could in turn result in too-short grace periods or grace-period hangs. These issues are avoided by snapshotting the leaf rcu_node structures' ->qsmaskinitnext fields into their ->qsmaskinit counterparts, generating a consistent set of ->qsmaskinit fields throughout the tree, and only then copying these consistent ->qsmaskinit fields to their ->qsmask counterparts. Once this initialization step is complete, the GP kthread transitions to RCU_GP_WAIT_FQS, where it waits to do a force-quiescent-state scan on the one hand or for the end of the grace period on the other. e. [RCU_GP_WAIT_FQS] -> [RCU_GP_DOING_FQS] The RCU_GP_WAIT_FQS state waits for one of three things: (1) An explicit request to do a force-quiescent-state scan, (2) The end of the grace period, or (3) A short interval of time, after which it will do a force-quiescent-state (FQS) scan. The explicit request can come from rcutorture or from any CPU that has too many RCU callbacks queued (see the qhimark kernel parameter and the RCU_GP_FLAG_OVLD flag). The aforementioned "short period of time" is specified by the jiffies_till_first_fqs boot parameter for a given grace period's first FQS scan and by the jiffies_till_next_fqs for later FQS scans. Either way, once the wait is over, the GP kthread transitions to RCU_GP_DOING_FQS. f. [RCU_GP_DOING_FQS] -> [RCU_GP_CLEANUP] The RCU_GP_DOING_FQS state performs an FQS scan. Each such scan carries out two functions for any CPU whose bit is still set in its leaf rcu_node structure's ->qsmask field, that is, for any CPU that has not yet reported a quiescent state for the current grace period: i. Report quiescent states on behalf of CPUs that have been observed to be idle (from an RCU perspective) since the beginning of the grace period. ii. If the current grace period is too old, take various actions to encourage holdout CPUs to pass through quiescent states, including enlisting the aid of any calls to cond_resched() and might_sleep(), and even including IPIing the holdout CPUs. These checks are skipped for any leaf rcu_node structure with a all-zero ->qsmask field, however such structures are subject to RCU priority boosting if there are tasks on a given structure blocking the current grace period. The end of the grace period is detected when the root rcu_node structure's ->qsmask is zero and when there are no longer any preempted tasks blocking the current grace period. (No, this last check is not redundant. To see this, consider an rcu_node tree having exactly one structure that serves as both root and leaf.) Once the end of the grace period is detected, the GP kthread transitions to RCU_GP_CLEANUP. g. [RCU_GP_CLEANUP] -> [RCU_GP_CLEANED] The RCU_GP_CLEANUP state marks the end of grace period by updating the rcu_state structure's ->gp_seq field and also all rcu_node structures' ->gp_seq field. As before, the rcu_node tree is traversed in breadth first order. Once this update is complete, the GP kthread transitions to the RCU_GP_CLEANED state. i. [RCU_GP_CLEANED] -> [RCU_GP_INIT] Once in the RCU_GP_CLEANED state, the GP kthread immediately transitions into the RCU_GP_INIT state. j. The role of timers. If there is at least one idle CPU, and if timers are not firing, the transition from RCU_GP_DOING_FQS to RCU_GP_CLEANUP will never happen. Timers can fail to fire for a number of reasons, including issues in timer configuration, issues in the timer framework, and failure to handle softirqs (for example, when there is a storm of interrupts). Whatever the reason, if the timers fail to fire, the GP kthread will never be awakened, resulting in RCU CPU stall warnings and eventually in OOM. However, an RCU CPU stall warning has a large number of potential causes, as documented in Documentation/RCU/stallwarn.rst. This commit therefore adds analysis to the RCU CPU stall-warning code to emit an additional message if the cause of the stall is likely to be timer failure. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++++---------- kernel/rcu/tree_stall.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..e918f100cc34 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1765,7 +1765,7 @@ static bool rcu_gp_init(void) * go offline later. Please also refer to "Hotplug CPU" section * of RCU's Requirements documentation. */ - rcu_state.gp_state = RCU_GP_ONOFF; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); rcu_for_each_leaf_node(rnp) { smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. firstseq = READ_ONCE(rnp->ofl_seq); @@ -1831,7 +1831,7 @@ static bool rcu_gp_init(void) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rcu_state.gp_state = RCU_GP_INIT; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT); rcu_for_each_node_breadth_first(rnp) { rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1930,17 +1930,22 @@ static void rcu_gp_fqs_loop(void) ret = 0; for (;;) { if (!ret) { - rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); + /* + * jiffies_force_qs before RCU_GP_WAIT_FQS state + * update; required for stall checks. + */ + smp_wmb(); WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); - rcu_state.gp_state = RCU_GP_WAIT_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DOING_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2054,7 +2059,7 @@ static void rcu_gp_cleanup(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - rcu_state.gp_state = RCU_GP_IDLE; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE); /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { @@ -2093,12 +2098,12 @@ static int __noreturn rcu_gp_kthread(void *unused) for (;;) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); - rcu_state.gp_state = RCU_GP_WAIT_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS); swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DONE_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS); /* Locking provides needed memory barrier. */ if (rcu_gp_init()) break; @@ -2113,9 +2118,9 @@ static int __noreturn rcu_gp_kthread(void *unused) rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rcu_state.gp_state = RCU_GP_CLEANUP; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP); rcu_gp_cleanup(); - rcu_state.gp_state = RCU_GP_CLEANED; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED); } } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 29cf096b5d20..353da30bd558 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -482,6 +482,36 @@ static void rcu_check_gp_kthread_starvation(void) } } +/* Complain about missing wakeups from expired fqs wait timer */ +static void rcu_check_gp_kthread_expired_fqs_timer(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + short gp_state; + unsigned long jiffies_fqs; + int cpu; + + /* + * Order reads of .gp_state and .jiffies_force_qs. + * Matching smp_wmb() is present in rcu_gp_fqs_loop(). + */ + gp_state = smp_load_acquire(&rcu_state.gp_state); + jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs); + + if (gp_state == RCU_GP_WAIT_FQS && + time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) && + gpk && !READ_ONCE(gpk->on_rq)) { + cpu = task_cpu(gpk); + pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n", + rcu_state.name, (jiffies - jiffies_fqs), + (long)rcu_seq_current(&rcu_state.gp_seq), + data_race(rcu_state.gp_flags), + gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, + gpk->state); + pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", + cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); + } +} + static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; @@ -543,6 +573,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); panic_on_rcu_stall(); @@ -578,6 +609,7 @@ static void print_cpu_stall(unsigned long gps) jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); rcu_dump_cpu_stacks(); -- cgit From e76506f0e85129d726c487c873a2245c92446515 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 10:24:52 -0800 Subject: refscale: Allow summarization of verbose output The refscale test prints enough per-kthread console output to provoke RCU CPU stall warnings on large systems. This commit therefore allows this output to be summarized. For example, the refscale.verbose_batched=32 boot parameter would causes only every 32nd line of output to be logged. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 23ff36a66f97..3da246f2ccf5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -46,6 +46,16 @@ #define VERBOSE_SCALEOUT(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) +static atomic_t verbose_batch_ctr; + +#define VERBOSE_SCALEOUT_BATCH(s, x...) \ +do { \ + if (verbose && \ + (verbose_batched <= 0 || \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ +} while (0) + #define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) @@ -57,6 +67,7 @@ module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, @@ -368,14 +379,14 @@ ref_scale_reader(void *arg) u64 start; s64 duration; - VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -392,7 +403,7 @@ repeat: while (atomic_read_acquire(&n_started)) cpu_relax(); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx); // To reduce noise, do an initial cache-warming invocation, check @@ -421,8 +432,8 @@ repeat: if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", - me, exp_idx, atomic_read(&nreaders_exp)); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); if (!torture_must_stop()) goto repeat; -- cgit From 12a910e3cd3d11e00b2a2df24ea995ffa3e27ae5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Nov 2020 16:01:50 -0800 Subject: rcutorture: Require entire stutter period be post-boot Currently, the rcu_torture_writer() function checks that all required grace periods elapse during a stutter interval, which is a multi-second time period during which the test load is removed. However, this check is suppressed during early boot (that is, before init is spawned) in order to avoid false positives that otherwise occur due to heavy load on the single boot CPU. Unfortunately, this approach is insufficient. It is possible that the stutter interval might end just as init is spawned, so that early boot conditions prevailed during almost the entire stutter interval. This commit therefore takes a snapshot of boot-complete state just before the stutter interval, thus suppressing the check for failure to complete grace periods unless the entire stutter interval took place after early boot. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 338e1182b4b5..1930d92f4d15 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1070,6 +1070,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool boot_ended; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; int expediting = 0; @@ -1239,12 +1240,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; + boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - rcu_inkernel_boot_has_ended()) + boot_ended) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit From 18fbf307b7319af3725c36e16af6ae9f35a8699c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Nov 2020 16:46:06 -0800 Subject: rcutorture: Make synctype[] and nsynctype be static global Full testing of the new SRCU polling API requires that the fake writers also use it in order to test concurrent calls to all of the API members, especially start_poll_synchronize_srcu(). This commit prepares the ground for this by making the synctype[] and nsynctype variables be static globals so that the rcu_torture_fakewriter() function can access them. Initialization of these variables is moved from rcu_torture_writer() to a new rcu_torture_write_types() function that is invoked from rcu_torture_init() just before the first writer kthread is spawned. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 62 ++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1930d92f4d15..0d257e2dc9c6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1062,37 +1062,18 @@ rcu_torture_fqs(void *arg) return 0; } +// Used by writers to randomly choose from the available grace-period +// primitives. The only purpose of the initialization is to size the array. +static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +static int nsynctypes; + /* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). + * Determine which grace-period primitives are available. */ -static int -rcu_torture_writer(void *arg) +static void rcu_torture_write_types(void) { - bool boot_ended; - bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); - unsigned long cookie; - int expediting = 0; - unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; - int i; - int idx; - int oldnice = task_nice(current); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_TORTURE_RANDOM(rand); - bool stutter_waited; - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; - int nsynctypes = 0; - - VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) - pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s.\n", - torture_type, cur_ops->name); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) @@ -1127,6 +1108,34 @@ rcu_torture_writer(void *arg) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool boot_ended; + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; + int expediting = 0; + unsigned long gp_snap; + int i; + int idx; + int oldnice = task_nice(current); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + bool stutter_waited; + + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " GP expediting controlled from boot/sysfs for %s.\n", + torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -2889,6 +2898,7 @@ rcu_torture_init(void) /* Start up the kthreads. */ + rcu_torture_write_types(); firsterr = torture_create_kthread(rcu_torture_writer, NULL, writer_task); if (firsterr) -- cgit From 682189a3f874db57b3e755512f2a2953f61fc54e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Nov 2020 17:10:39 -0800 Subject: rcutorture: Make rcu_torture_fakewriter() use blocking wait primitives Full testing of the new SRCU polling API requires that the fake writers also use it in order to test concurrent calls to all of the API members, especially start_poll_synchronize_srcu(). This commit makes rcu_torture_fakewriter() use all available blocking grace-period-wait primitives available from the RCU flavor under test. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d257e2dc9c6..03bdf6752fe4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1289,6 +1289,8 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + unsigned long gp_snap; + int i; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1300,15 +1302,37 @@ rcu_torture_fakewriter(void *arg) if (cur_ops->cb_barrier != NULL && torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (cur_ops->sync && torture_random(&rand) & 0x80) - cur_ops->sync(); - else if (cur_ops->exp_sync) + } else { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: + break; + case RTWS_EXP_SYNC: cur_ops->exp_sync(); - } else if (gp_normal && cur_ops->sync) { - cur_ops->sync(); - } else if (cur_ops->exp_sync) { - cur_ops->exp_sync(); + break; + case RTWS_COND_GET: + gp_snap = cur_ops->get_gp_state(); + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + cur_ops->cond_sync(gp_snap); + break; + case RTWS_POLL_GET: + gp_snap = cur_ops->start_gp_poll(); + while (!cur_ops->poll_gp_state(gp_snap)) { + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + } + break; + case RTWS_SYNC: + cur_ops->sync(); + break; + default: + WARN_ON_ONCE(1); + break; + } } stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); -- cgit From ae19aaafae95a5487469433e9cae4c208f8d15cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 11:30:18 -0800 Subject: torture: Add fuzzed hrtimer-based sleep functions This commit adds torture_hrtimeout_ns(), torture_hrtimeout_us(), torture_hrtimeout_ms(), torture_hrtimeout_jiffies(), and torture_hrtimeout_s(), each of which uses hrtimers to block for a fuzzed time interval. These functions are intended to be used by the various torture tests to decouple wakeups from the timer wheel, thus providing more opportunity for Murphy to insert destructive race conditions. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 8562ac18d2eb..7548634e8a89 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,81 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +/* + * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit + * nanosecond random fuzz. This function and its friends desynchronize + * testing from the timer wheel. + */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t hto = baset_ns; + + if (trsp) + hto += (torture_random(trsp) >> 3) % fuzzt_ns; + set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); + +/* + * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit + * nanosecond (not microsecond!) random fuzz. + */ +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_us * NSEC_PER_USEC; + + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_us); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * microsecond (not millisecond!) random fuzz. + */ +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_ms * NSEC_PER_MSEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_USEC < fuzzt_us) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_us * NSEC_PER_USEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); + +/* + * Schedule a high-resolution-timer sleep in jiffies, with an + * implied one-jiffy random fuzz. This is intended to replace calls to + * schedule_timeout_interruptible() and friends. + */ +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) +{ + ktime_t baset_ns = jiffies_to_nsecs(baset_j); + + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * millisecond (not second!) random fuzz. + */ +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_s * NSEC_PER_SEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_s); + #ifdef CONFIG_HOTPLUG_CPU /* -- cgit From ea31fd9ca87399ac4e03cd6c215451fa7dc366e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 11:32:54 -0800 Subject: rcutorture: Use torture_hrtimeout_jiffies() to avoid busy-waits Because rcu_torture_writer() and rcu_torture_fakewriter() predate hrtimers, they do timer-wheel-decoupled timed waits by using the timer-wheel-based schedule_timeout_interruptible() functions in conjunction with a random udelay()-based wait. This latter unnecessarily burns CPU time, so this commit instead uses torture_hrtimeout_jiffies() to decouple from the timer wheels without busy-waiting. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 03bdf6752fe4..9414e3027a04 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1194,10 +1194,7 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1206,12 +1203,9 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; - while (!cur_ops->poll_gp_state(gp_snap)) { - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); - } + while (!cur_ops->poll_gp_state(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: @@ -1290,7 +1284,6 @@ static int rcu_torture_fakewriter(void *arg) { unsigned long gp_snap; - int i; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1311,19 +1304,14 @@ rcu_torture_fakewriter(void *arg) break; case RTWS_COND_GET: gp_snap = cur_ops->get_gp_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); } break; case RTWS_SYNC: -- cgit From ed24affa71f7abf7d81698a99b6c2623491a35b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 12:17:42 -0800 Subject: torture: Make stutter use torture_hrtimeout_*() functions This commit saves a few lines of code by making the stutter_wait() and torture_stutter() functions use torture_hrtimeout_jiffies() and torture_hrtimeout_us(). Signed-off-by: Paul E. McKenney --- kernel/torture.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 7548634e8a89..7ad5c9681580 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -677,7 +677,6 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - ktime_t delay; unsigned int i = 0; bool ret = false; int spt; @@ -693,11 +692,8 @@ bool stutter_wait(const char *title) schedule_timeout_interruptible(1); } else if (spt == 2) { while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) { - set_current_state(TASK_INTERRUPTIBLE); - delay = 10 * NSEC_PER_USEC; - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); - } + if (!(i++ & 0xffff)) + torture_hrtimeout_us(10, 0, NULL); cond_resched(); } } else { @@ -715,7 +711,6 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - ktime_t delay; DEFINE_TORTURE_RANDOM(rand); int wtime; @@ -726,20 +721,15 @@ static int torture_stutter(void *arg) if (stutter > 2) { WRITE_ONCE(stutter_pause_test, 1); wtime = stutter - 3; - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC; - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, &rand); wtime = 2; } WRITE_ONCE(stutter_pause_test, 2); - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, NULL); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter_gap); + torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); -- cgit From 1eba0ef981fd3b5d5e94243aeced8884f43aef50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 14:12:24 -0800 Subject: rcutorture: Use hrtimers for reader and writer delays This commit replaces schedule_timeout_uninterruptible() and schedule_timeout_interruptible() with torture_hrtimeout_us() and torture_hrtimeout_jiffies() to avoid timer-wheel synchronization. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9414e3027a04..007595d4783f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1149,7 +1149,7 @@ rcu_torture_writer(void *arg) do { rcu_torture_writer_state = RTWS_FIXED_DELAY; - schedule_timeout_uninterruptible(1); + torture_hrtimeout_us(500, 1000, &rand); rp = rcu_torture_alloc(); if (rp == NULL) continue; @@ -1290,8 +1290,7 @@ rcu_torture_fakewriter(void *arg) set_user_nice(current, MAX_NICE); do { - schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); - udelay(torture_random(&rand) & 0x3ff); + torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); @@ -1656,7 +1655,7 @@ rcu_torture_reader(void *arg) if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); if (time_after(jiffies, lastsleep) && !torture_must_stop()) { - schedule_timeout_interruptible(1); + torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } while (num_online_cpus() < mynumonline && !torture_must_stop()) -- cgit From 414c116e016584137118067f506125f6ace6128c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2020 10:50:35 -0800 Subject: torture: Make refscale throttle high-rate printk()s This commit adds a short delay for verbose_batched-throttled printk()s to further decrease console flooding. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 3da246f2ccf5..02dd9767b559 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -52,8 +52,10 @@ static atomic_t verbose_batch_ctr; do { \ if (verbose && \ (verbose_batched <= 0 || \ - !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ + schedule_timeout_uninterruptible(1); \ pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ + } \ } while (0) #define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ -- cgit From 8a67a20bf257ca378d6e5588fbe4382966395ac8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2020 13:00:04 -0800 Subject: torture: Throttle VERBOSE_TOROUT_*() output This commit adds kernel boot parameters torture.verbose_sleep_frequency and torture.verbose_sleep_duration, which allow VERBOSE_TOROUT_*() output to be throttled with periodic sleeps on large systems. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 7ad5c9681580..93eeeb2b3d88 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444); static bool ftrace_dump_at_shutdown; module_param(ftrace_dump_at_shutdown, bool, 0444); +static int verbose_sleep_frequency; +module_param(verbose_sleep_frequency, int, 0444); + +static int verbose_sleep_duration = 1; +module_param(verbose_sleep_duration, int, 0444); + static char *torture_type; static int verbose; @@ -58,6 +64,20 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static atomic_t verbose_sleep_counter; + +/* + * Sleep if needed from VERBOSE_TOROUT*(). + */ +void verbose_torout_sleep(void) +{ + if (verbose_sleep_frequency > 0 && + verbose_sleep_duration > 0 && + !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency)) + schedule_timeout_uninterruptible(verbose_sleep_duration); +} +EXPORT_SYMBOL_GPL(verbose_torout_sleep); + /* * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit * nanosecond random fuzz. This function and its friends desynchronize -- cgit From edf7b8417834c89d00ef88355ea507b0b0a630ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Dec 2020 17:52:07 -0800 Subject: rcutorture: Make object_debug also double call_rcu() heap object This commit provides a test for call_rcu() printing the allocation address of a double-freed callback by double-freeing a callback allocated via kmalloc(). However, this commit does not depend on any other commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 007595d4783f..76c838696366 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2782,6 +2782,7 @@ static void rcu_test_debug_objects(void) #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); @@ -2794,6 +2795,10 @@ static void rcu_test_debug_objects(void) local_irq_disable(); /* Make it harder to start a new grace period. */ call_rcu(&rh2, rcu_torture_leak_cb); call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + if (rhp) { + call_rcu(rhp, rcu_torture_leak_cb); + call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + } local_irq_enable(); rcu_read_unlock(); preempt_enable(); -- cgit From 0b962c8fe0e5c72a252b236814a6b6e9df799061 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Dec 2020 07:05:58 -0800 Subject: torture: Clean up after torture-test CPU hotplugging This commit puts all CPUs back online at the end of a torture test, and also unconditionally puts them online at the beginning of the test, rather than just in the case of built-in tests. This allows torture tests to behave in a predictable manner, whether built-in or based on modules. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 93eeeb2b3d88..507a20be6950 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -291,6 +291,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, } EXPORT_SYMBOL_GPL(torture_online); +/* + * Get everything online at the beginning and ends of tests. + */ +static void torture_online_all(char *phase) +{ + int cpu; + int ret; + + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = add_cpu(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: %s online %d: errno %d\n", + __func__, phase, torture_type, cpu, ret); + } + } +} + /* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. @@ -301,25 +321,12 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); - if (!IS_MODULE(CONFIG_TORTURE_TEST)) { - for_each_possible_cpu(cpu) { - if (cpu_online(cpu)) - continue; - ret = add_cpu(cpu); - if (ret && verbose) { - pr_alert("%s" TORTURE_FLAG - "%s: Initial online %d: errno %d\n", - __func__, torture_type, cpu, ret); - } - } - } - + torture_online_all("Initial"); if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); goto stop; @@ -347,6 +354,7 @@ torture_onoff(void *arg) stop: torture_kthread_stopping("torture_onoff"); + torture_online_all("Final"); return 0; } -- cgit From 1afb95fee0342b8d9e05b0433e8e44a6dfd7c4a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Dec 2020 07:34:35 -0800 Subject: torture: Maintain torture-specific set of CPUs-online books The TREE01 rcutorture scenario intentionally creates confusion as to the number of available CPUs by specifying the "maxcpus=8 nr_cpus=43" kernel boot parameters. This can disable rcutorture's load shedding, which currently uses num_online_cpus(), which would count the extra 35 CPUs. However, the rcutorture guest OS will be provisioned with only 8 CPUs, which means that rcutorture will present full load even when all but one of the original 8 CPUs are offline. This can result in spurious errors due to extreme overloading of that single remaining CPU. This commit therefore keeps a separate set of books on the number of usable online CPUs, so that torture_num_online_cpus() is used for load shedding instead of num_online_cpus(). Note that initial sizing must use num_online_cpus() because torture_num_online_cpus() will return NR_CPUS until shortly after torture_onoff_init() is invoked. Reported-by: Frederic Weisbecker [ paulmck: Apply feedback from kernel test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- kernel/torture.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 76c838696366..a816df4e86e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1338,7 +1338,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, struct torture_random_state *trsp) { unsigned long loops; - int noc = num_online_cpus(); + int noc = torture_num_online_cpus(); int rdrchked; int rdrchker; struct rcu_torture_reader_check *rtrcp; // Me. @@ -1658,7 +1658,7 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (num_online_cpus() < mynumonline && !torture_must_stop()) + while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); diff --git a/kernel/torture.c b/kernel/torture.c index 507a20be6950..01e336f1e5b2 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -175,6 +175,19 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +static int torture_online_cpus = NR_CPUS; + +/* + * Some torture testing leverages confusion as to the number of online + * CPUs. This function returns the torture-testing view of this number, + * which allows torture tests to load-balance appropriately. + */ +int torture_num_online_cpus(void) +{ + return READ_ONCE(torture_online_cpus); +} +EXPORT_SYMBOL_GPL(torture_num_online_cpus); + /* * Attempt to take a CPU offline. Return false if the CPU is already * offline or if it is not subject to CPU-hotplug operations. The @@ -229,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, *min_offl = delta; if (*max_offl < delta) *max_offl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1); + WARN_ON_ONCE(torture_online_cpus <= 0); } return true; @@ -285,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, *min_onl = delta; if (*max_onl < delta) *max_onl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1); } return true; -- cgit From b4b7914a6a73fc169fd1ce2fcd78a1d83d9528a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Dec 2020 13:45:49 -0800 Subject: rcu: Make call_rcu() print mem_dump_obj() info for double-freed callback The debug-object double-free checks in __call_rcu() print out the RCU callback function, which is usually sufficient to track down the double free. However, all uses of things like queue_rcu_work() will have the same RCU callback function (rcu_work_rcufn() in this case), so a diagnostic message for a double queue_rcu_work() needs more than just the callback function. This commit therefore calls mem_dump_obj() to dump out any additional available information on the double-freed callback. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Reported-by: Andrii Nakryiko Tested-by: Naresh Kamboju Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..84513c52d9b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2941,6 +2941,7 @@ static void check_cb_ovld(struct rcu_data *rdp) static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; struct rcu_data *rdp; bool was_alldone; @@ -2954,8 +2955,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", - head, head->func); + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } WRITE_ONCE(head->func, rcu_leak_callback); return; } -- cgit